C0 code coverage information
Generated on Sat Jul 07 22:37:29 -0400 2007 with rcov 0.8.0
Code reported as executed by Ruby looks like this...
and this: this line is also marked as covered.
Lines considered as run by rcov, but not reported by Ruby, look like this,
and this: these lines were inferred by rcov (using simple heuristics).
Finally, here's a line marked as not executed.
| Name |
Total lines |
Lines of code |
Total coverage |
Code coverage |
|
IMDB.rb
|
505
|
306
|
|
|
1 #This class takes a name of a movie and gets data from imdb
2 # Author:: Stephen Becker IV (mailto:sbecker@x.y@gmail )
3 # Copyright:: Copyright
4 # License:: Distributes under the same terms as Ruby
5 # version:: High Plains Drifter (0.5)
6 ### IMDB class
7 ##thank you why for hpricot!
8 ## thank you for the text()!
9 require 'rubygems'
10 #needs version >= 0.5.00 of hpricot
11 #works with version 6.0 ruby
12 require 'hpricot'
13 require 'open-uri'
14 require 'uri'
15 require 'pp'
16 require "rexml/document"
17
18 #read about my suggestion @ http://code.whytheluckystiff.net/hpricot/ticket/37
19 #this will just run next node X times
20 #node_at just did not float my boat.
module Hpricot
  module Traverse
    # Returns the sibling located +loop+ positions after this node in its
    # parent's child list (the node "to the south" when +loop+ is 1).
    # Text nodes, comments and the like are counted as siblings.
    #
    # loop:: how many siblings to step forward (default 1)
    #
    # Returns nil when this node has no parent or when the requested
    # position is past the last sibling.
    def next_node(loop = 1)
      # Guard first: the parent must exist before its children are read.
      return nil unless parent

      siblings = parent.children
      siblings[siblings.index(self) + loop]
    end
  end
end
31
32 class IMDB
33 class << self
# Searches IMDb for +title+ and returns a hash that maps each result
# section heading to the list of titles found in that section, e.g.
#   {"Titles (Exact Matches)"=>["Office Space"],
#    "Titles (Partial Matches)"=>["'Office Space': Out of the Office"],
#    "Popular Titles"=>["Office Space"],
#    "Titles (Approx Matches)"=>["Spice Girls: One Hour of Girl Power"]}
#
# title:: movie name; underscores and spaces are both treated as word
#         separators. The string passed in is left untouched.
def title_search(title)
  # Build the query on a copy so the caller's string is never modified
  # ("_" is used in folder names, IMDb wants "+").
  query = title.downcase.tr("_ ", "+")
  doc = Hpricot(open(URI.encode("http://www.imdb.com/find?s=all&q=#{query}")))

  results = {}
  doc.search("p").each do |para|
    heading = para.search("b").first
    next if para.search("table").nil? || heading.nil?
    # Only paragraphs whose bold heading mentions "title" are result sections.
    next unless heading.inner_html.to_s.downcase.include?("title")

    names = para.search("a").collect { |link| link.inner_html }
    # Links that wrap an image carry no usable title text.
    results[heading.inner_html] = names.delete_if { |name| name.include?("<img") }.uniq
  end
  results
end
54
55 end
56
# Sets up a lookup for one movie. Nothing is fetched here; the page is
# loaded once on first use and a few derived values are kept afterwards.
#
# movie::            the name to search for (not necessarily the real title)
# interactive_load:: when true, a command line menu is used to pick a result
def initialize(movie="",interactive_load=false)
  # caller-supplied settings
  @movie_name = movie
  @interactive_load = interactive_load
  # maximum number of candidate links offered when guessing
  @MAXCOUNT = 15

  # page-level state: link, parsed info tags, poster/year/title/director
  # section, and the html of the whole page (fetched only once)
  @imdb_link = @html_info_tags = @short_details = @page_html = nil
  # per-field slots: viewing time, plot, cast html, title and year
  @movie_length = @movie_plot = @cast_html = @movie_title = @movie_year = nil
end
82
# Complete html of the movie page, as a string.
def page_html
  document = load_page
  document.to_html
end
87 alias_method :to_html,:page_html
88 alias_method :html,:page_html
89
# The link (img src) of the movie poster.
# Returns nil when the page has no <a name="poster"> image instead of
# failing with NoMethodError.
def poster_link
  # just following the xpath
  poster = (load_page/"//a[@name='poster']/img").first
  poster && poster["src"]
end
96
# The IMDb link recorded for this movie. It is nil until the page has
# been loaded, because it is only assigned while loading.
def imdb_link
  return @imdb_link
end
101
102
# Serializes the movie data to an XML string with a <movie> root element
# (attributes: name, api_version) holding cast, run time, plot, director,
# writer, link, title, date, rating, user comments, tag line and genres.
# The plot is written exactly once.
def to_xml
  data = REXML::Document.new("<?xml version='1.0' encoding='ISO-8859-1'?>")
  base = data.add_element("movie")
  base.attributes["name"] = self.title
  base.attributes["api_version"] = "0.2"

  cast_node = base.add_element("cast")
  actors.each do |name, role|
    actor_node = cast_node.add_element("actor")
    actor_node.add_element("name").text = name
    actor_node.add_element("role").text = role
  end

  base.add_element("run_time").text = runtime
  base.add_element("plot").text = plot
  base.add_element("director").text = directors.join(",")
  base.add_element("writer").text = writers.join(",")
  # The poster link is left out on purpose: it does not play nice with xml
  # unless escaped (CGI::escapeHTML(string) or ERB::Util.html_escape).
  #base.add_element("poster link").text=poster
  base.add_element("link").text = imdb_link
  base.add_element("title").text = title
  base.add_element("date").text = date.to_s
  base.add_element("rating").text = mpaa
  base.add_element("user_comments").text = user_comments
  base.add_element("tag_line").text = tagline
  #base.add_element("keywords").text=unescapeHTML(keywords.join(","))

  genres_node = base.add_element("genres")
  genres.each { |value| genres_node.add_element("type").text = value }

  result = ""
  data.write(result)
  result
end
137
# Returns the possible titles for a search on this movie's name, as
# produced by IMDB.title_search.
# If the movie is not in the list, try a different name.
def title_search
  IMDB.title_search(@movie_name)
end
143 ###############olds
144
145
# The tagline as a string, e.g. "Work Sucks."
# Returns "" when the page has no tagline section.
def tagline
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "tagline" }
  return "" if section.nil?

  section.search("h5").first.next_node.to_s.strip
end
152
# Returns an array of genres, e.g. ["Comedy", "Crime"].
# Returns [] when the page has no genre section.
def genre
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "genre" }
  return [] if section.nil?

  # links carrying the "inline" class are skipped
  genre_links = section.search("a").reject { |link| link.classes.include?("inline") }
  genre_links.collect { |link| link.inner_html }
end
160 alias_method :genres,:genre
161
# Returns a Date object with the release date.
# The page text looks like "19 February 1999 (USA)"; only the first three
# tokens are handed to Date.parse.
# Returns Date.new (no arguments) when the page has no release date section.
def release_date
  section = page_info_tags.select { |info| text_clean(info.search("h5").text()) == "release_date" }.first
  return Date.new if section.nil?

  tokens = section.search("h5").first.next_node.to_s.strip.split
  # Date.parse takes no format string (its second argument is a boolean),
  # so none is passed.
  Date.parse(tokens[0...3].join(" "))
end
174 alias_method :date,:release_date
175
# Returns an array of writers, e.g. ["Mike Judge"].
# Returns [] when the page has no writers section.
def writer
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "writers" }
  return [] if section.nil?

  # drop the "(WGA)" link and anything with the "tn15more" class
  credited = section.search("a").reject do |link|
    link.inner_html.include?('(WGA)') || link.classes.include?("tn15more")
  end
  credited.collect { |link| link.inner_html.to_s.strip }.uniq
end
183 alias_method :writers,:writer
184
# Returns an array of directors, e.g. ["Mike Judge"].
# Returns [] when the page has no director section.
def director
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "director" }
  return [] if section.nil?

  # links with the "tn15more" class are skipped
  credited = section.search("a").reject { |link| link.classes.include?("tn15more") }
  credited.collect { |link| link.inner_html.to_s.strip }.uniq
end
192 alias_method :directors,:director
193
# The plot outline as a string, e.g.
# "Comedic tale of company workers who hate their jobs and decide to rebel against their greedy boss."
# Returns "" when the page has no plot outline section.
def plot
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "plot_outline" }
  if section.nil?
    ""
  else
    section.search("h5").first.next_node.to_s.strip
  end
end
201 alias_method :plot_outline,:plot
202
203 #not sure about supporting this yet
204 #["This plot synopsis is empty. Add a synopsis"]
205 #@html_info_tags.select{|x| text_clean(x.search("h5").text())=="plot_synopsis"}.first.search("a").select{|z| !z.classes.include?("inline")}.collect{|b| b.inner_html}
206
# Returns an array of plot keywords with html entities unescaped, e.g.
# ["Hypnosis", "Cult Comedy", "Kung Fu", "Post It", "Arson"].
# Returns [] when the page has no plot keywords section.
def plot_keywords
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "plot_keywords" }
  return [] if section.nil?

  keyword_links = section.search("a").reject { |link| link.classes.include?("inline") }
  keyword_links.collect { |link| unescapeHTML(link.inner_html) }
end
214 alias_method :keywords,:plot_keywords
215
# The awards text as a string, e.g. "2 nominations".
# Returns "" when the page has no awards section.
def awards
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "awards" }
  return "" if section.nil?

  heading = section.search("h5").first
  heading.next_node.to_s.strip
end
223
224 #some say its cheating... @html_info_tags.select{|x| text_clean(x.search("h5").text())=="[[%s]]"}.first.search("h5").first.next_node.to_s.strip#
# A string with the user comments text that follows the section heading.
# Returns "" when the page has no user comments section.
def user_comments
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "user_comments" }
  section.nil? ? "" : section.search("h5").first.next_node.to_s.strip
end
231
# The title used elsewhere in the world, as a string.
# Returns "" when the page has no "also known as" section.
def also_known_as
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "also_known_as" }
  return "" if section.nil?

  heading = section.search("h5").first
  heading.next_node.to_s.strip
end
238
239 alias_method :aka,:also_known_as
240
# The MPAA rating and the reason for it, as a string.
# Returns "" when the page has no mpaa section.
def mpaa
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "mpaa" }
  if section.nil?
    ""
  else
    section.search("h5").first.next_node.to_s.strip
  end
end
247 alias_method :ratings,:mpaa
248 alias_method :rating,:mpaa
249
# The running time as a string (hopefully in the form "# min").
# Returns "" when the page has no runtime section.
def runtime
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "runtime" }
  section.nil? ? "" : section.search("h5").first.next_node.to_s.strip
end
256
# The country as a string, most likely abbreviated.
# Returns "" when the page has no country section.
def country
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "country" }
  return "" if section.nil?

  # the value sits in the second node after the heading
  section.search("h5").first.next_node(2).inner_html
end
263
# The original language, as a string.
# Returns "" when the page has no language section.
def language
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "language" }
  return "" if section.nil?

  # the value sits in the second node after the heading
  heading = section.search("h5").first
  heading.next_node(2).inner_html
end
# Was it shot in color? Returns the page's color entry as a string.
# Returns "" when the page has no color section.
def color
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "color" }
  if section.nil?
    ""
  else
    # the value sits in the second node after the heading
    section.search("h5").first.next_node(2).inner_html
  end
end
276
# The aspect ratio text that follows the section heading, as a string.
# Returns "" when the page has no aspect ratio section.
def aspect_ratio
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "aspect_ratio" }
  return "" if section.nil?

  section.search("h5").first.next_node.to_s.strip
end
283 alias_method :aspect,:aspect_ratio
284
# The company entry of the page, as a string.
# Returns "" when the page has no company section.
def company
  section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "company" }
  # the value sits in the second node after the heading
  section.nil? ? "" : section.search("h5").first.next_node(2).inner_html
end
291
# Ratings around the world, as a hash of country => rating, e.g.
#   {"UK"=>"15", "Ireland"=>"15", "Australia"=>"M", "USA"=>"R", "Germany"=>"12"}
# This is best effort: an empty (or partial) hash is returned when the
# section is missing or cannot be read. Only StandardError is absorbed, so
# Interrupt and SystemExit still propagate.
def certification
  cert_hash = {}
  begin
    section = page_info_tags.detect { |info| text_clean(info.search("h5").text()) == "certification" }
    unless section.nil?
      section.search("a").each do |link|
        next if link.classes.include?("inline")

        # each link reads "Country:Rating"
        country, rating = link.inner_html.split(':')
        cert_hash[country] = rating
      end
    end
  rescue StandardError
    # deliberately ignored: whatever was collected so far is returned
  end
  cert_hash
end
304 alias_method :certs,:certification
305 alias_method :certifications,:certification
306
# Just the title as shown on the page, in case it is different from the
# name that was searched for. Html tags and parenthesized parts are removed.
def title
  heading = load_page.search("h1").first.inner_html
  # both helpers edit the string in place
  clean_html_tags(heading)
  clean_9_0(heading)
  heading
end
315
# The cast as a hash of actor-name html => character html, read from the
# rows of the first table with class "cast".
# This is best effort: an empty (or partial) hash is returned when the cast
# table is missing or cannot be read. Only StandardError is absorbed, so
# Interrupt and SystemExit still propagate.
def actors
  doc = load_page
  cast = {}
  begin
    table = doc.search("table[@class='cast']").first
    unless table.nil?
      table.search("tr").each do |row|
        cast[row.search("td[@class='nm']/a").inner_html] = row.search("td[@class='char']").inner_html
      end
    end
  rescue StandardError
    # deliberately ignored: whatever was collected so far is returned
  end
  cast
end
325 alias_method :cast,:actors
326
327 private
328
329
# Needs a better xpath.
# Collects the tables that contain a link named "poster", i.e. the upper
# level element around the poster html. A table is added once for every
# such link it contains.
def details_based_on_poster_attribute
  poster_tables = []
  load_page.search("table").each do |table|
    (table/"a").each do |link|
      poster_tables.push(table) if link.attributes['name']=="poster"
    end
  end
  poster_tables
end
# The div elements with class "info" of the movie page. The result is kept
# in @html_info_tags so the search does not have to run a lot.
def page_info_tags
  @html_info_tags ||= load_page.search("div[@class='info']")
end
344
# Loads the page data once and stores it in @page_html; later calls
# re-parse the stored html instead of fetching again.
#
# On the first call an IMDb search is run for the movie name. Then:
# * if the response has no h1, it is a result list: the first link whose
#   href includes "title" is followed (or the one picked from the command
#   line menu in interactive mode);
# * if the response has an h1 and its title is not "IMDb Search", the
#   search went straight to a movie page and that page is used;
# * otherwise the search failed and a RuntimeError is raised.
#
# Sets @imdb_link, @page_html and @html_info_tags. Returns the parsed page.
def load_page
  # check if we have page html; if so just parse and return it
  return Hpricot(@page_html) if @page_html

  # Alter a copy of the passed in name to fit the imdb search ("_" is used
  # in the folder names). The string handed to the constructor is not
  # modified.
  movie_name = @movie_name.downcase.tr("_ ", "+")

  doc = Hpricot(open(URI.encode("http://www.imdb.com/find?s=all&q=#{movie_name}")))
  elements = doc.search("a")

  if doc.search("h1").first.nil?
    if @interactive_load
      @imdb_link = "http://www.imdb.com"+movie_menu(elements)
    else
      # use the first link that points at a title
      title_hrefs = elements.collect { |link| link.attributes['href'] }
      title_hrefs = title_hrefs.select { |href| href && href.include?("title") }
      @imdb_link = "http://www.imdb.com#{title_hrefs.first}"
    end
    doc = Hpricot(open(@imdb_link))
  elsif (doc/"/html/head/title").inner_html=="IMDb Search"
    # all searches have failed
    raise "Error: No Inner HTML links found!"
  else
    # some movies do not take you to a search page (example: robin hood men
    # in tights); the response already is the movie page
    @imdb_link = "http://www.imdb.com/find?s=all&q=#{movie_name}"
  end

  @page_html = doc.to_html
  @html_info_tags = doc.search("div[@class='info']")
  doc
end
397
398
399 #@html_info_tags = doc.search("div[@class='info']")
# Command line menu: prints up to @MAXCOUNT links whose href includes
# "/title/" and returns the href of the entry the user picks.
#
# elements:: the links of the search result page
#
# The number typed is taken modulo the number of entries, so any input
# selects some entry. Raises a RuntimeError when there is nothing to pick
# from.
def movie_menu(elements)
  puts "Pick a number to load"
  choices = []
  elements.each do |link|
    href = link.attributes['href']
    next unless href && href.include?('/title/') && choices.size<@MAXCOUNT

    puts choices.size.to_s+")"+unescapeHTML(link.inner_html)+" "+link.next_node.to_s
    choices.push(href)
  end
  # Without this guard the modulo below would divide by zero.
  raise "Error: No Inner HTML links found!" if choices.empty?

  # Read from the terminal explicitly: a bare gets would read from files
  # named on the command line.
  number = $stdin.gets
  choices[number.to_i%choices.size]
end
414
# Normalizes a heading into a lookup key: lower case, html tags and
# parenthesized parts removed, everything except a-z and whitespace
# dropped, surrounding whitespace stripped, spaces turned into "_".
#   "Plot Outline: (more)"  ->  "plot_outline"
# The argument itself is left untouched; a new string is returned.
def text_clean(text)
  cleaned = text.downcase # downcase returns a copy, so the caller's string stays as is
  clean_html_tags(cleaned)
  clean_9_0(cleaned)
  cleaned.gsub!(/[^a-z\s]*/,'')
  cleaned.strip!
  cleaned.gsub!(/ /,'_')
  cleaned
end

# Removes html tags from +cleaned+ IN PLACE and returns the string
# (always the string, also when there was nothing to remove).
def clean_html_tags(cleaned)
  cleaned.gsub!(/<[^<]*>/,'')
  cleaned
end

# Removes parenthesized parts such as "(1999)" from +cleaned+ IN PLACE and
# returns the string (always the string, also when nothing was removed).
def clean_9_0(cleaned)
  cleaned.gsub!(/\([^\(]*\)/,'')
  cleaned
end
# Replaces html entities in +string+ and returns the result as a new string.
# Based on http://www.rubycentral.com/book/tut_stdtypes.html
#
# Handled: &amp; &quot; &gt; &lt; (case-insensitive) plus decimal (&#39;)
# and hexadecimal (&#x27;) character references. Decimal references are
# always read as base 10, so zero-padded forms such as &#039; work.
# Anything else between "&" and ";" is left exactly as it was.
def unescapeHTML(string)
  string.gsub(/&(.*?);/) do
    # Remember the whole match now; the regex tests below overwrite $&.
    whole = $&
    entity = $1
    replacement =
      begin
        case entity
        when /\Aamp\z/i  then '&'
        when /\Aquot\z/i then '"'
        when /\Agt\z/i   then '>'
        when /\Alt\z/i   then '<'
        when /\A#(\d+)\z/ then [$1.to_i].pack('U')
        when /\A#x([0-9a-f]+)\z/i then [$1.hex].pack('U')
        end
      rescue RangeError
        # number too large to be a character: keep the original text
        nil
      end
    replacement || whole
  end
end
448
449 end
450 #begin
451 # ["scotland pa","MaLlRaTs","Doctor+Zhivago","blue_velvet","die hard","die hard 2","ghost in the shell","pi","office_space","11:14","high plains drifter"].each{|movie|
452 # puts movie
453 # movie=IMDB.new(movie)# or office_space or OFFICE_SPACE
454 # #pp movie.get_links
455 # pp movie.actors
456 # pp movie.title
457 # pp movie.poster_link
458 # pp movie.rating
459 # pp movie.aka
460 # pp movie.also_known_as
461 # pp movie.aspect
462 # pp movie.aspect_ratio
463 # pp movie.awards
464 # pp movie.certification
465 # pp movie.certifications
466 # pp movie.certs
467 # pp movie.color
468 # pp movie.company
469 # pp movie.country
470 # pp movie.date
471 # pp movie.director
472 # pp movie.directors
473 # pp movie.genre
474 # pp movie.genres
475 # pp movie.imdb_link
476 # pp movie.keywords
477 # pp movie.language
478 # pp movie.mpaa
479 # pp movie.page_html
480 # pp movie.plot
481 # pp movie.plot_keywords
482 # pp movie.plot_outline
483 # pp movie.poster_link
484 # pp movie.rating
485 # pp movie.ratings
486 # pp movie.release_date
487 # pp movie.runtime
488 # pp movie.tagline
489 # pp movie.user_comments
490 # pp movie.writer
491 # pp movie.writers
492 # pp movie.title_search
493 # pp movie.to_xml
494 # sleep 30
495 # }
496 #rescue Exception=>e
497 # puts "------------"+e
498 # puts e.backtrace*"\n"
499 #end
500 #
501 #begin
502 # pp IMDB.title_search("white strips")
503 #rescue Exception => e
504 # pp e
505 #end
Generated using the rcov code coverage analysis tool for Ruby version 0.8.0.