#This class takes a name of a movie and gets data from imdb # Author:: Stephen Becker IV (mailto:sbecker@x.y@gmail ) # Copyright:: Copyright # License:: Distributes under the same terms as Ruby # version:: High Plains Drifter (0.5) ### IMDB class ##thank you why for hpricot! ## thank you for the text()! require 'rubygems' #needs version >= 0.5.00 of hpricot #works with version 6.0 ruby require 'hpricot' require 'open-uri' require 'uri' require 'pp' require "rexml/document" #read about my suggestion @ http://code.whytheluckystiff.net/hpricot/ticket/37 #this will just run next node X times #node_at just did not float my boat. module Hpricot module Traverse # Returns the node neighboring this node to the south: just below it. # This method includes text nodes and comments and such. def next_node(loop=1) sib = parent.children sib[sib.index(self) + loop] if parent end end end class IMDB class << self #returns a hash of the differnt titles and what they are # {"Titles (Exact Matches)"=>["Office Space"], #"Titles (Partial Matches)"=>["'Office Space': Out of the Office"], #"Popular Titles"=>["Office Space"], #"Titles (Approx Matches)"=>["Spice Girls: One Hour of Girl Power"]} def title_search(title) movie_name = title movie_name.downcase! #_ is used in the folder names movie_name.gsub!("_","+") movie_name.gsub!(" ","+") #first search #create a hpricot object doc = Hpricot(open(URI.encode("http://www.imdb.com/find?s=all&q=#{movie_name}"))) z = [] doc.search("p").each{|x| z.push(x) if !x.search("table").nil? && !x.search("b").first.nil? 
&& x.search("b").first.inner_html.to_s.downcase.include?("title") } p = {} z.each{|x| p.merge!({x.search("b").first.inner_html => x.search("a").collect{|z| z.inner_html}.delete_if{|u| u.include?("") base = data.add_element("movie") base.attributes["name"]=self.title base.attributes["api_version"]="0.2" a=base.add_element("cast") actors.each{|key,value| b=a.add_element("actor") b.add_element("name").text=key b.add_element("role").text=value } base.add_element("run_time").text=runtime base.add_element("plot").text=plot base.add_element("director").text=directors.join(",") base.add_element("writer").text=writers.join(",") #dont think the link plays nice with xml, need to use #base.add_element("poster link").text=poster #CGI::escapeHTML(string) or ERB::Util.html_escape base.add_element("link").text=imdb_link base.add_element("title").text=title base.add_element("date").text=date.to_s base.add_element("rating").text=mpaa base.add_element("user_comments").text=user_comments base.add_element("tag_line").text=tagline base.add_element("plot").text=plot #base.add_element("keywords").text=unescapeHTML(keywords.join(",")) g=base.add_element("genres") genres.each{|value| g.add_element("type").text=value } result = "" data.write(result) return result end #returns an array of the possable titles for a search. #if its not in the list try a differnt name def title_search return IMDB.title_search(@movie_name) end ###############olds def tagline #"Work Sucks." z = page_info_tags.select{|x| text_clean(x.search("h5").text())=="tagline"} return z.first.search("h5").first.next_node.to_s.strip unless z.first.nil? "" end #returns an array of genres def genre #["Comedy", "Crime"] z = page_info_tags.select{|x| text_clean(x.search("h5").text())=="genre"} return z.first.search("a").select{|z| !z.classes.include?("inline")}.collect{|b| b.inner_html} unless z.first.nil? 
[] end alias_method :genres,:genre #returns a date object with the release date def release_date #19 February 1999 (USA) z = page_info_tags.select{|x| text_clean(x.search("h5").text())=="release_date"} return Date.new if z.first.nil? date2 = z.first.search("h5").first.next_node.to_s.strip.split day = date2[0] month = Date::MONTHNAMES.index(date2[2]) year = date2[3] x=Date.parse(date2[0...3].join(" "),"%d %B %Y") x end alias_method :date,:release_date #returns an array of writers def writer #["Mike Judge"] z = page_info_tags.select{|x| text_clean(x.search("h5").text())=="writers"} return z.first.search("a").select{|z| !z.inner_html.include?('(WGA)') && !z.classes.include?("tn15more")}.collect{|b| b.inner_html.to_s.strip}.uniq unless z.first.nil? [] end alias_method :writers,:writer #returns an array of directors def director #["Mike Judge"] z = page_info_tags.select{|x| text_clean(x.search("h5").text())=="director"} return z.first.search("a").select{|z| !z.classes.include?("tn15more")}.collect{|b| b.inner_html.to_s.strip}.uniq unless z.first.nil? [] end alias_method :directors,:director #a string with what hollywood calls plot these days def plot #Comedic tale of company workers who hate their jobs and decide to rebel against their greedy boss. z = page_info_tags.select{|x| text_clean(x.search("h5").text())=="plot_outline"} return z.first.search("h5").first.next_node.to_s.strip unless z.first.nil? "" end alias_method :plot_outline,:plot #not sure about supporting this yet #["This plot synopsis is empty. Add a synopsis"] #@html_info_tags.select{|x| text_clean(x.search("h5").text())=="plot_synopsis"}.first.search("a").select{|z| !z.classes.include?("inline")}.collect{|b| b.inner_html} #an array of key words? 
#["Hypnosis", "Cult Comedy", "Kung Fu", "Post It", "Arson"]
  # returns the plot keywords with their HTML entities decoded,
  # [] when the page has no keyword section
  def plot_keywords
    section = info_section("plot_keywords")
    return [] unless section
    links = section.search("a").reject { |link| link.classes.include?("inline") }
    links.collect { |link| unescapeHTML(link.inner_html) }
  end
  alias_method :keywords, :plot_keywords

  #string i hope.
  #2 nominations
  def awards
    info_text("awards")
  end

  #some say its cheating... @html_info_tags.select{|x| text_clean(x.search("h5").text())=="[[%s]]"}.first.search("h5").first.next_node.to_s.strip#

  #string of some user comments
  def user_comments
    info_text("user_comments")
  end

  #title elsewhere in the world
  def also_known_as
    info_text("also_known_as")
  end
  alias_method :aka, :also_known_as

  #string of the rating and why
  def mpaa
    info_text("mpaa")
  end
  alias_method :ratings, :mpaa
  alias_method :rating, :mpaa

  #string # min (hopefully is the format)
  def runtime
    info_text("runtime")
  end

  #country as a string. most likely abbrv
  def country
    info_second_node_html("country")
  end

  #original lang
  def language
    info_second_node_html("language")
  end

  #was it shot in color?
  def color
    info_second_node_html("color")
  end

  #string
  def aspect_ratio
    info_text("aspect_ratio")
  end
  alias_method :aspect, :aspect_ratio

  #string
  def company
    info_second_node_html("company")
  end

  #ratings around the world. its a hash!
  # Each link's text is split on ":" into key and value. Best effort:
  # whatever was collected before a failure is returned, {} when the page
  # has no certification section.
  def certification
    cert_hash = {}
    begin
      #{"UK"=>"15", "Ireland"=>"15", "Chile"=>"TE", "Australia"=>"M", "Argentina"=>"Atp", "Iceland"=>"L", "Sweden"=>"Btl", "Portugal"=>"M/12", "Spain"=>"T", "USA"=>"R", "Finland"=>"S", "France"=>"U", "Peru"=>"PT", "Canada"=>"AA", "Norway"=>"7", "Germany"=>"12", "Netherlands"=>"AL"}
      section = info_section("certification")
      if section
        links = section.search("a").reject { |link| link.classes.include?("inline") }
        links.each do |link|
          parts = link.inner_html.split(':')
          cert_hash[parts[0]] = parts[1]
        end
      end
    rescue StandardError
      # deliberately best effort, but only ordinary errors are swallowed now;
      # rescuing Exception also hid interrupts and exit requests
    end
    cert_hash
  end
  alias_method :certs, :certification
  alias_method :certifications, :certification

  #just the title in case it is different than the one you had used
  def title
    doc = load_page
    heading = doc.search("h1").first.inner_html
    # both cleaners edit the string in place; their return values are not used
    clean_html_tags(heading)
    clean_9_0(heading)
    heading
  end

  # returns a hash mapping each row of the cast table (name cell => character
  # cell). Best effort: {} or a partial hash when the table is missing or a
  # row cannot be read.
  def actors
    doc = load_page
    cast = {}
    begin
      doc.search("table[@class='cast']").first.search("tr").each do |row|
        cast[row.search("td[@class='nm']/a").inner_html] = row.search("td[@class='char']").inner_html
      end
    rescue StandardError
      # deliberately best effort; see certification
    end
    cast
  end
  alias_method :cast, :actors

  private

  # Returns the first element of page_info_tags whose cleaned <h5> text
  # equals +label+, or nil when there is none.
  def info_section(label)
    page_info_tags.find { |div| text_clean(div.search("h5").text()) == label }
  end

  # Returns the stripped text of the node right after the <h5> of the
  # +label+ section, "" when the section is missing.
  def info_text(label)
    section = info_section(label)
    return "" unless section
    section.search("h5").first.next_node.to_s.strip
  end

  # Returns the inner html of the second node after the <h5> of the +label+
  # section, "" when the section or that node is missing.
  def info_second_node_html(label)
    section = info_section(label)
    return "" unless section
    node = section.search("h5").first.next_node(2)
    node ? node.inner_html : ""
  end

  #needs better xpath
  #gets the upper level to find the poster html.
should be broken out in xpaths def details_based_on_poster_attribute doc = load_page data = [] doc.search("table").each{|rate| (rate/"a").each{|link| data.push(rate) if link.attributes['name']=="poster" }} return data end #so i dont have to run the info tag search alot def page_info_tags return @html_info_tags if @html_info_tags doc = load_page @html_info_tags = doc.search("div[@class='info']") end #loads the page data once and stores it in @page_html def load_page #check if we have page html if so return if @page_html doc = Hpricot(@page_html) else #alter the passed in name to fit the imdb search movie_name = @movie_name movie_name.downcase! #_ is used in the folder names movie_name.gsub!("_","+") movie_name.gsub!(" ","+") #first search #create a hpricot object #for testing... #File.open("/home/sbecker/moo.html","r") { |f| @page_html=f.read } #@imdb_link="adff" #doc=Hpricot(@page_html) doc = Hpricot(open(URI.encode("http://www.imdb.com/find?s=all&q=#{movie_name}"))) #find all links elements = doc.search("a") arr = [] #if the inner html of the link is the same as the movie name add it to the #list. elements.each{|link| arr.push(link.attributes['href']) if link.attributes['href'] && link.attributes['href'].include?("title")} if doc.search("h1").first.nil? #use the first link with the same name as what we search for create a #menu system if more or create more then one entry? 
if !@interactive_load @imdb_link = "http://www.imdb.com#{arr.first}" else @imdb_link = "http://www.imdb.com"+movie_menu(elements) end doc=Hpricot(open(@imdb_link)) @page_html = doc.to_html @html_info_tags = doc.search("div[@class='info']") #some movies do not take you to a search page example is robin hood men #in tights i guess sometimes there is no need for a search page elsif (doc/"/html/head/title").inner_html!="IMDb Search" @imdb_link = "http://www.imdb.com/find?s=all&q=#{movie_name}" @page_html = doc.to_html @html_info_tags = doc.search("div[@class='info']") else #all searches have failed raise "Error: No Inner HTML links found!" end end return doc end #@html_info_tags = doc.search("div[@class='info']") def movie_menu(elements) count = 0 array = [] puts "Pick a number to load" elements.each{|link| if link.attributes['href'] && link.attributes['href'].include?('/title/') && count<@MAXCOUNT puts count.to_s+")"+unescapeHTML(link.inner_html)+" "+link.next_node.to_s array.push(link.attributes['href']) count = count+1 end } number = gets return array[number.to_i%array.size] end #needed to clean a few things def text_clean(text) cleaned = text cleaned.downcase! clean_html_tags(cleaned) clean_9_0(cleaned) cleaned.gsub!(/[^a-z\s]*/,'') cleaned.strip! 
cleaned.gsub!(/ /,'_') cleaned end def clean_html_tags(cleaned) cleaned.gsub!(/<[^<]*>/,'') end def clean_9_0(cleaned) cleaned.gsub!(/\([^\(]*\)/,'') end #Jacked from http://www.rubycentral.com/book/tut_stdtypes.html def unescapeHTML(string) str = string.dup str.gsub!(/&(.*?);/n) { match = $1.dup case match when /\Aamp\z/ni then '&' when /\Aquot\z/ni then '"' when /\Agt\z/ni then '>' when /\Alt\z/ni then '<' when /\A#(\d+)\z/n then Integer($1).chr when /\A#x([0-9a-f]+)\z/ni then $1.hex.chr end } str end end #begin # ["scotland pa","MaLlRaTs","Doctor+Zhivago","blue_velvet","die hard","die hard 2","ghost in the shell","pi","office_space","11:14","high plains drifter"].each{|movie| # puts movie # movie=IMDB.new(movie)# or office_space or OFFICE_SPACE # #pp movie.get_links # pp movie.actors # pp movie.title # pp movie.poster_link # pp movie.rating # pp movie.aka # pp movie.also_known_as # pp movie.aspect # pp movie.aspect_ratio # pp movie.awards # pp movie.certification # pp movie.certifications # pp movie.certs # pp movie.color # pp movie.company # pp movie.country # pp movie.date # pp movie.director # pp movie.directors # pp movie.genre # pp movie.genres # pp movie.imdb_link # pp movie.keywords # pp movie.language # pp movie.mpaa # pp movie.page_html # pp movie.plot # pp movie.plot_keywords # pp movie.plot_outline # pp movie.poster_link # pp movie.rating # pp movie.ratings # pp movie.release_date # pp movie.runtime # pp movie.tagline # pp movie.user_comments # pp movie.writer # pp movie.writers # pp movie.title_search # pp movie.to_xml # sleep 30 # } #rescue Exception=>e # puts "------------"+e # puts e.backtrace*"\n" #end # #begin # pp IMDB.title_search("white strips") #rescue Exception => e # pp e #end