# This class takes the name of a movie and gets data about it from IMDb.
#
# Author::    Stephen Becker IV (mailto:sbecker@x.y@gmail )
# Copyright:: Copyright
# License::   Distributes under the same terms as Ruby
# version::   0.2
#
# Thank you why for hpricot! Thank you for the text()!
#
# NOTE(review): this file was recovered from a copy in which all line breaks
# had been collapsed and one span of text had been dropped (see the
# NOTE(review) markers in IMDB#set_plot_and_time and IMDB#load_page). The
# layout has been restored; the marked spans are reconstructions and must be
# checked against an upstream copy before they are trusted.

require 'rubygems'
# needs version >= 0.5.00 of hpricot
require 'hpricot'
require 'open-uri'
require 'uri'
require 'pp'
require "rexml/document"

# Read about my suggestion @ http://code.whytheluckystiff.net/hpricot/ticket/37
# This will just step to the next node X times;
# node_at just did not float my boat.
module Hpricot
  module Traverse
    # Returns the node neighboring this node to the south: just below it.
    # This method includes text nodes and comments and such.
    #
    # loop:: how many siblings to step forward (default 1).
    #
    # Returns nil when this node has no parent or when the requested
    # position is past the last sibling.
    def next_node(loop=1)
      # Check the parent first; asking a missing parent for its children
      # would raise NoMethodError.
      return nil unless parent
      sib = parent.children
      sib[sib.index(self) + loop]
    end
  end
end

# TODO
# get any thing else
# get all directors
class IMDB
  # We save some things for a single load of the page; other things do not
  # need to be saved. Benchmarking the difference is still to be done.
  #
  # movie::            the name to search IMDb for (not necessarily the title).
  # interactive_load:: when true, a command line menu asks which search hit
  #                    to load instead of taking the first one.
  def initialize(movie="",interactive_load=false)
    @imdb_link=nil
    # max guessing links
    @MAXCOUNT=15
    # does command line menu if true
    @interactive_load=interactive_load
    # not title
    @movie_name=movie
    # html section of imdb that includes the poster, year, title, director
    @short_details=nil
    # html of the whole page. only get this once
    @page_html=nil
    # viewing time
    @movie_length=nil
    # plot
    @movie_plot=nil
    # cast html that will be replaced with a better representation
    @cast_html=nil
    @movie_title=nil
    @movie_year=nil
  end

  # Complete html of the page.
  def page_html
    load_page.to_html
  end

  # Html that is the cast list, with pics and striping.
  # The result is cached after the first call.
  def get_cast_html
    return @cast_html if @cast_html
    data=details_based_on_poster_attribute
    # The cast table is the last table that carries a "blackcatheader" label.
    cast_table=data.first.search("table").select{|table| table.at("b[@class='blackcatheader']")}.last
    @cast_html=cast_table.to_html
  end

  # Html section of imdb that includes the poster, year, title, director.
  # The result is cached after the first call.
  def get_short_details_html
    return @short_details if @short_details
    data=details_based_on_poster_attribute
    table=data[1].at(:table)
    # kills part of the table that displays imdb related info
    table.search("td").last.attributes['style'] = 'display:none;'
    @short_details=table.to_html
  end

  # Array of genres; no html for this.
  def genres
    doc=load_page
    genre_links=doc.search("a").select{|x|
      href=x.attributes['href']
      href && href.include?("Genres")
    }
    genre_links.collect{|x| x.inner_html}
  end

  # Rating and reason.
  # Returns the regex match when an MPAA line is found in the page html,
  # otherwise the string "None provided".
  def movie_rating_html
    load_page if !@page_html
    # We really just want the .* part of the regex; it can be isolated with
    # a gsub or by using xpath.
    # NOTE(review): in the recovered text the pattern ends with a line break
    # after ".*", written here as \n. The upstream pattern may have ended in
    # a tag that was stripped during extraction - confirm.
    rate = /MPAA<\/a>:<\/b>.*\n/.match(@page_html)
    rate||= "None provided"
  end

  # Same as above, as a string. Maybe the html should be stripped?
  def mpaa
    movie_rating_html.to_s
  end

  # Returns an array of strings with the alternative titles listed.
  def other_titles
    return @other_titles if @other_titles
    set_plot_and_time
    @other_titles
  end

  # Year of release.
  def year
    return @movie_year if @movie_year
    doc=load_page
    @movie_year=(doc.search("//h1/strong/small/a/text" )).to_s.strip
  end

  # The movie name that was searched for might not be the proper title;
  # this is the title as shown on the page.
  def title
    return @movie_title if @movie_title
    doc=load_page
    @movie_title=(doc.search("//h1/strong/text" )).to_s.strip
  end

  # The link to the movie page (nil until a page has been loaded).
  def imdb_link
    @imdb_link
  end

  # The link to the poster.
  def poster_link
    doc=load_page
    # just following the xpath
    (doc/"//a[@name='poster']/img").first["src"]
  end

  # Just the first director.
  def director
    doc=load_page
    doc.search("b[@class='blackcatheader']")[0].next_node(3).inner_html
  end

  # How long the movie is. Returns a string; time is in mins.
  def run_time
    return @movie_length if @movie_length
    set_plot_and_time
    @movie_length
  end

  # Most movies do not have a plot.
  def plot
    return @movie_plot if @movie_plot
    set_plot_and_time
    @movie_plot
  end

  # Serializes the movie to an XML string with a <movie> root element.
  # (An earlier revision also printed the title to stdout here; that was
  # debug output and has been removed.)
  def to_xml
    data=REXML::Document.new("")
    base=data.add_element("movie")
    base.attributes["name"]=self.title
    base.attributes["api_version"]="0.1"
    cast=base.add_element("cast")
    actors.each{|name,role|
      actor=cast.add_element("actor")
      actor.add_element("name").text=name
      actor.add_element("role").text=role
    }
    base.add_element("run_time").text=run_time
    base.add_element("plot").text=plot
    base.add_element("director").text=director
    # dont think the link plays nice with xml, need to use
    # base.add_element("poster link").text=poster
    # CGI::escapeHTML(string) or ERB::Util.html_escape
    base.add_element("link").text=imdb_link
    base.add_element("title").text=title
    base.add_element("year").text=year
    base.add_element("rating").text=mpaa
    genre_list=base.add_element("genres")
    genres.each{|value|
      genre_list.add_element("type").text=value
    }
    result=""
    data.write(result)
    result
  end

  # A hash where the actor name is the key and the value is the role.
  # Needs better xpath.
  def actors
    # Strip the tags and keep only the text between them.
    pieces=get_cast_html.split(/<[^>]*>/)
    # Drop separators, headings and other filler text.
    pieces=pieces.reject{|x| x.size<=1 || x==" .... " || x=="Cast overview, first billed only: " || x=="  " || x=="(more)"}
    # What is left alternates name, role, name, role, ...
    actors_hash={}
    pieces.each_slice(2){|name,role| actors_hash[name]=role }
    actors_hash
  end

  # Returns an array of the possible titles for a search (at most @MAXCOUNT).
  # If the movie is not in the list try a different name.
  def get_links
    movie_name=search_query
    # first search: create a hpricot object
    doc = Hpricot(open(URI.encode("http://www.imdb.com/find?s=all&q=#{movie_name}")))
    arr=[]
    # Collect every link that points at a title page, up to the cap.
    doc.search("a").each{|link|
      break if arr.size>=@MAXCOUNT
      next unless title_link?(link)
      arr.push(unescapeHTML(link.inner_html)+" "+link.next_node.to_s)
    }
    arr
  end

  private

  # The search term in the form the IMDb find url takes: lower case, with
  # "_" (used in folder names) and spaces turned into "+".
  # Builds a new string so @movie_name itself is never modified.
  def search_query
    @movie_name.downcase.gsub("_","+").gsub(" ","+")
  end

  # True-ish when the link element has an href pointing at a title page.
  def title_link?(link)
    href=link.attributes['href']
    href && href.include?('/title/')
  end

  # Plot, running time and alternative titles are found in the same pass
  # over the page, so set them all together.
  def set_plot_and_time
    doc=load_page
    doc.search("//b"){|x|
      case x.inner_html
      when "Plot Outline:"
        @movie_plot=x.next_node.to_s.strip
      when "Also Known As:"
        count=2
        @other_titles=[]
        # have to find all text nodes
        # NOTE(review): the recovered text breaks off inside the terminator
        # string of this loop. The "<b" terminator, the loop body and the
        # step of 2 are reconstructions - verify against upstream.
        until (node=x.next_node(count)).nil? || node.to_s.include?("<b")
          @other_titles.push(node.to_s.strip)
          count=count+2
        end
      # NOTE(review): no assignment to @movie_length survives in the
      # recovered text although run_time relies on this method to set it;
      # this branch and its "Runtime:" label are reconstructions.
      when "Runtime:"
        @movie_length=x.next_node.to_s.strip
      end
    }
  end

  # NOTE(review): get_cast_html and get_short_details_html call
  # details_based_on_poster_attribute, but no definition of it survives in
  # the recovered text (it was presumably in the dropped span). It is
  # deliberately not re-invented here; restore it from upstream.

  # Loads the movie page (once) and returns it as a Hpricot document.
  # Sets @imdb_link and @page_html as a side effect.
  # Raises RuntimeError when the search turns up nothing usable.
  def load_page
    # NOTE(review): everything in this method above "if arr.size>0" was lost
    # from the recovered text and is a reconstruction based on how doc,
    # elements, arr and movie_name are used below - verify against upstream.
    return Hpricot(@page_html) if @page_html
    movie_name=search_query
    doc=Hpricot(open(URI.encode("http://www.imdb.com/find?s=all&q=#{movie_name}")))
    elements=doc.search("a")
    arr=elements.collect{|link| link.attributes['href'] if title_link?(link)}.compact
    if arr.size>0
      # Use the first link with the same name as what we search for.
      # Create a menu system if more, or create more than one entry?
      if !@interactive_load
        @imdb_link="http://www.imdb.com#{arr.first}"
      else
        @imdb_link="http://www.imdb.com"+movie_menu(elements)
      end
      doc=Hpricot(open(@imdb_link))
      @page_html=doc.to_html
    # Some movies do not take you to a search page (example: robin hood men
    # in tights); sometimes there is no need for a search page.
    elsif (doc/"/html/head/title").inner_html!="IMDb Search"
      @imdb_link="http://www.imdb.com/find?s=all&q=#{movie_name}"
      @page_html=doc.to_html
    else
      # all searches have failed
      raise "Error: No Inner HTML links found!"
    end
    doc
  end

  # Prints a numbered menu of the title links found in elements (at most
  # @MAXCOUNT), reads a number from the user and returns the href of the
  # chosen entry. The number wraps around the size of the list.
  def movie_menu(elements)
    count=0
    array=[]
    puts "Pick a number to load"
    elements.each{|link|
      next unless title_link?(link) && count<@MAXCOUNT
      puts count.to_s+")"+unescapeHTML(link.inner_html)+" "+link.next_node.to_s
      array.push(link.attributes['href'])
      count=count+1
    }
    number=gets
    array[number.to_i%array.size]
  end

  # Jacked from http://www.rubycentral.com/book/tut_stdtypes.html
  #
  # Returns a copy of string with &amp; &quot; &gt; &lt; and numeric
  # entities (decimal or hex, code points below 256) replaced by the
  # characters they stand for. Anything else between "&" and ";" is left
  # exactly as it was instead of being deleted.
  def unescapeHTML(string)
    string.gsub(/&(.*?);/n) {
      # Copy the capture: the regex tests below overwrite $1.
      match = $1.dup
      untouched = "&"+match+";"
      case match
      when /\Aamp\z/ni  then '&'
      when /\Aquot\z/ni then '"'
      when /\Agt\z/ni   then '>'
      when /\Alt\z/ni   then '<'
      when /\A#(\d+)\z/n
        # to_i always reads decimal; Integer() would treat a leading 0 as
        # octal and reject digits such as "08".
        code = $1.to_i
        code<256 ? code.chr : untouched
      when /\A#x([0-9a-f]+)\z/ni
        code = $1.hex
        code<256 ? code.chr : untouched
      else
        untouched
      end
    }
  end
end

#movie=IMDB.new("office space")
##pp movie.get_links
#pp movie.poster_link
#pp "#####################################"
#pp movie.actors
#pp "#####################################"
#pp movie.plot
#pp "#####################################"
#pp movie.run_time
#pp "#####################################"
#pp movie.year
#pp "#####################################"
#pp movie.title
#pp "#####################################"
#pp movie.director
#pp "#####################################"
#pp movie.genres
#pp "#####################################"
#pp movie.mpaa
#pp "#####################################"
#pp movie.movie_rating_html
#pp "#####################################"
#pp movie.other_titles