#!/usr/bin/env ruby
# frozen_string_literal: true

# Probes the numbered /browse/<id> pages of the site at BASE_URL, records each
# page's <title> against its id, groups the titles into a nested hash, and
# writes the result as YAML to config/pirate_categories.yml next to this file.

require "net/http"
require "uri"
require "cgi"
require "yaml"

BASE_URL = URI.parse("http://thepiratebay.org/")
TITLE_PATTERN = %r{<title>(.*?)</title>}
FIRST_ID = 100
LAST_ID = 699
GROUP_SIZE = 100
OUTPUT_PATH = File.join(File.dirname(__FILE__), "config", "pirate_categories.yml")

# Fetches /browse/<id> and, when the page has a title, stores title => id in
# +categories+.
#
# Returns false when the response body is empty (the signal both scans use to
# leave the current group of ids), true otherwise. A non-empty page without a
# <title> still counts as present but is not recorded.
def record_category(http, id, categories)
  body = http.get("/browse/#{id}").body.to_s
  return false if body.empty?

  # String#[] with a capture index yields the first match as a String (or nil).
  # Array#to_s on the result of #scan would yield an inspect-style string such
  # as '[["..."]]' on Ruby 1.9+, which breaks the grouping step.
  title = body[TITLE_PATTERN, 1]
  categories[title] = id if title
  true
end

# Walks ids upwards from FIRST_ID. On the first empty page in a group of
# GROUP_SIZE ids it jumps to the start of the next group.
def scan_forward(http, categories)
  id = FIRST_ID
  while id <= LAST_ID
    if record_category(http, id, categories)
      id += 1
    else
      id = ((id / GROUP_SIZE) + 1) * GROUP_SIZE
    end
    puts id # progress output: the next id to be probed
  end
end

# Walks ids downwards from LAST_ID. On the first empty page it jumps to the
# highest id of the previous group (e.g. 698 -> 599), so that every group's
# top end is probed. Presumably this is meant to catch ids such as x99 that
# sit after a gap the forward scan never crosses -- confirm against the site.
def scan_backward(http, categories)
  id = LAST_ID
  while id > 0
    if record_category(http, id, categories)
      id -= 1
    else
      id = (id / GROUP_SIZE) * GROUP_SIZE - 1
    end
    puts id # progress output: the next id to be probed
  end
end

# Builds the hash that gets dumped to YAML.
#
# "_RAW" holds the untouched title => id map. Every title is HTML-unescaped
# and split on ">": the first part becomes the group key, the last part (with
# " - TPB" removed) becomes the entry name inside that group.
def group_categories(raw)
  grouped = { "_RAW" => raw }
  raw.each do |raw_title, id|
    parts = CGI.unescapeHTML(raw_title).split(">")
    next if parts.empty? # blank title: nothing to group under

    group = parts.first.strip
    name = parts.last.strip.sub(" - TPB", "")
    grouped[group] ||= {}
    grouped[group][name] = id
  end
  grouped
end

# Dumps +categories+ as YAML to +path+, creating the parent directory if it
# does not exist yet.
def write_categories(categories, path)
  dir = File.dirname(path)
  Dir.mkdir(dir) unless File.directory?(dir)
  File.open(path, "w") { |file| file.puts(YAML.dump(categories)) }
end

raw_categories = {}

# One HTTP session serves both scans instead of reconnecting per request.
Net::HTTP.start(BASE_URL.host, BASE_URL.port) do |http|
  scan_forward(http, raw_categories)
  scan_backward(http, raw_categories)
end

write_categories(group_categories(raw_categories), OUTPUT_PATH)