Skip to content

Instantly share code, notes, and snippets.

@eungju
Created January 6, 2011 13:53
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eungju/767906 to your computer and use it in GitHub Desktop.
Save eungju/767906 to your computer and use it in GitHub Desktop.
Download House M.D. Transcripts
#!/usr/bin/env ruby
require 'rubygems'
require 'hpricot'
require 'open-uri'
# Index page that links to every episode transcript.
INDEX_PAGE = "http://community.livejournal.com/clinic_duty/12225.html"

# Fetches +url+ and parses the response body with Hpricot.
#
# Uses URI#open (mixed in by open-uri) rather than Kernel#open: the episode
# URLs are hrefs scraped from a remote page, and Kernel#open runs any string
# starting with "|" as a shell command. Kernel#open also no longer opens URLs
# as of Ruby 3.0, whereas URI#open works on old and new Rubies alike.
def fetch_document(url)
  URI.parse(url).open { |f| Hpricot(f) }
end

(fetch_document(INDEX_PAGE) / "table > tbody > tr > td > a").each do |a|
  # Skip season links (their anchor text is wrapped in <b>)
  next if (a / "b").any?

  # Download a transcript. The node two siblings before the link is split on
  # "." into season and episode -- presumably a "season.episode" label.
  season, episode = a.previous.previous.inner_text.strip.split(".")
  title = a.inner_text.strip
  url = a["href"]
  puts "Download %s.%s %s" % [season, episode, title]
  transcript = (fetch_document(url) / "body div#content-wrapper > div > p").inner_html

  # Wrap the extracted paragraphs in a minimal HTML document, saved as
  # "<season>-<episode>.html" in the current directory.
  File.open("#{season}-#{episode}.html", "w") do |f|
    f.write('<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>')
    f.write(transcript)
    f.write('</body></html>')
  end
end
#!/usr/bin/env ruby
require 'rubygems'
require 'hpricot'
# Builds a case-insensitive regexp matching a transcript line of the form
# "<character>: <text>", allowing whitespace before the name and around the
# colon.
#
# The name is escaped so it is always matched literally: metacharacters in a
# command-line argument (e.g. "House|Wilson" or "House (V.O.)") would
# otherwise change what the pattern means.
def quote_pattern(character)
  Regexp.compile("^\\s*#{Regexp.escape(character)}\\s*:.+$", Regexp::IGNORECASE)
end

# Character to quote: "House" unless exactly one argument is given.
character = "House"
character = ARGV[0] if ARGV.length == 1
pattern = quote_pattern(character)

# Visit the downloaded transcripts in random order and print one random
# matching text node from the first episode that has any.
Dir.glob('?-??.html').shuffle.each do |episode|
  # File.open, not Kernel#open: the glob's "?" can match "|", and Kernel#open
  # would run such a file name as a command.
  lines = File.open(episode) do |f|
    Hpricot(f).search("*").select { |node| node.text? && pattern.match(node.content) }
  end
  next if lines.empty?

  # One shuffle is enough to pick a uniformly random match.
  puts lines.shuffle[0]
  break
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment