public
Created

Goes through the HTML files in the directory /Users/dan/j-archive/html, which have come from http://www.j-archive.com/showgame.php?game_id=<NUMBER> and are called <NUMBER>.html, and parses out the questions, answers, categories, etc. Generates SQL which can be loaded into a SQLite database.

  • Download Gist
generate_db.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
#!/usr/bin/ruby
 
require 'rubygems'
require 'nokogiri'
 
class Clue
attr_accessor :text, :answer, :value, :category, :round, :difficulty, :shownumber
end
 
def escape(text)
t = text.clone
# Get rid of single-quotes already escaped for JS
t.gsub!(/\\'/, "'")
# Escape single-quotes
t.gsub!(/'/, "''")
 
t
end
 
 
def write_round(page, round_name)
title = page.css("title")
title.text =~ /Show #(\d+)/
show_number = $1
 
category_cells = page.css("div##{round_name} td.category_name")
 
clue_tables = page.css("div##{round_name} td.clue>table")
clue_tables.each do |clue_table|
c = Clue.new
 
# Populate the clue text
clue_text_cell = clue_table.css("td.clue_text").first
clue_id = clue_text_cell["id"]
c.text = clue_text_cell.text
 
# Populate the clue category
clue_id =~ /^clue_([FD]?J)_(\d)_(\d)/
category_num = $2.to_i - 1
c.category = category_cells[category_num].text
 
# Populate the clue round and difficulty
c.round = $1
c.difficulty = $3
 
# Populate the clue shownumber
c.shownumber = show_number
 
# Populate the clue value
values = clue_table.css("td.clue_value")
dd_values = clue_table.css("td.clue_value_daily_double")
if values.length > 0
c.value = values.first.text
elsif dd_values.length > 0
c.value = "$0"
else
next
end
 
# Populate the clue answer
shitty_div = clue_table.css("div").first
shitty_div["onmouseover"] =~ /correct_response">(.*?)<\//
answer = $1
answer = answer.downcase
answer.sub!(/^<i>/, "")
answer.sub!(/^"/, "")
answer.sub!(/"$/, "")
c.answer = answer
 
puts "INSERT INTO clue (text, answer, value, category, round, difficulty, shownumber) VALUES
('%s', '%s', '%s', '%s', '%s', '%s', '%s');" % [
escape(c.text),
escape(c.answer),
c.value,
escape(c.category),
c.round,
c.difficulty,
c.shownumber
]
end
end
 
def write_fj(page)
title = page.css("title")
title.text =~ /Show #(\d+)/
show_number = $1
 
category_cell = page.css("div#final_jeopardy_round td.category_name")
 
clue_table = page.css("div#final_jeopardy_round>table").first
return if clue_table.nil?
c = Clue.new
 
# Populate the clue text
clue_text_cell = clue_table.css("td.clue_text").first
clue_id = clue_text_cell["id"]
c.text = clue_text_cell.text
 
# Populate the clue category
c.category = clue_table.css("td.category_name").text
 
# Populate the clue round
c.round = "FJ"
c.difficulty = "0"
 
# Populate the clue shownumber
c.shownumber = show_number
 
# Populate the clue value
c.value = "$0"
 
# Populate the clue answer
shitty_div = clue_table.css("div").first
shitty_div["onmouseover"] =~ /correct_response\\">(.*?)<\//
answer = $1
answer = answer.downcase
answer.sub!(/^<i>/, "")
answer.sub!(/^"/, "")
answer.sub!(/"$/, "")
c.answer = answer
 
puts "INSERT INTO clue (text, answer, value, category, round, difficulty, shownumber) VALUES
('%s', '%s', '%s', '%s', '%s', '%s', '%s');" % [
escape(c.text),
escape(c.answer),
c.value,
escape(c.category),
c.round,
c.difficulty,
c.shownumber
]
end
 
puts "CREATE TABLE clue (text STRING, answer STRING, value STRING, category STRING, round STRING, difficulty STRING, shownumber STRING);"
 
Dir.foreach("/Users/dan/j-archive/html") do |f|
next unless (f =~ /\.html$/)
next unless (File.size("/Users/dan/j-archive/html/#{f}") > 0)
page = Nokogiri::HTML(open("/Users/dan/j-archive/html/#{f}", "r"))
next if page.css("div#jeopardy_round td.clue>table").empty?
 
["jeopardy_round", "double_jeopardy_round"].each do |round_name|
write_round(page, round_name)
end
 
write_fj(page)
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.