public
Last active

Convert cables to text files - CableGate 2010-11 release

  • Download Gist
cablegate_extract.rb
Ruby
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
require 'rubygems'
require 'fileutils'
require 'open-uri'
require 'time'
require 'hpricot'
require 'nokogiri'
 
####
# WikiLeaks CableGate Parser
#
# The first CableGate release (220 cables) has an index with info about all cables.
# Extract this info and fetch the content of each cable to rebuild a text-only version of the content.
#
 
# Base URL of the mirrored site to scrape, and the local directory that
# receives the generated text files.
web_root = "http://localhost/wikileaks"
scrape_root = "/tmp/wikileaks"

# Downloads +url+ and returns the parsed Hpricot document.
# open-uri no longer hooks Kernel#open for URLs on Ruby 3.0+, so URI.open is
# used when it is available, with Kernel#open as the fallback for older
# Rubies. Reading inside the block makes sure the stream gets closed.
fetch_document = lambda do |url|
  html = if URI.respond_to?(:open)
           URI.open(url) { |io| io.read }
         else
           open(url) { |io| io.read }
         end
  Hpricot(html)
end

# Returns the inner HTML of the link(s) directly inside a table cell.
link_text = lambda { |cell| (cell / "/a").inner_html }

# Walk index pages 0..4 of the 2010-11 release.
5.times do |page|
  index_url = "#{web_root}/reldate/2010-11_#{page}.html"
  p index_url
  document = fetch_document.call(index_url)

  (document / "//tr").each do |row|
    cells = row / "//td"
    # Rows without <td> cells (or with fewer than the five columns read
    # below) cannot describe a cable, so skip them instead of crashing on nil.
    next unless cells && cells[0]
    next if cells.length < 5

    # Column order in the index table: id, title, date, classification, origin.
    cable = {
      :id             => link_text.call(cells[0]),
      :title          => cells[1].inner_html,
      :date           => Time.parse(link_text.call(cells[2])),
      :classification => link_text.call(cells[3]),
      :origin         => link_text.call(cells[4])
    }
    date_path = cable[:date].strftime("%Y/%m")

    # The per-date copy doubles as the "already scraped" marker.
    next if File.exist?("#{scrape_root}/dates/#{date_path}/#{cable[:id]}.txt")

    cable_url = "#{web_root}/cable/#{date_path}/#{cable[:id]}.html"
    p cable_url
    cable_document = fetch_document.call(cable_url)

    # The cable body is the concatenation of every <pre> element, with the
    # "#x000A;" newline entities turned into real newlines, the "&" left over
    # at line ends removed, and <a> tags stripped (their text is kept).
    cable[:content] = (cable_document / "//pre").map do |pre|
      pre.inner_html.gsub("#x000A;", "\n").gsub(/&$/, "").gsub(/<a.[^>]*>/, "").gsub("</a>", "")
    end.join

    # One metadata field per line, followed by the content.
    cable_file = [
      cable[:id],
      cable[:title],
      cable[:date].to_s,
      cable[:classification],
      cable[:origin],
      cable[:content]
    ].join("\n")

    # Write the same file into every directory the cable is filed under.
    [
      "#{scrape_root}/cables",
      "#{scrape_root}/dates/#{date_path}",
      "#{scrape_root}/classification/#{cable[:classification]}",
      "#{scrape_root}/origin/#{cable[:origin]}",
      "#{scrape_root}/rel_date/2010/11/"
    ].each do |folder|
      FileUtils.mkdir_p folder
      target = File.join(folder, "#{cable[:id]}.txt")
      p "folder: #{target}"
      File.open(target, "w") do |f|
        f.write cable_file
      end
    end
  end
end

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.