Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Convert cables to text files - CableGate 2010-11 release
require 'rubygems'
require 'fileutils'
require 'open-uri'
require 'time'
require 'nokogiri'
require 'hpricot'
####
# WikiLeaks CableGate Parser
#
# The first CableGate release (220 cables) has an index with info about all cables.
# Extract this info and fetch the content of each cable to rebuild a text-only version of the content.
#
web_root = "http://localhost/wikileaks"
scrape_root = "/tmp/wikileaks"
5.times do |page|
p "#{web_root}/reldate/2010-11_#{page}.html"
document = Hpricot(open("#{web_root}/reldate/2010-11_#{page}.html"))
(document/"//tr").each do |c|
cable = {}
current_index = 0
if cable_info = (c/"//td")
if cable_info[0]
5.times do |index|
case index
when 0
cable[:id] = (cable_info[index]/"/a").inner_html
when 1
cable[:title] = cable_info[index].inner_html
when 2
cable[:date] = Time.parse((cable_info[index]/"/a").inner_html)
when 3
cable[:classification] = (cable_info[index]/"/a").inner_html
when 4
cable[:origin] = (cable_info[index]/"/a").inner_html
end
end
unless File.exists? "#{scrape_root}/dates/#{cable[:date].strftime("%Y/%m")}/#{cable[:id]}.txt"
p "#{web_root}/cable/#{cable[:date].strftime("%Y/%m")}/#{cable[:id]}.txt"
cable_document = Hpricot(open("#{web_root}/cable/#{cable[:date].strftime("%Y/%m")}/#{cable[:id]}.html"))
cable[:content] = ""
(cable_document/"//pre").each do |content|
cable[:content] << content.inner_html.gsub("#x000A;", "\n").gsub(/&$/, "").gsub(/<a.[^>]*>/, "").gsub("</a>", "")
end
cable_file = ""
cable_file << cable[:id]
cable_file << "\n"
cable_file << cable[:title]
cable_file << "\n"
cable_file << cable[:date].to_s
cable_file << "\n"
cable_file << cable[:classification]
cable_file << "\n"
cable_file << cable[:origin]
cable_file << "\n"
cable_file << cable[:content]
[
"#{scrape_root}/cables",
"#{scrape_root}/dates/#{cable[:date].strftime("%Y/%m")}",
"#{scrape_root}/classification/#{cable[:classification]}",
"#{scrape_root}/origin/#{cable[:origin]}",
"#{scrape_root}/rel_date/2010/11/"
].each do |folder|
FileUtils.mkdir_p folder
p "folder: #{File.join(folder, "#{cable[:id]}.txt").to_s}"
File.open(File.join(folder, "#{cable[:id]}.txt"), "w") do |f|
f.write cable_file
end
end
end
end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.