Created
November 29, 2010 01:27
-
-
Save alx/719468 to your computer and use it in GitHub Desktop.
Convert cables to text files - CableGate 2010-11 release
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rubygems'
require 'fileutils'
require 'open-uri'
require 'time'
require 'hpricot'
require 'nokogiri'
####
# WikiLeaks CableGate Parser
#
# The first CableGate release (220 cables) has an index with info about all cables.
# Extract that info and fetch the content of each cable to rebuild a text-only version of the content.
#
# Base URL of the mirrored site and local directory receiving the text files.
web_root = "http://localhost/wikileaks"
scrape_root = "/tmp/wikileaks"

# Walk the five index pages (2010-11_0.html .. 2010-11_4.html). Every table
# row carrying five <td> cells describes one cable: id, title, date,
# classification and origin. For each cable not yet saved, fetch its HTML
# page, strip the markup from its <pre> sections and write the resulting text
# file into several per-criterion folders under scrape_root.
5.times do |page|
  index_url = "#{web_root}/reldate/2010-11_#{page}.html"
  p index_url

  # URI#open (provided by open-uri) is used instead of Kernel#open, which no
  # longer accepts URL strings on Ruby 3.0+. The block form closes the handle
  # once Hpricot has parsed it.
  document = URI.parse(index_url).open { |io| Hpricot(io) }

  (document/"//tr").each do |row|
    cells = row/"//td"
    # Header rows have no <td>; rows with fewer than five cells cannot be
    # mapped onto the cable fields, so both are skipped.
    next if cells.nil? || cells.length < 5

    cable = {
      :id             => (cells[0]/"/a").inner_html,
      :title          => cells[1].inner_html,
      :date           => Time.parse((cells[2]/"/a").inner_html),
      :classification => (cells[3]/"/a").inner_html,
      :origin         => (cells[4]/"/a").inner_html
    }

    # "YYYY/MM" path fragment shared by the remote URL and the local layout.
    month_path = cable[:date].strftime("%Y/%m")

    # The copy under dates/ is the marker for "already scraped".
    next if File.exist?("#{scrape_root}/dates/#{month_path}/#{cable[:id]}.txt")

    cable_url = "#{web_root}/cable/#{month_path}/#{cable[:id]}.html"
    p cable_url
    cable_document = URI.parse(cable_url).open { |io| Hpricot(io) }

    # Concatenate every <pre> section: turn the "#x000A;" entity remainder
    # into a real newline, drop the "&" left dangling at each line end, and
    # remove the <a ...> / </a> tags while keeping their text.
    cable[:content] = (cable_document/"//pre").map do |pre|
      pre.inner_html.gsub("#x000A;", "\n").gsub(/&$/, "").gsub(/<a.[^>]*>/, "").gsub("</a>", "")
    end.join

    # One header line per metadata field, followed by the cable body.
    cable_file = [
      cable[:id],
      cable[:title],
      cable[:date].to_s,
      cable[:classification],
      cable[:origin],
      cable[:content]
    ].join("\n")

    # NOTE(review): classification/origin values containing "/" would produce
    # nested directories here -- confirm that is acceptable.
    [
      "#{scrape_root}/cables",
      "#{scrape_root}/dates/#{month_path}",
      "#{scrape_root}/classification/#{cable[:classification]}",
      "#{scrape_root}/origin/#{cable[:origin]}",
      "#{scrape_root}/rel_date/2010/11/"
    ].each do |folder|
      FileUtils.mkdir_p folder
      target = File.join(folder, "#{cable[:id]}.txt")
      p "folder: #{target}"
      File.open(target, "w") do |f|
        f.write cable_file
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment