Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

Convert cables to text files - CableGate 2010-11 release

View cablegate_extract.rb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
require 'rubygems'
require 'fileutils'
require 'open-uri'
require 'time'
require 'nokogiri'
require 'hpricot'
 
####
# WikiLeaks CableGate Parser
#
# The first CableGate release (220 cables) has an index with info about all cables.
# Extract this info and fetch the content of each cable to rebuild a text-only version of the content.
#
 
# Walks the five index pages of the 2010-11 release, reads one cable per table
# row, downloads that cable's HTML page and writes a plain-text copy of it into
# several parallel directory trees under scrape_root.
web_root = "http://localhost/wikileaks"
scrape_root = "/tmp/wikileaks"

# Order of the <td> cells in an index row. The title cell is read as-is;
# every other cell keeps its value inside a link.
columns = [:id, :title, :date, :classification, :origin]

5.times do |page|
  index_url = "#{web_root}/reldate/2010-11_#{page}.html"
  p index_url
  # URI#open is mixed in by open-uri and works on every Ruby version; the bare
  # Kernel#open no longer fetches URLs on Ruby 3.0+.
  document = Hpricot(URI.parse(index_url).open)

  (document/"//tr").each do |row|
    cells = (row/"//td")
    # Rows without a <td> (presumably the table header) hold no cable.
    next unless cells[0]

    cable = {}
    columns.each_with_index do |field, index|
      cell = cells[index]
      cable[field] =
        if field == :title
          cell.inner_html
        else
          (cell/"/a").inner_html
        end
    end
    cable[:date] = Time.parse(cable[:date])

    month_path = cable[:date].strftime("%Y/%m")
    # Every cable gets a copy in the dates/ tree, so that file doubles as the
    # "already scraped" marker. File.exists? was removed in Ruby 3.2.
    next if File.exist?("#{scrape_root}/dates/#{month_path}/#{cable[:id]}.txt")

    # Log the URL that is actually fetched (the old message printed a .txt URL).
    cable_url = "#{web_root}/cable/#{month_path}/#{cable[:id]}.html"
    p cable_url
    cable_document = Hpricot(URI.parse(cable_url).open)

    cable[:content] = (cable_document/"//pre").map do |pre|
      text = pre.inner_html
      # "&#x000A;" becomes "&\n" first, then the dangling "&" at each line end is dropped.
      text = text.gsub("#x000A;", "\n").gsub(/&$/, "")
      # Strip link tags but keep the text they wrap.
      text.gsub(/<a.[^>]*>/, "").gsub("</a>", "")
    end.join

    # File layout: five header lines followed by the cable body.
    cable_file = [
      cable[:id],
      cable[:title],
      cable[:date].to_s,
      cable[:classification],
      cable[:origin],
      cable[:content]
    ].join("\n")

    folders = [
      "#{scrape_root}/cables",
      "#{scrape_root}/dates/#{month_path}",
      "#{scrape_root}/classification/#{cable[:classification]}",
      "#{scrape_root}/origin/#{cable[:origin]}",
      "#{scrape_root}/rel_date/2010/11/"
    ]
    folders.each do |folder|
      FileUtils.mkdir_p folder
      target = File.join(folder, "#{cable[:id]}.txt")
      p "folder: #{target}"
      File.open(target, "w") { |f| f.write cable_file }
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.