Created
November 30, 2010 16:22
-
-
Save sgsinclair/721912 to your computer and use it in GitHub Desktop.
This *simply* converts HTML files from a directory into plain text files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *Simple* script by Stéfan Sinclair to extract text from Wikileaks Cablegate
#
# Usage:
#   ruby cablegateHtml2text [input_directory] [output_directory]
#
# If the input directory or the output directory is not specified, the current
# directory is used in its place.
# Walk input_directory recursively and convert every '.html' file found.
#
# Entries whose name starts with a dot (including '.' and '..') are skipped.
# Sub-directories are descended into with the SAME output_directory, so all
# generated files end up side by side in one flat directory.
#
# input_directory  - directory to scan
# output_directory - directory handed unchanged to process_file
def process_directory(input_directory, output_directory)
  # Dir.entries reads the listing and closes the directory straight away, so
  # no handle stays open while we recurse; sorting makes the order stable.
  Dir.entries(input_directory).sort.each do |file|
    next if file.start_with?('.') # skip dot files

    full_file = File.expand_path(file, input_directory)
    if File.directory?(full_file)
      process_directory(full_file, output_directory)
    elsif File.extname(file) == '.html'
      process_file(full_file, output_directory)
    end
  end
end
# Convert one Cablegate HTML page into a plain-text file.
#
# The metadata (cable id, date, classification, origin) is taken from the
# links inside the <table class='cable'> element and used to build the output
# file name "<date> <cable> <classification> <origin>.txt" (':' and '/' are
# replaced by '_'). The contents of all <pre> elements are concatenated,
# stripped of tags and written to that file inside output_directory.
#
# file             - path of the HTML file to read
# output_directory - directory that receives the .txt file
#
# Raises RuntimeError when the cable table or one of its metadata links is
# missing from the page.
def process_file(file, output_directory)
  contents = File.read(file)

  # grab metadata
  start_table = contents.index("<table class='cable'>")
  raise "#{file}: no <table class='cable'> element found" if start_table.nil?

  # Search for the closing tag only AFTER the opening one; otherwise an
  # earlier table on the page would yield a negative slice length.
  closing_tag = '</table>'
  end_table = contents.index(closing_tag, start_table)
  raise "#{file}: cable table is never closed" if end_table.nil?

  table = contents[start_table, end_table - start_table + closing_tag.length]

  entry = {}
  %w(cable date classification origin).each do |key|
    match = table.match(%r{<a href='/#{key}/.+?'>(.+?)</a>})
    raise "#{file}: no '#{key}' link found in cable table" if match.nil?

    entry[key.intern] = match[1]
  end

  # grab text; /m lets a <pre> block span several physical lines
  text = contents.scan(/<pre>(.+?)<\/pre>/m).flatten.join

  # make new lines more readable
  # NOTE(review): the copy of this script under review had a literal line
  # break inside this pattern (a no-op); it is presumed to have been the
  # &#x000A; character reference before the page was scraped -- confirm.
  text.gsub!(/&#x000A;/, "\n")
  text.gsub!(/<\/?\w+\b.*?>/, '') # get rid of tags

  # output file
  filename = [entry[:date], entry[:cable], entry[:classification], entry[:origin]].join(' ')
  filename.gsub!(/[:\/]/, '_')

  # Block form closes the file even if the write raises.
  File.open(File.join(output_directory, filename + '.txt'), 'w') do |out|
    out.write(text)
  end
end
# Script entry point: the first argument is the input directory and the second
# the output directory; each one falls back to the current working directory
# when it is not given on the command line.
process_directory(ARGV[0] || Dir.getwd, ARGV[1] || Dir.getwd)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment