Created
February 9, 2009 16:53
-
-
Save indirect/60863 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
require 'fileutils'

# Patterns matched against the HTML being cleaned. All are case-insensitive.

# A horizontal rule, optionally followed by one whitespace character and a
# numbered anchor (<A name=12></a>).
$pageBreakTag = /<hr>(\s<A name=[0-9]+><\/a>)?/i

# A line-break tag.
$brTag = /<br>/i

# An opening body tag with any attributes.
$bodyTag = /<body[^>]*>/i

# Any text containing "file" followed later on the same line by a <br> tag.
$fileHeaderTag = /file.*<br>/i

# Sadness, I know, but how else am I going to kill these?
# Lines listed here are compared with Array#include? against whole input
# lines, so each entry must match a line exactly (line terminator included).
$linesToKill = [
]
class String
  # Returns the length of the string once everything that looks like a tag
  # (a "<", any run of non-">" characters, then ">") has been removed.
  # Characters outside tags, including line terminators, are counted.
  def plaintext_length
    gsub(/<[^>]*>/, '').length
  end
end
# Cleans up an HTML document (the script feeds it pdftohtml output) and
# returns the result as a new string.
#
# Steps:
# 1. Remove every page break ($pageBreakTag) and strip the attributes from
#    the body tag.
# 2. Drop lines matching $fileHeaderTag and lines listed in $linesToKill.
# 3. For each remaining line: if it contains a <br> and its plain-text length
#    exceeds 80% of the average plain-text line length, replace its <br> tags
#    with spaces (so long lines flow together); otherwise, once the body tag
#    has been seen, append a "<br>" to it.
#
# html:: the HTML text to clean (not modified).
# Returns the cleaned HTML; an empty input yields an empty string.
def cleanHTML(html)
  html = html.gsub($pageBreakTag, '').gsub($bodyTag, '<body>')

  # String#to_a was removed in Ruby 1.9; each_line yields the same pieces,
  # each still carrying its line terminator.
  lines = html.each_line.to_a

  total_length = lines.inject(0) { |sum, text| sum + text.plaintext_length }
  threshold = lines.empty? ? 0.0 : 0.8 * total_length / lines.size

  # Kept local so the "inside <body>" state cannot leak from one document to
  # the next, nor between threads converting different files.
  in_body = false
  final = ""
  lines.each do |line|
    next if $fileHeaderTag.match(line) || $linesToKill.include?(line)

    in_body ||= !$bodyTag.match(line).nil?
    if $brTag.match(line) && line.plaintext_length > threshold
      final << line.gsub($brTag, ' ')
    else
      final << line
      final << "<br>" if in_body
    end
  end
  final
end
# Returns the entries of +args+ whose names end in ".pdf".
def pdf_arguments(args)
  args.select { |name| name =~ /\.pdf\z/ }
end

# Returns the output path for +pdf_file+: the same path with its trailing
# ".pdf" replaced by ".html".
def html_path_for(pdf_file)
  pdf_file.sub(/\.pdf\z/, '.html')
end

# When run as a script: convert every PDF named on the command line, one
# thread per file, writing the cleaned HTML next to the source file and
# printing each file name once it is done.
if $0 == __FILE__
  threads = pdf_arguments(ARGV).map do |filename|
    Thread.new(filename) do |pdf_file|
      # The argv form of IO.popen bypasses the shell, so file names that
      # contain quotes, spaces or shell metacharacters are passed verbatim.
      raw_html = IO.popen(['pdftohtml', '-stdout', '-q', pdf_file], &:read)
      # Convert before opening the output so a failed conversion does not
      # leave an empty .html file behind.
      cleaned = cleanHTML(raw_html.chomp)
      File.open(html_path_for(pdf_file), 'w') { |out| out << cleaned }
      puts pdf_file
    end
  end
  threads.each { |t| t.join }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment