Created
August 22, 2008 20:23
-
-
Save indirect/6848 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby -w
require 'fileutils'

# Patterns and data shared by cleanHTML. All patterns are case-insensitive.

# Page-break marker: an <hr>, one whitespace character, then an empty
# numbered anchor. cleanHTML deletes every occurrence.
$pageBreakTag = /<hr>\s<A name=[0-9]+><\/a>/i

# A <br> tag anywhere in a line.
$brTag = /<br>/i

# Opening <body> tag with any attributes; cleanHTML rewrites it to a bare
# <body> and uses it to detect where the document body starts.
$bodyTag = /<body[^>]*>/i

# Any line containing "file" followed later by <br>; cleanHTML drops
# such lines entirely.
$fileHeaderTag = /file.*<br>/i

# Exact lines (including the trailing newline) that cleanHTML drops.
# These are hard-coded per document because no pattern identifies them.
$linesToKill = [ "Ursula K. LeGuin - The Ekumen 01 - ROCANNON'S WORLD<br>\n" ]
# Cleans up an HTML document (the script feeds it `pdftohtml` output) so
# that hard-wrapped lines flow together as paragraphs.
#
# Steps:
# 1. Delete every $pageBreakTag and rewrite the <body ...> tag to <body>.
# 2. Compute a threshold: 80% of the mean visible (tag-stripped) line length.
# 3. Walk the lines:
#    - lines matching $fileHeaderTag or listed in $linesToKill are dropped;
#    - a line that contains a <br> and is longer than the threshold is
#      treated as a wrapped line: its <br> tags are removed;
#    - any other line is kept as is and, once the <body> tag has been seen,
#      gets a "<br>" appended (after the line's own newline).
#
# html    - String holding the HTML document.
#
# Returns a new String with the cleaned HTML ("" for empty input).
def cleanHTML( html )
  html = html.gsub($pageBreakTag, '').gsub($bodyTag, '<body>')

  total = 0
  count = 0
  html.each_line do |line|
    total += line_length(line)
    count += 1
  end
  # Float arithmetic: for empty input this is NaN rather than an error, and
  # the loop below never runs in that case.
  avg_line_length = 0.8 * total / count

  # Tracked per call. Keeping this in an instance variable would let the
  # "body already seen" state leak into later calls.
  in_body = false
  final = ''.dup

  html.each_line do |line|
    next if $fileHeaderTag.match(line) || $linesToKill.include?(line)

    in_body ||= !$bodyTag.match(line).nil?

    if $brTag.match(line) && line_length(line) > avg_line_length
      final << line.gsub($brTag, '')
    else
      final << line
      final << '<br>' if in_body
    end
  end

  final
end
# Returns the visible length of an HTML fragment: the number of characters
# left after removeTags has stripped the tags. A trailing newline, if
# present, is counted.
#
# html    - String of HTML.
def line_length( html )
  removeTags(html).length
end
# Returns a copy of html with every "<...>" run removed, i.e. each span
# from a "<" up to the next ">". The argument is not modified.
#
# html    - String of HTML.
def removeTags( html )
  html.gsub(/<[^>]*>/, '')
end
# Script entry point: converts every PDF named on the command line.
#
# For each argument whose extension is ".pdf" (case-insensitive), runs
# `pdftohtml -stdout -q` on it, cleans the result with cleanHTML, writes it
# to "<basename>.html" in the current directory and prints that file name.
if $0 == __FILE__
  # select (not detect): we need every matching argument, as an Array.
  pdf_files = ARGV.select { |f| File.extname(f).downcase == '.pdf' }

  pdf_files.each do |pdf_file|
    # Strip the file's actual extension so "X.PDF" becomes "X.html" too.
    filename = File.basename(pdf_file, File.extname(pdf_file)) + '.html'

    # Argv-array form: no shell is involved, so quotes or other shell
    # metacharacters in the file name cannot break or alter the command.
    html = IO.popen(['pdftohtml', '-stdout', '-q', pdf_file]) { |io| io.read }

    File.open(filename, 'w') { |f| f.write(cleanHTML(html)) }
    puts filename
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment