Skip to content

Instantly share code, notes, and snippets.

@jakevose
Created October 19, 2016 19:19
Show Gist options
  • Save jakevose/57dcc26765f38bb9b138164301c99993 to your computer and use it in GitHub Desktop.
Save jakevose/57dcc26765f38bb9b138164301c99993 to your computer and use it in GitHub Desktop.
Rewriting munged PDF files.
require 'fileutils'

# Batch tool for rewriting munged PDF files.
#
# Every +*.pdf+ found under +in/+ is checked for the +%PDF+ signature at the
# very start of the file. Files that have it are listed in +good.log+; files
# that do not are listed in +bad.log+ and a trimmed copy, starting at the
# first line that begins with +%PDF+, is written to the mirrored location
# under +out/+. Progress and per-file errors are written to +error.log+.
# All paths are relative to the current working directory.
class PdfFixer
  INPUT_GLOB = 'in/**/*.pdf'.freeze
  OUTPUT_ROOT = 'out'.freeze
  PDF_MAGIC = '%PDF'.freeze

  # Collects the input paths and resets the output directory.
  #
  # NOTE: this deletes any existing +out/+ directory and recreates it empty.
  def initialize
    @paths = Dir.glob(INPUT_GLOB).sort
    FileUtils.rm_rf(OUTPUT_ROOT)
    FileUtils.mkdir_p(OUTPUT_ROOT)
  end

  # Checks every collected input file and repairs the ones with a bad start.
  # The log files are closed even when processing raises.
  #
  # @return [Array<String>] the input paths that were considered
  def process_files
    start_logging
    @paths.each { |path| validate_and_repair(path) }
  ensure
    stop_logging
  end

  private

  # Opens (truncating) the three log files in the working directory.
  def start_logging
    @log = File.open('error.log', 'w')
    @good = File.open('good.log', 'w')
    @bad = File.open('bad.log', 'w')
    @log.puts('Beginning pdf fix.')
  end

  # Tells whether the file begins with the PDF signature, then rewinds it so
  # the caller can read it again from the top.
  #
  # Reads just the signature bytes instead of a whole line: an empty file
  # yields +nil+ here (treated as "bad") rather than raising EOFError.
  #
  # @param file [File] an open, readable file
  # @return [Boolean]
  def good_start?(file)
    header = file.read(PDF_MAGIC.bytesize).to_s
    file.rewind
    header.start_with?(PDF_MAGIC)
  end

  # Classifies one input file into good.log / bad.log and attempts a repair
  # for bad ones. The input is opened in binary mode (PDFs are not text) and
  # in block form so the handle is always closed.
  #
  # A failure on one file is logged and does not stop the remaining files.
  #
  # @param path [String] path of the input PDF
  def validate_and_repair(path)
    File.open(path, 'rb') do |file|
      if good_start?(file)
        @good.puts(path)
      else
        @bad.puts(path)
        try_repair(file)
      end
    end
  rescue StandardError => e
    @log.puts("Error in validate_and_repair for file #{path}: #{e.message}")
  end

  # Copies the file from the first line that begins with +%PDF+ through the
  # end into the mirrored location under +out/+. Lines before that are
  # dropped. When no such line exists nothing is written and a note is
  # logged. Errors are logged rather than raised.
  #
  # @param file [File] an open input file positioned at its start
  def try_repair(file)
    out_file = nil
    file.each_line do |line|
      # The output (and its directory) is created lazily, only once a PDF
      # header has actually been seen.
      out_file ||= open_output(file.path) if line.start_with?(PDF_MAGIC)
      out_file.write(line) if out_file
    end
    @log.puts("No PDF header found in #{file.path}; nothing written.") unless out_file
  rescue StandardError => e
    @log.puts("Error in try_repair for file #{file.path}: #{e.message}")
  ensure
    out_file.close if out_file
  end

  # Maps an input path to its output path by replacing the first path
  # segment with +out+ (e.g. +in/a/b.pdf+ becomes +out/a/b.pdf+).
  #
  # @param input_path [String]
  # @return [String]
  def output_path_for(input_path)
    segments = input_path.split('/')
    File.join(OUTPUT_ROOT, *segments[1..-2], segments.last)
  end

  # Creates the destination directory if needed and opens the output file
  # for binary writing. Uses FileUtils instead of a shell command so that
  # directory names containing spaces or shell metacharacters are safe.
  #
  # @param input_path [String]
  # @return [File] the opened output file; the caller must close it
  def open_output(input_path)
    target = output_path_for(input_path)
    FileUtils.mkdir_p(File.dirname(target))
    File.open(target, 'wb')
  end

  # Writes the closing log line and closes whichever logs were opened.
  # Tolerates a partially failed start_logging so the original error is not
  # masked by a NoMethodError on nil.
  def stop_logging
    @log.puts('Finishing pdf fix.') if @log
    [@log, @good, @bad].compact.each(&:close)
  end
end
# Script entry point: build the fixer (which resets the out/ directory) and
# run it over every PDF found under in/.
fixer = PdfFixer.new
fixer.process_files
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment