Skip to content

Instantly share code, notes, and snippets.

@rjpower
Created March 18, 2012 17:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rjpower/2078056 to your computer and use it in GitHub Desktop.
Save rjpower/2078056 to your computer and use it in GitHub Desktop.
Extracting a title from a PDF document
import os
import shutil
import subprocess
import sys
import tempfile

from BeautifulSoup import BeautifulStoneSoup
def _first_page_xml(pdfdata):
    """Run pdftohtml on page 1 of *pdfdata* and return the XML it writes.

    The PDF bytes and pdftohtml's output both live in a private temporary
    directory that is removed before returning, so concurrent calls cannot
    clobber each other and a leftover file from an earlier run can never be
    mistaken for this run's output.

    Returns the raw contents of the generated ``.xml`` file, or ``None`` if
    pdftohtml could not be started or did not produce the file.
    """
    work_dir = tempfile.mkdtemp()
    try:
        pdf_path = os.path.join(work_dir, 'input.pdf')
        out_base = os.path.join(work_dir, 'pdftoxml')
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(pdfdata)
        # Same flags as before, but passed as an argv list so no shell is
        # involved in parsing the command line.
        command = ['pdftohtml', '-c', '-s', '-i', '-stdout',
                   '-f', '1', '-l', '1', '-xml', pdf_path, out_base]
        # Console output is captured only to keep it off our own stdout.
        subprocess.Popen(command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT).communicate()
        # The result is read from "<out_base>.xml"; if the conversion
        # failed the file is missing and open() raises.
        with open(out_base + '.xml', 'rb') as xml_file:
            return xml_file.read()
    except (OSError, IOError):
        return None
    finally:
        shutil.rmtree(work_dir, ignore_errors=True)


def _leading_font_run(text_nodes):
    """Return the text of the first same-font run longer than 5 bytes.

    Consecutive nodes sharing a ``font`` attribute are concatenated, each
    followed by a single space. When the font changes, the run collected so
    far is returned if it is longer than 5 bytes; otherwise it is discarded
    and collection restarts with the new font. If the nodes run out first,
    whatever has been collected (possibly '') is returned.
    """
    title_text = ''
    last_font = None
    for node in text_nodes:
        font = node.get('font')
        if last_font is not None and font != last_font:
            if len(title_text) > 5:
                break
            # Too short to be a title (e.g. a page number); start over.
            title_text = ''
        # Python 2: getText() is unicode; the result is a UTF-8 byte string.
        title_text += node.getText().encode('utf-8') + ' '
        last_font = font
    return title_text


def extract_pdf_title(pdfdata):
    """Guess the title of a PDF from the text on its first page.

    The first page is converted to XML with the external ``pdftohtml``
    tool, and the title is taken to be the first run of ``<text>`` elements
    that share one font (see ``_leading_font_run``).

    Args:
        pdfdata: the raw bytes of the PDF document.

    Returns:
        The guessed title as a UTF-8 encoded string with a trailing space,
        or '' if the conversion failed or the page has no text. On
        conversion failure 'Error in pdftohtml ' is also written to stdout.
    """
    xml_data = _first_page_xml(pdfdata)
    if xml_data is None:
        sys.stdout.write('Error in pdftohtml \n')
        return ''
    dom = BeautifulStoneSoup(xml_data)
    return _leading_font_run(dom.findAll('text'))
if __name__ == '__main__':
    # Print "<path>  --  <title>" for every PDF path given on the command line.
    for path in sys.argv[1:]:
        # PDFs are binary data: read them in 'rb' mode, and close the handle
        # as soon as the bytes are in memory.
        with open(path, 'rb') as pdf_file:
            pdfdata = pdf_file.read()
        sys.stdout.write('%s  --  %s\n' % (path, extract_pdf_title(pdfdata)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment