Skip to content

Instantly share code, notes, and snippets.

@prcastro
Created November 23, 2015 21:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save prcastro/914b384703bd941319bb to your computer and use it in GitHub Desktop.
Save prcastro/914b384703bd941319bb to your computer and use it in GitHub Desktop.
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
def main():
    """Extract per-word coordinates from a PDF into a text file.

    Prompts for a document name (without the ``.pdf`` suffix), runs pdfminer
    layout analysis on every page of ``<name>.pdf`` and writes one line per
    word to ``<name>_coord.txt``. Each line contains the fields of one
    ``words_coord`` row, every field followed by a single space.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    import io  # local import: only needed for the UTF-8 text output below

    # ``raw_input`` exists only on Python 2; fall back to ``input`` otherwise.
    try:
        prompt = raw_input
    except NameError:
        prompt = input
    name = prompt("Name of pdf document: ")

    # ``with`` guarantees both files are closed even if parsing fails midway.
    with open(name + ".pdf", 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # A password for initialization can be given as the 2nd parameter.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout analysis: the aggregator device collects each page's layout.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        with io.open(name + "_coord.txt", 'w', encoding='utf-8') as file_w:
            # Loop over all pages in the document.
            for page in PDFPage.create_pages(document):
                # Read the page into a layout object.
                interpreter.process_page(page)
                layout = device.get_result()
                # Extract the words (with coordinates) from this layout.
                for row in words_coord(parse_obj(layout._objs)):
                    pieces = []
                    for field in row:
                        # Byte strings (Python 2 ``str``) are decoded so the
                        # whole line can be written as text.
                        if isinstance(field, bytes):
                            field = field.decode('utf-8')
                        pieces.append(field + u" ")
                    file_w.write(u"".join(pieces) + u"\n")
def parse_obj(lt_objs):
    """Collect every character found in the horizontal text boxes of *lt_objs*.

    Text boxes are scanned line by line; figures are searched recursively and
    their characters are appended to the same result list.

    Args:
        lt_objs: iterable of pdfminer layout objects.

    Returns:
        A list of ``[coord, text]`` pairs, one per character, in iteration
        order. ``coord`` is the part of the character's ``repr`` between the
        8-character prefix and the first space (or ``""`` if there is no
        space); ``text`` is ``char.get_text()``. Returns ``[]`` when no text
        box is found.
    """
    words = []
    # loop over the object list
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            for line in obj:
                if not isinstance(line, pdfminer.layout.LTTextLineHorizontal):
                    continue
                for char in line:
                    # NOTE(review): relies on pdfminer's repr layout, presumably
                    # "<LTChar x0,y0,x1,y1 ...>" - confirm against the installed
                    # pdfminer version.
                    head, space, _ = repr(char)[8:].partition(" ")
                    words.append([head if space else "", char.get_text()])
        # if it's a container, recurse and keep what the recursion found
        elif isinstance(obj, pdfminer.layout.LTFigure):
            words.extend(parse_obj(obj._objs))
    return words
def get_ymax(coord):
    """Return the maximum y-coordinate of a character.

    Args:
        coord: comma-separated coordinate string; the fourth field is taken
            as the maximum y-coordinate.

    Returns:
        The fourth field as a string, or ``""`` when *coord* has fewer than
        four fields.
    """
    fields = coord.split(",")
    return fields[3] if len(fields) > 3 else ""
def get_ymin(coord):
    """Return the minimum y-coordinate of a character.

    Args:
        coord: comma-separated coordinate string; the second field is taken
            as the minimum y-coordinate.

    Returns:
        The second field as a string, or ``""`` when *coord* has fewer than
        two fields.
    """
    fields = coord.split(",")
    return fields[1] if len(fields) > 1 else ""
def get_xmin(coord):
    """Return the minimum x-coordinate of a character.

    Args:
        coord: comma-separated coordinate string; the first field is taken
            as the minimum x-coordinate.

    Returns:
        The first field as a string (the whole string when it contains no
        comma, ``""`` for an empty string).
    """
    return coord.split(",")[0]
def get_xmax(coord):
    """Return the maximum x-coordinate of a character.

    Args:
        coord: comma-separated coordinate string; the third field is taken
            as the maximum x-coordinate.

    Returns:
        The third field as a string, or ``""`` when *coord* has fewer than
        three fields.
    """
    fields = coord.split(",")
    return fields[2] if len(fields) > 2 else ""
def max(y):
    """Return the element of *y* with the largest numeric value.

    Used for the maximum y-coordinate of a word. Elements are compared by
    their ``float`` value, so coordinate strings such as ``"9.500"`` and
    ``"10.250"`` are ordered numerically rather than alphabetically. The
    element itself (not its float value) is returned, and the first one wins
    on ties.

    NOTE: the name shadows the ``max`` builtin for the whole module; it is
    kept because other code in this file calls it under this name.

    Args:
        y: sequence of numbers or numeric strings.

    Returns:
        The largest element, or ``0`` when *y* is empty.

    Raises:
        ValueError: if an element cannot be converted to ``float``.
    """
    largest = 0
    largest_value = None
    for item in y:
        value = float(item)
        if largest_value is None or value > largest_value:
            largest = item
            largest_value = value
    return largest
def _word_row(word, xmin, last_coord, ymaxes):
    # Build one output row for a finished word from its collected coordinates.
    return [word, xmin, get_ymin(last_coord), get_xmax(last_coord), max(ymaxes)]


def words_coord(words):
    """Group characters into words and attach each word's coordinates.

    Characters are concatenated until a separator (space, newline or one of
    the punctuation strings below) is met; separators themselves are dropped.
    A word still open when the input ends is emitted as well.

    Args:
        words: list of ``[coord, text]`` pairs, one per character, where
            ``coord`` is a comma-separated coordinate string.

    Returns:
        A list of rows ``[word, xmin, ymin, xmax, ymax]`` where ``xmin`` comes
        from the word's first character, ``ymin`` and ``xmax`` from its last
        character, and ``ymax`` is the largest ``get_ymax`` value over all of
        its characters.
    """
    separators = (" ", "\n", '!', "?", '.', ",", "(", ")", ":", " -", ";")
    words_fin = []
    word = ""
    xmin = None
    ymaxes = []
    last_coord = None
    for entry in words:
        coord, text = entry[0], entry[1]
        if text not in separators:
            if word == "":
                # First character of a new word: restart the coordinate state.
                xmin = get_xmin(coord)
                ymaxes = []
            ymaxes.append(get_ymax(coord))
            word += text
            last_coord = coord
        else:
            if word != "":
                words_fin.append(_word_row(word, xmin, last_coord, ymaxes))
            word = ""
    # Flush a trailing word that was not followed by any separator.
    if word != "":
        words_fin.append(_word_row(word, xmin, last_coord, ymaxes))
    return words_fin
# Run the extraction as soon as this file is executed. There is no
# ``__name__`` guard, so importing this module also triggers the prompt.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment