# prcastro/coord.py

## coord.py
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer

def main():
    """Prompt for a PDF base name and dump word coordinates to a text file.

    Reads ``name + ".pdf"``, runs pdfminer layout analysis on every page and
    writes one line per row returned by ``words_coord`` to
    ``name + "_coord.txt"``.  Every field of a row is UTF-8 encoded and
    followed by a single space.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    name = raw_input("Name of pdf document: ")

    # Both files are handled by ``with`` so they are closed even when parsing
    # fails part-way through (the PDF handle used to be left open).
    with open(name + ".pdf", 'rb') as fp:
        # Parser bound to the file, and the document structure built on it.
        # A password could be passed to PDFDocument as 2nd parameter.
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager shared by the device and the interpreter.
        rsrcmgr = PDFResourceManager()

        # Layout analysis: the aggregator is the only device needed, so the
        # plain PDFDevice that used to be created first has been dropped.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The output file is only created once the document is known to be
        # extractable, as before.
        with open(name + "_coord.txt", 'w') as file_w:
            for page in PDFPage.create_pages(document):
                # Read the page into a layout object.
                interpreter.process_page(page)
                layout = device.get_result()

                # One output line per word row.
                for row in words_coord(parse_obj(layout._objs)):
                    line = "".join(
                        field.encode('utf-8') + " " for field in row)
                    file_w.write(line + '\n')


def parse_obj(lt_objs):
    """Collect the characters of every horizontal text box in *lt_objs*.

    Args:
        lt_objs: iterable of pdfminer layout objects.

    Returns:
        A list of ``[coord, text]`` pairs, one per item of each horizontal
        text line, in iteration order.  ``text`` is ``char.get_text()``.
        ``coord`` is the part of ``repr(char)`` after its first 8 characters
        and before the next space, or ``""`` when there is no such space.
        NOTE(review): for an LTChar this is presumably the bounding box
        ``x0,y0,x1,y1`` -- confirm against the installed pdfminer version.

    The list is accumulated over all text boxes and over nested figures;
    previously only the last text box survived, figure contents were thrown
    away, and input without any text box raised UnboundLocalError.
    """
    words = []
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            for line in obj:
                if not isinstance(line, pdfminer.layout.LTTextLineHorizontal):
                    continue
                for char in line:
                    # Drop the 8-character repr prefix, then keep what
                    # precedes the first space.
                    tail = repr(char)[8:]
                    head, sep, _ = tail.partition(" ")
                    coord = head if sep else ""
                    words.append([coord, char.get_text()])
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # A figure is a container: recurse and keep what it yields.
            words.extend(parse_obj(obj._objs))
    return words

def get_ymax(coord):
    """Return the maximum y-coordinate of a character.

    *coord* is a comma-separated coordinate string; the result is everything
    that follows its third comma, or ``""`` when *coord* contains fewer than
    three commas.
    """
    fields = coord.split(",", 3)
    if len(fields) < 4:
        return ""
    return fields[3]

def get_ymin(coord):
    """Return the minimum y-coordinate of a character.

    *coord* is a comma-separated coordinate string; the minimum y is its
    second field.

    Returns:
        The text between the first and second comma (or up to the end of the
        string when there is no second comma), or ``""`` when *coord* has no
        comma at all.  The earlier hand-written scan cut off the last
        character whenever the second comma was missing.
    """
    fields = coord.split(",", 2)
    if len(fields) < 2:
        return ""
    return fields[1]


def get_xmin(coord):
    """Return the minimum x-coordinate of a character.

    *coord* is a comma-separated coordinate string; the minimum x is its
    first field.

    Returns:
        The text before the first comma, or all of *coord* when it contains
        no comma.  The earlier hand-written scan dropped the last character
        in the no-comma case.
    """
    return coord.split(",", 1)[0]

def get_xmax(coord):
    """Return the maximum x-coordinate of a character.

    *coord* is a comma-separated coordinate string; the maximum x is its
    third field.

    Returns:
        The text between the second and third comma (or up to the end of the
        string when there is no third comma), or ``""`` when *coord* has
        fewer than two commas.  The earlier hand-written scan truncated the
        field, or returned the start of the string, when commas were missing.
    """
    fields = coord.split(",", 3)
    if len(fields) < 3:
        return ""
    return fields[2]

def max(y):
    """Return the entry of *y* with the largest numeric value.

    Used for the maximum y coordinate of a word, where the entries are
    coordinate strings such as ``"721.534"``.  Entries are compared as
    floats; comparing the raw strings ranked ``"99.5"`` above ``"721.534"``
    and fails outright against the integer start value on Python 3.

    Args:
        y: sequence of numbers or numeric strings.

    Returns:
        The original (unconverted) entry with the greatest value; the first
        one wins on ties.  Entries that cannot be converted to ``float`` are
        ignored.  ``0`` is returned when no entry is usable, which keeps the
        old result for an empty sequence.

    NOTE: this name shadows the builtin ``max``; it is kept because
    ``words_coord`` calls it.
    """
    best = 0
    best_value = None
    for item in y:
        try:
            value = float(item)
        except (TypeError, ValueError):
            # Not a number (e.g. an empty coordinate field): skip it.
            continue
        if best_value is None or value > best_value:
            best = item
            best_value = value
    return best

def words_coord(words):
    """Group characters into words and attach each word's coordinates.

    Args:
        words: sequence of ``[coord, text]`` entries, one per character, as
            produced by ``parse_obj``.

    Returns:
        A list of rows ``[word, xmin, ymin, xmax, ymax]`` where ``xmin`` comes
        from the word's first character, ``ymin`` and ``xmax`` from its last
        character, and ``ymax`` is ``max`` over all of its characters.

    A word ends at any separator character.  A word still open when the input
    ends is emitted as well; it used to be silently dropped.
    """
    # NOTE(review): " -" is two characters and so never equals a single
    # character; possibly "-" was intended.  Left unchanged on purpose.
    separators = (" ", "\n", '!', "?", '.', ",", "(", ")", ":", " -", ";")

    words_fin = []
    word = ""
    x = None
    y = []
    last_coord = None  # coordinates of the most recent word character

    for entry in words:
        coord = entry[0]
        text = entry[1]
        if text not in separators:
            if word == "":
                # First character of a new word.
                x = get_xmin(coord)
                y = []
            y.append(get_ymax(coord))
            word += text
            last_coord = coord
        elif word != "":
            words_fin.append(
                [word, x, get_ymin(last_coord), get_xmax(last_coord), max(y)])
            word = ""

    # Flush a word that runs up to the very end of the input.
    if word != "":
        words_fin.append(
            [word, x, get_ymin(last_coord), get_xmax(last_coord), max(y)])

    return words_fin

# Run the extractor only when executed as a script, so that importing this
# module does not prompt for input.
if __name__ == "__main__":
    main()
	from pdfminer.pdfparser import PDFParser
	from pdfminer.pdfdocument import PDFDocument
	from pdfminer.pdfpage import PDFPage
	from pdfminer.pdfpage import PDFTextExtractionNotAllowed
	from pdfminer.pdfinterp import PDFResourceManager
	from pdfminer.pdfinterp import PDFPageInterpreter
	from pdfminer.pdfdevice import PDFDevice
	from pdfminer.layout import LAParams
	from pdfminer.converter import PDFPageAggregator
	import pdfminer

	def main():
		"""Prompt for a PDF base name and dump word coordinates to a text file.

		Reads ``name + ".pdf"``, runs pdfminer layout analysis on every page
		and writes one line per row returned by ``words_coord`` to
		``name + "_coord.txt"``.  Every field of a row is UTF-8 encoded and
		followed by a single space.

		Raises:
			PDFTextExtractionNotAllowed: if the document forbids text
				extraction.
		"""
		name = raw_input("Name of pdf document: ")

		# Both files are handled by ``with`` so they are closed even when
		# parsing fails part-way through.
		with open(name + ".pdf", 'rb') as fp:
			# Parser bound to the file, and the document structure built on it.
			# A password could be passed to PDFDocument as 2nd parameter.
			parser = PDFParser(fp)
			document = PDFDocument(parser)

			# Check if the document allows text extraction. If not, abort.
			if not document.is_extractable:
				raise PDFTextExtractionNotAllowed

			# Resource manager shared by the device and the interpreter.
			rsrcmgr = PDFResourceManager()

			# Layout analysis: the aggregator is the only device needed, so
			# the plain PDFDevice that used to be created first is dropped.
			laparams = LAParams()
			device = PDFPageAggregator(rsrcmgr, laparams=laparams)
			interpreter = PDFPageInterpreter(rsrcmgr, device)

			# The output file is only created once the document is known to
			# be extractable.
			with open(name + "_coord.txt", 'w') as file_w:
				for page in PDFPage.create_pages(document):
					# Read the page into a layout object.
					interpreter.process_page(page)
					layout = device.get_result()

					# One output line per word row.
					for row in words_coord(parse_obj(layout._objs)):
						line = "".join(
							field.encode('utf-8') + " " for field in row)
						file_w.write(line + '\n')



	def parse_obj(lt_objs):
		"""Collect the characters of every horizontal text box in *lt_objs*.

		Args:
			lt_objs: iterable of pdfminer layout objects.

		Returns:
			A list of ``[coord, text]`` pairs, one per item of each horizontal
			text line, in iteration order.  ``text`` is ``char.get_text()``.
			``coord`` is the part of ``repr(char)`` after its first 8
			characters and before the next space, or ``""`` when there is no
			such space.
			NOTE(review): for an LTChar this is presumably the bounding box
			``x0,y0,x1,y1`` -- confirm against the installed pdfminer version.

		The list is accumulated over all text boxes and over nested figures,
		and is empty (instead of raising) when there is no text box.
		"""
		words = []
		for obj in lt_objs:
			if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
				for line in obj:
					if not isinstance(line, pdfminer.layout.LTTextLineHorizontal):
						continue
					for char in line:
						# Drop the 8-character repr prefix, then keep what
						# precedes the first space.
						tail = repr(char)[8:]
						head, sep, _ = tail.partition(" ")
						coord = head if sep else ""
						words.append([coord, char.get_text()])
			elif isinstance(obj, pdfminer.layout.LTFigure):
				# A figure is a container: recurse and keep what it yields.
				words.extend(parse_obj(obj._objs))
		return words

	def get_ymax(coord):
		"""Return the maximum y-coordinate of a character.

		*coord* is a comma-separated coordinate string; the result is
		everything that follows its third comma, or ``""`` when *coord*
		contains fewer than three commas.
		"""
		fields = coord.split(",", 3)
		if len(fields) < 4:
			return ""
		return fields[3]

	def get_ymin(coord):
		"""Return the minimum y-coordinate of a character.

		*coord* is a comma-separated coordinate string; the minimum y is its
		second field.

		Returns:
			The text between the first and second comma (or up to the end of
			the string when there is no second comma), or ``""`` when *coord*
			has no comma at all.
		"""
		fields = coord.split(",", 2)
		if len(fields) < 2:
			return ""
		return fields[1]


	def get_xmin(coord):
		"""Return the minimum x-coordinate of a character.

		*coord* is a comma-separated coordinate string; the minimum x is its
		first field.

		Returns:
			The text before the first comma, or all of *coord* when it
			contains no comma.
		"""
		return coord.split(",", 1)[0]

	def get_xmax(coord):
		"""Return the maximum x-coordinate of a character.

		*coord* is a comma-separated coordinate string; the maximum x is its
		third field.

		Returns:
			The text between the second and third comma (or up to the end of
			the string when there is no third comma), or ``""`` when *coord*
			has fewer than two commas.
		"""
		fields = coord.split(",", 3)
		if len(fields) < 3:
			return ""
		return fields[2]

	def max(y):
		"""Return the entry of *y* with the largest numeric value.

		Used for the maximum y coordinate of a word, where the entries are
		coordinate strings such as ``"721.534"``.  Entries are compared as
		floats rather than as raw strings.

		Args:
			y: sequence of numbers or numeric strings.

		Returns:
			The original (unconverted) entry with the greatest value; the
			first one wins on ties.  Entries that cannot be converted to
			``float`` are ignored.  ``0`` is returned when no entry is usable.

		NOTE: this name shadows the builtin ``max``; it is kept because
		``words_coord`` calls it.
		"""
		best = 0
		best_value = None
		for item in y:
			try:
				value = float(item)
			except (TypeError, ValueError):
				# Not a number (e.g. an empty coordinate field): skip it.
				continue
			if best_value is None or value > best_value:
				best = item
				best_value = value
		return best

	def words_coord(words):
		"""Group characters into words and attach each word's coordinates.

		Args:
			words: sequence of ``[coord, text]`` entries, one per character,
				as produced by ``parse_obj``.

		Returns:
			A list of rows ``[word, xmin, ymin, xmax, ymax]`` where ``xmin``
			comes from the word's first character, ``ymin`` and ``xmax`` from
			its last character, and ``ymax`` is ``max`` over all of its
			characters.

		A word ends at any separator character.  A word still open when the
		input ends is emitted as well.
		"""
		# NOTE(review): " -" is two characters and so never equals a single
		# character; possibly "-" was intended.  Left unchanged on purpose.
		separators = (" ", "\n", '!', "?", '.', ",", "(", ")", ":", " -", ";")

		words_fin = []
		word = ""
		x = None
		y = []
		last_coord = None  # coordinates of the most recent word character

		for entry in words:
			coord = entry[0]
			text = entry[1]
			if text not in separators:
				if word == "":
					# First character of a new word.
					x = get_xmin(coord)
					y = []
				y.append(get_ymax(coord))
				word += text
				last_coord = coord
			elif word != "":
				words_fin.append(
					[word, x, get_ymin(last_coord), get_xmax(last_coord), max(y)])
				word = ""

		# Flush a word that runs up to the very end of the input.
		if word != "":
			words_fin.append(
				[word, x, get_ymin(last_coord), get_xmax(last_coord), max(y)])

		return words_fin

	# Run the extractor only when executed as a script, so that importing
	# this module does not prompt for input.
	if __name__ == "__main__":
		main()