Skip to content

Instantly share code, notes, and snippets.

@samber
Last active January 5, 2020 03:13
Show Gist options
  • Save samber/936cebc4eedc758d30984ebc8d34cc5f to your computer and use it in GitHub Desktop.
Save samber/936cebc4eedc758d30984ebc8d34cc5f to your computer and use it in GitHub Desktop.
const PDFJS = require('pdfjs-dist');
// Library-wide pdf.js flags, set once before any document is opened.
// NOTE(review): presumably these keep text extraction available and make pdf.js
// parse without spawning a worker — confirm against the installed pdfjs-dist version.
PDFJS.disableTextLayer = false;
PDFJS.disableWorker = true;
/**
 * Load a PDF and collect the text content of a contiguous range of pages.
 *
 * `fromPage` is an offset, not a page number: the first page requested is
 * `fromPage + 1`, so the default of 0 starts at page 1.
 *
 * @param {*} pdfUrl - Document source, handed unchanged to `PDFJS.getDocument`.
 * @param {number} [fromPage=0] - Number of leading pages to skip.
 * @param {number} [nbrPages=-1] - Number of pages to read; -1 means every page
 *   after the skipped ones. Requests running past the end are truncated.
 * @returns {Promise<Array>} Resolves to one `page.getTextContent()` result per
 *   page, in page order (empty when the range contains no page). Rejects if the
 *   document or any page fails to load.
 */
function getText(pdfUrl, fromPage = 0, nbrPages = -1) {
  const loadingTask = PDFJS.getDocument(pdfUrl);
  // Recent pdf.js returns a loading task whose promise lives on `.promise`;
  // older releases returned a thenable directly. Accept both.
  const documentPromise = Promise.resolve(loadingTask.promise || loadingTask);

  return documentPromise
    .then((pdf) => {
      // Use the public page count rather than the private `_pdfInfo` field.
      const remaining = pdf.numPages - fromPage;
      // Work on a local count instead of reassigning the caller's argument.
      const pageCount = nbrPages === -1 ? remaining : Math.min(nbrPages, remaining);

      const pagePromises = [];
      for (let offset = 1; offset <= pageCount; offset++) {
        pagePromises.push(pdf.getPage(offset + fromPage));
      }
      return Promise.all(pagePromises);
    })
    .then((pages) => Promise.all(pages.map((page) => page.getTextContent())));
}
// Command-line demo: load the PDF given as the first argument, skip one page,
// read a single page, and print its text content. Failures go to stderr.
getText(process.argv[2], 1, 1).then(
  (text) => {
    console.log(text[0]);
  },
  (reason) => {
    console.error(reason);
  }
);
import sys

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
# Module-level setup: open the PDF named on the command line and build the
# pdfminer objects (document, resource manager, device, interpreter) that the
# functions below use as globals.

# Fail with a usage message instead of an IndexError when no path is given.
if len(sys.argv) < 2:
    sys.exit("usage: %s <file.pdf>" % sys.argv[0])

# Open a PDF document.
# The handle is deliberately not closed here: `document` is built on top of it
# and is still used by the functions defined further down.
fp = open(sys.argv[1], 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser, "")
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
def get_table_content():
    """Print every entry of the document's outline (table of contents).

    Each entry yielded by ``document.get_outlines()`` is unpacked into five
    fields and printed on its own line. A document that has no outline is
    reported with a short message instead of aborting the script.
    """
    # Local import keeps this function self-contained.
    from pdfminer.pdfdocument import PDFNoOutlines

    try:
        # Get the outlines of the document.
        # The loop stays inside the try block because the outline may be
        # produced lazily, in which case the error surfaces on first iteration.
        outlines = document.get_outlines()
        for (level, title, dest, a, se) in outlines:
            print(level, title, dest, a, se)
    except PDFNoOutlines:
        print("Document has no outlines.")
# Feed the pages of `document` through `interpreter`, fetch the layout result
# from `device`, and print the layout and the elements obtained by iterating it.
# NOTE(review): leading indentation is missing in this copy, so the nesting below
# cannot be read from the text — in particular whether the final separator print
# runs per element, per page, or once at the end. Restore it from the original
# before running.
def process_pages():
for page in PDFPage.create_pages(document):
interpreter.process_page(page)
# receive the LTPage object for the page.
layout = device.get_result()
print(layout)
for element in layout:
print(element)
# Blank-line separator written to stdout.
print("\n\n\n")
# Script entry point: print the outline entries first, then dump the page layouts.
get_table_content()
process_pages()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment