Skip to content

Instantly share code, notes, and snippets.

@gooooloo
Created February 13, 2021 06:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gooooloo/8ab040720e7d17c04d4c13c2d4a05e06 to your computer and use it in GitHub Desktop.
Save gooooloo/8ab040720e7d17c04d4c13c2d4a05e06 to your computer and use it in GitHub Desktop.
Script to extract the text of a PDF into a text file
#!/usr/bin/python3
import sys
from PDFNetPython3 import *
def main():
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Reads two positional command-line arguments from ``sys.argv``:
    the path of the input PDF and the path of the output text file.
    If fewer than two are supplied, a usage message is printed and the
    function returns without doing anything else.

    For each page (numbered from 1) a ``----- page N -----`` header is
    written, followed by the page's extracted lines and one blank line.
    Progress is reported on standard output.

    Returns:
        None.
    """
    if len(sys.argv) < 3:
        print("Usage: python3 pdf_to_txt.py path_to_pdf path_to_txt")
        return

    infile = sys.argv[1]
    outfile = sys.argv[2]

    PDFNet.Initialize()
    doc = PDFDoc(infile)
    # Close the document even if extraction or writing fails part-way,
    # so the open PDF is not leaked on error.
    try:
        doc.InitSecurityHandler()
        pgcnt = doc.GetPageCount()

        with open(outfile, 'wt', encoding='utf-8') as f:
            for pgidx in range(1, pgcnt + 1):
                print(f'handling page {pgidx}/{pgcnt} ...')

                extractor = TextExtractor()
                extractor.Begin(doc.GetPage(pgidx))
                lines = extractor.GetAsText().split('\n')

                # Drop the first and last line of each page's text.
                # NOTE(review): presumably these are a header/footer or
                # empty lines produced by the extractor - confirm before
                # changing, since real text would be lost otherwise.
                lines = lines[1:-1]

                print(f'----- page {pgidx} -----', file=f)
                for line in lines:
                    print(line, file=f)
                # Blank separator line between pages.
                print('', file=f)

        print(f'saved in {outfile}')
    finally:
        doc.Close()
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment