Skip to content

Instantly share code, notes, and snippets.

@bufke
Last active May 28, 2023 13:31
Show Gist options
  • Save bufke/8798262 to your computer and use it in GitHub Desktop.
Save bufke/8798262 to your computer and use it in GitHub Desktop.
Convert odt, doc, docx, and pdf files to text with Python and some Linux programs. Doesn't require LibreOffice.
from subprocess import Popen, PIPE
from docx import opendocx, getdocumenttext
#http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at *path* using pdfminer.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` that writes into a ``TextConverter`` backed by an
    in-memory ``StringIO`` buffer; the buffer's contents are returned.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The buffer contents produced by the converter (configured with the
        ``utf-8`` codec).

    Raises:
        IOError/OSError: If *path* cannot be opened.
        Any exception raised by pdfminer while parsing propagates unchanged.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(
            rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # ``open`` replaces the Python 2-only ``file`` constructor, and
            # the ``with`` block closes the handle even if parsing raises.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp,
                    set(),          # no explicit page-number filter
                    maxpages=0,     # NOTE(review): 0 presumably means "no limit" in pdfminer -- confirm
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer before it is closed in the outer ``finally``.
        return retstr.getvalue()
    finally:
        retstr.close()
def _run_text_tool(cmd):
    """Run the external converter *cmd* and return its stdout as text."""
    proc = Popen(cmd, stdout=PIPE)
    # stderr is not piped, so the second element of the tuple is always None.
    stdout, _ = proc.communicate()
    # Decode as UTF-8 instead of ASCII so non-ASCII characters survive;
    # undecodable bytes are still dropped, and pure-ASCII output is unchanged.
    return stdout.decode('utf-8', 'ignore')


def document_to_text(filename, file_path):
    """Convert a document to plain text, choosing a converter by extension.

    The extension of *filename* is matched case-insensitively:

    * ``.doc``  -- runs the ``antiword`` program on *file_path*.
    * ``.docx`` -- reads paragraphs via ``opendocx``/``getdocumenttext``.
    * ``.odt``  -- runs the ``odt2txt`` program on *file_path*.
    * ``.pdf``  -- delegates to ``convert_pdf_to_txt``.

    Args:
        filename: Name used only to determine the document type.
        file_path: Path of the file that is actually read.

    Returns:
        The extracted text. For ``.doc`` and ``.odt`` this is the decoded
        program output; for ``.docx`` it is the paragraphs encoded as UTF-8
        and joined by blank lines; for ``.pdf`` it is whatever
        ``convert_pdf_to_txt`` returns. ``None`` is returned when the
        extension is not one of the supported types.

    Raises:
        OSError: If the required external program cannot be started.
    """
    # Compare on the lower-cased name so "REPORT.DOC" is handled like
    # "report.doc" instead of silently falling through to None.
    name = filename.lower()

    if name.endswith(".doc"):
        return _run_text_tool(['antiword', file_path])

    if name.endswith(".docx"):
        document = opendocx(file_path)
        paragraphs = getdocumenttext(document)
        # Bytes separator: the same value as '\n\n' on Python 2, and keeps the
        # join of encoded paragraphs valid where str and bytes differ.
        return b'\n\n'.join(text.encode("utf-8") for text in paragraphs)

    if name.endswith(".odt"):
        return _run_text_tool(['odt2txt', file_path])

    if name.endswith(".pdf"):
        return convert_pdf_to_txt(file_path)

    # Unsupported extension: keep the original implicit result, explicitly.
    return None
@arishtanemi3007
Copy link

Hi. I'm trying to build a module where I can convert PDF to .docx. The PDFs contain images and formatted text. Can you suggest changes to your script that would help me achieve this output?

@ganeshkharad2
Copy link

ganeshkharad2 commented Aug 5, 2020

The .doc conversion is not working;
it returns only an empty string.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment