Skip to content

Instantly share code, notes, and snippets.

@giasuddin90
Created October 30, 2019 11:16
Show Gist options
  • Save giasuddin90/a50eb06cde48dc89e593291de1f46639 to your computer and use it in GitHub Desktop.
Save giasuddin90/a50eb06cde48dc89e593291de1f46639 to your computer and use it in GitHub Desktop.
Read text from a PDF file with the Python PyPDF2 library and write it to an .xlsx file with openpyxl.
#-*- coding: utf-8 -*-
import PyPDF2
from openpyxl import Workbook
def pypd(pdf_path='principle of marketing.pdf', xlsx_path='jargon.xlsx',
         pages=range(939, 959)):
    """Extract text from a range of PDF pages and write it to an xlsx file.

    The text of each page is split on '.' into items; every item is then
    split on an em dash and the resulting pieces are appended as one row
    of the workbook's active worksheet. The workbook is saved once, after
    all requested pages have been processed.

    Args:
        pdf_path: Path of the PDF file to read.
        xlsx_path: Path of the xlsx file to write.
        pages: Iterable of page indices handed to ``pdfReader.getPage``
            (PyPDF2 page numbers are zero-based).

    Raises:
        IOError/OSError: If ``pdf_path`` cannot be opened.
        IndexError: If a requested page index is outside the document
            (raised by PyPDF2).
    """
    wb = Workbook()
    ws = wb.active

    # The reader is handed the open file object, so the file has to stay
    # open while pages are being read; ``with`` guarantees it is closed
    # afterwards, even if extraction fails part-way through.
    with open(pdf_path, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        for page in pages:
            pageObj = pdfReader.getPage(page)
            # Keep the extracted text as text instead of encoding it to
            # bytes: splitting bytes with a str separator raises TypeError
            # on Python 3. The u'' prefixes keep the separators text on
            # Python 2 as well.
            page_text = pageObj.extractText().split(u'.')
            print(len(page_text))
            for item in page_text:
                list_item = item.split(u"—")
                ws.append(list_item)
                print(list_item)

    wb.save(xlsx_path)
# Run the extraction with its default paths only when this file is executed
# directly as a script, not when it is imported as a module.
if __name__ == "__main__":
    pypd()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment