Skip to content

Instantly share code, notes, and snippets.

@scionoftech
Last active June 14, 2019 07:39
Show Gist options
  • Save scionoftech/5a635a0fe39aa5e226476545da0f406a to your computer and use it in GitHub Desktop.
Save scionoftech/5a635a0fe39aa5e226476545da0f406a to your computer and use it in GitHub Desktop.
This is a Python script that extracts tables from a PDF and converts them to Excel.
import click
from pathlib import Path
import pdfplumber
import pandas as pd
from pandas import ExcelWriter
def to_excel(path, output_path):
    """Extract the tables of a PDF and write them to a single Excel sheet.

    Each page of the PDF at ``path`` is passed through pdfplumber's
    ``extract_table``; the rows found are stacked in page order and written
    to sheet ``Sheet1`` of ``<output_path>/<name>.xlsx``, where ``<name>`` is
    the input file name up to its first dot.

    Args:
        path: Path of the PDF file to read.
        output_path: Folder that receives the ``.xlsx`` file. It is not
            created here.
    """
    rows = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            # extract_table() yields None for a page without a table; skip
            # such pages instead of failing on ``rows += None``.
            if table:
                rows.extend(table)

    # Same naming rule as before: everything before the first dot.
    stem = Path(path).name.split('.')[0]
    # Build the target with pathlib so the function does not depend on the
    # ``os`` module, which the visible import block never brings in.
    target = Path(output_path) / (stem + '.xlsx')

    frame = pd.DataFrame(rows)
    # The context manager saves and closes the workbook even on error;
    # ``writer.save()`` no longer exists in pandas 2.x.
    with ExcelWriter(str(target)) as writer:
        frame.to_excel(writer, sheet_name='Sheet1', index=False)
# Both options are required: without them ``None`` would be forwarded to
# to_excel() and surface as a TypeError traceback instead of a usage error.
# click.Path validates that the input is an existing file and that the
# output, when it already exists, is not a regular file.
@click.command()
@click.option('--input_path', '-ip', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='input file path')
@click.option('--output_path', '-op', required=True,
              type=click.Path(file_okay=False),
              help='output folder path')
def convert(input_path, output_path):
    """This program generates xlsx files from pdf."""
    # The docstring above doubles as the --help text, so it is kept verbatim.
    to_excel(input_path, output_path)
# Start the command-line interface only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    convert()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment