Skip to content

Instantly share code, notes, and snippets.

@petrinkae
Last active September 21, 2023 20:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petrinkae/5e53d080088042f97bbd2dbf7518adae to your computer and use it in GitHub Desktop.
Framework for basic Python table extraction from multiple PDFs, using pandas, os, and pdfplumber
#!/usr/bin/env python
# coding: utf-8
# In[18]:
import pdfplumber as pp
import pandas as pd
import os
import numpy as np
import fnmatch
import glob
# In[19]:
# Directory scanned for input PDFs (relative to the working directory).
pdf_dir = 'pdfs/'
# Directory where the per-dataset CSV extracts are written.
output_dir = 'extracts/'
# In[20]:
# Sanity check: report how many PDFs are present before scraping.
# Fixed NameError: the constant defined above is `pdf_dir`, not `pdf_directory`.
print(len(fnmatch.filter(os.listdir(pdf_dir), '*.pdf')))
# In[23]:
# Wildcard match list: each keyword defines one PDF set and one output CSV.
dataset = [
    'Hires',
    'Resignations',
    'Firings',
]
# In[28]:
# For each dataset keyword: find matching PDFs, extract every table with
# pdfplumber, tag each row with its source filename, and accumulate the rows
# into one CSV per dataset.
for data in dataset:
    # NOTE(review): recursive=True only matters for '**' patterns; this
    # pattern has none, so only pdf_dir itself is searched — confirm intent.
    all_files = glob.glob(pdf_dir + '*' + data + '*.pdf', recursive=True)
    print(all_files)
    file_cnt = 1  # used to log progress
    for file in all_files:
        # Split off the extension so non-PDF matches can be skipped.
        f = os.path.join(file)
        root, ext = os.path.splitext(f)
        # Path without the '.pdf' extension; used as a per-file ID column.
        file_name = root
        # extract data from valid PDFs
        if ext == '.pdf':
            # Context manager guarantees the PDF handle is closed even if
            # extraction raises (the original leaked the open handle).
            with pp.open(f) as pdf:
                for page in pdf.pages:
                    for table in page.extract_tables():
                        # Prepend the source filename to every row (header
                        # row included) so rows trace back to their PDF.
                        for row in table:
                            row.insert(0, file_name)
                        # One CSV per dataset keyword, shared across PDFs.
                        csv_name = output_dir + data
                        # Skip table[0] in the data: it already supplies the
                        # column names, and including it (as the original did
                        # with table[:]) wrote the header out as a data row.
                        df = pd.DataFrame(table[1:], columns=table[0])
                        # !!!!!! remember to delete old files from output
                        # folder/change output directory, or this existence
                        # check will append rows to your previous scrape.
                        if os.path.exists(csv_name + '.csv'):
                            # Existing file: append rows, suppress the header.
                            df.to_csv(csv_name + '.csv', mode='a',
                                      index=False, header=False)
                        else:
                            # New file: create the CSV with a header row.
                            df.to_csv(csv_name + '.csv', index=False)
        # log out progress
        print(str(file_cnt) + " PDF has been scraped")
        print(f)
        file_cnt += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment