Skip to content

Instantly share code, notes, and snippets.

@petrinkae
Last active September 21, 2023 20:56
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save petrinkae/5e53d080088042f97bbd2dbf7518adae to your computer and use it in GitHub Desktop.
Framework for basic Python table extraction from multiple PDFs, using pandas, os, and pdfplumber
#!/usr/bin/env python
# coding: utf-8
# In[18]:
import pdfplumber as pp
import pandas as pd
import os
import numpy as np
import fnmatch
import glob
# In[19]:
# Directory scanned for input PDFs (relative to the working directory).
pdf_dir = 'pdfs/'
# Directory where the per-dataset CSV extracts are written.
output_dir = 'extracts/'
# In[20]:
# Sanity check: report how many PDFs are present before scraping.
# Fixed NameError: the constant defined above is `pdf_dir`, not `pdf_directory`.
print(len(fnmatch.filter(os.listdir(pdf_dir), '*.pdf')))
# In[23]:
# Wildcard match list: each keyword defines one PDF set and one output CSV.
dataset = [
    'Hires',
    'Resignations',
    'Firings',
]
# In[28]:
# For each dataset keyword: find matching PDFs, extract every table with
# pdfplumber, tag each row with its source filename, and accumulate the rows
# into one CSV per dataset.
for data in dataset:
    # NOTE(review): recursive=True only matters for '**' patterns; this
    # pattern has none, so only pdf_dir itself is searched — confirm intent.
    all_files = glob.glob(pdf_dir + '*' + data + '*.pdf', recursive=True)
    print(all_files)
    file_cnt = 1  # used to log progress
    for file in all_files:
        # Split off the extension so non-PDF matches can be skipped.
        f = os.path.join(file)
        root, ext = os.path.splitext(f)
        # Path without the '.pdf' extension; used as a per-file ID column.
        file_name = root
        # extract data from valid PDFs
        if ext == '.pdf':
            # Context manager guarantees the PDF handle is closed even if
            # extraction raises (the original leaked the open handle).
            with pp.open(f) as pdf:
                for page in pdf.pages:
                    for table in page.extract_tables():
                        # Prepend the source filename to every row (header
                        # row included) so rows trace back to their PDF.
                        for row in table:
                            row.insert(0, file_name)
                        # One CSV per dataset keyword, shared across PDFs.
                        csv_name = output_dir + data
                        # Skip table[0] in the data: it already supplies the
                        # column names, and including it (as the original did
                        # with table[:]) wrote the header out as a data row.
                        df = pd.DataFrame(table[1:], columns=table[0])
                        # !!!!!! remember to delete old files from output
                        # folder/change output directory, or this existence
                        # check will append rows to your previous scrape.
                        if os.path.exists(csv_name + '.csv'):
                            # Existing file: append rows, suppress the header.
                            df.to_csv(csv_name + '.csv', mode='a',
                                      index=False, header=False)
                        else:
                            # New file: create the CSV with a header row.
                            df.to_csv(csv_name + '.csv', index=False)
        # log out progress
        print(str(file_cnt) + " PDF has been scraped")
        print(f)
        file_cnt += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment