Skip to content

Instantly share code, notes, and snippets.

@saveyak
Created September 16, 2020 00:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saveyak/313d91847f4645d5f8769d43b7f848b9 to your computer and use it in GitHub Desktop.
Save saveyak/313d91847f4645d5f8769d43b7f848b9 to your computer and use it in GitHub Desktop.
Using Python's Tika package to read hundreds of pages of PDFs and create a dataframe
Display the source blob
Display the rendered blob
Raw
#In the midst of a two-year-long investigation into restraint and seclusion in private, special education schools, a source gave me a box filled with hundreds of pages of incident reports she had received over the course of five years whenever her children were restrained and secluded. I scanned the pages, converted them to PDF, and wrote this code in order to collect the data into an analyzable file.
#Read the full investigation here -- a partnership between USA TODAY and the Teacher Project at Columbia University: https://www.usatoday.com/story/news/education/2020/07/25/disability-special-education-private-school-restraint/4737971002/
#Below is an example of the code I wrote for the years 2015 and 2016.
#%%cmd
#pip install tika
from tika import parser #allows me to read PDFs and convert them to text strings
import pandas as pd #allows me to build dataframes
import re #allows me to use regular expressions
# Base names (without extension) of the scanned incident-report PDFs.
pdfs = [
    "apr_jun2015", "feb2011_june2013", "jan_jun2014", "jan_mar2015",
    "jan_may2016", "jul_dec2013", "jul_dec2014", "jul_dec2015",
]
# Relative path of each file inside the pdfs/ folder.
new_pdfs = ["pdfs/" + stem + ".pdf" for stem in pdfs]

# Parse each PDF and store the parser's result under the file's base name.
# NOTE: `dict` shadows the builtin type; the name is kept because the later
# sections of this script look the parsed reports up through it.
dict = {}
for stem, pdf_path in zip(pdfs, new_pdfs):
    dict[stem] = parser.from_file(pdf_path)
    print(stem)  # progress marker: this file has been parsed
# The parsed files can be listed with dict.keys().
# The text of one PDF can be shown with print(dict['jan_mar2015']['content']).
# Use regular expressions to find the following data in each report: the name
# of the student, the date of the incident, and the report description.
# Start with January-March 2015.
content = dict['jan_mar2015']['content']
name = re.findall("Student Name: (.+) Gay", content)
# The leading "..." accepts any three characters before "ident Date:"
# (presumably to tolerate OCR noise in the word "Incident" - confirm).
date = re.findall("...ident Date: (.+) \n", content)
# One date is supplied by hand; it is added at the end of the list.
date.append('1/14/2015')
print(len(name))  # 26 names
print(len(date))  # 26 dates
print(name)
print(date)
# re.DOTALL lets the lazy group run across line breaks until the first run of
# four newlines.
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(desc))  # 20 descriptions found - due to inconsistent formatting, this code could not find every description in the reports.
for text in desc:
    print(text)  # print descriptions
# Pad every column with blank rows up to the longest one so zip() below cannot
# silently drop reports (zip stops at the shortest input). The shortfall is
# computed instead of hard-coded, so it stays right if the match counts change.
# The blanks land at the end, not next to the report whose field was missed;
# that mismatch was fixed by hand after exporting the dataframe to Excel,
# checking multiple times against the original copies of the reports.
rows = max(len(name), len(date), len(desc))
for column in (name, date, desc):
    column.extend([""] * (rows - len(column)))
# None means "do not truncate cell text". The old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
df  # bare expression: shows the frame when run as a notebook cell
# April-June 2015: same extraction as the other periods, applied to the
# 'apr_jun2015' file.
content = dict['apr_jun2015']['content']
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
# One name is supplied by hand; it is added at the end of the list.
name.append('Shirley')
print(len(name))  # 74 names
print(len(date))  # 74 dates
print(name)
print(date)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(desc))  # 49 descriptions
for text in desc:
    print(text)
    print("🍌🍌🍌🍌🍌")  # emojis added to clearly demarcate descriptions
# Pad every column with blank rows up to the longest one so zip() below cannot
# silently drop reports. The shortfall is computed rather than hard-coded.
# Blanks are added at the end of each list, so they are not aligned with the
# reports whose field was missed.
rows = max(len(name), len(date), len(desc))
for column in (name, date, desc):
    column.extend([''] * (rows - len(column)))
print(len(desc))  # now there are 74 descriptions so it matches with name and date
df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
pd.options.display.max_rows = 100  # raise the display limit to 100 rows
df  # bare expression: shows the frame when run as a notebook cell
# July-December 2015: same extraction as the other periods, applied to the
# 'jul_dec2015' file.
content = dict['jul_dec2015']['content']
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(name))  # 39 names
print(len(date))  # 38 dates, 1 short of the names
print(len(desc))  # 21 descriptions, 18 short of the names
# Pad every column with blank rows up to the longest one so zip() below cannot
# silently drop reports. The shortfall is computed rather than hard-coded.
# Blanks are added at the end of each list, so they are not aligned with the
# reports whose field was missed.
rows = max(len(name), len(date), len(desc))
for column in (name, date, desc):
    column.extend([''] * (rows - len(column)))
print(len(name))  # 39 rows
print(len(date))  # 39 rows
print(len(desc))  # 39 rows
# Printed after padding, so the blank filler rows are listed as well.
for text in desc:
    print(text)
    print("🍌🍌🍌🍌🍌")  # separator between descriptions
df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
df  # bare expression: shows the frame when run as a notebook cell
# January-May 2016: same extraction as the other periods, applied to the
# 'jan_may2016' file.
content = dict['jan_may2016']['content']
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(name))  # 43
print(len(date))  # 40
print(len(desc))  # 24
# Pad every column with blank rows up to the longest one so zip() below cannot
# silently drop reports. The shortfall is computed rather than hard-coded.
# Blanks are added at the end of each list, so they are not aligned with the
# reports whose field was missed.
rows = max(len(name), len(date), len(desc))
for column in (name, date, desc):
    column.extend([""] * (rows - len(column)))
print(len(name))  # 43
print(len(date))  # 43
print(len(desc))  # 43
df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
df  # bare expression: shows the frame when run as a notebook cell
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment