Created
September 16, 2020 00:28
-
-
Save saveyak/313d91847f4645d5f8769d43b7f848b9 to your computer and use it in GitHub Desktop.
Using Python's Tika package to read hundreds of pages of PDFs and create a dataframe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# In the midst of a two-year-long investigation into restraint and seclusion in private special education schools, a source gave me a box filled with hundreds of pages of incident reports she had received over the course of five years whenever her children were restrained and secluded. I scanned the pages, converted them to PDF, and wrote this code in order to collect the data into an analyzable file.
# Read the full investigation here -- a partnership between USA TODAY and the Teacher Project at Columbia University: https://www.usatoday.com/story/news/education/2020/07/25/disability-special-education-private-school-restraint/4737971002/
# Below is an example of the code I wrote for the years 2015 and 2016.
#%%cmd | |
#pip install tika | |
from tika import parser #allows me to read PDFs and conver them to text strings | |
import pandas as pd #allows me to build dataframes | |
import re #allows me to use regular expressions | |
# Base names (without the ".pdf" extension) of the scanned report files.
pdfs = [
    "apr_jun2015",
    "feb2011_june2013",
    "jan_jun2014",
    "jan_mar2015",
    "jan_may2016",
    "jul_dec2013",
    "jul_dec2014",
    "jul_dec2015",
]
# Relative path of each file inside the "pdfs/" folder, in the same order as `pdfs`.
new_pdfs = [("pdfs/" + s + ".pdf") for s in pdfs]

# Parse each PDF in the list and store the parser's result under the file's base name.
# NOTE: `dict` shadows the Python builtin of the same name. The name is kept
# because the sections below read the parsed text as dict['<name>']['content'].
dict = {}  # create an empty dictionary
for pdf_name, pdf_path in zip(pdfs, new_pdfs):
    dict[pdf_name] = parser.from_file(pdf_path)
    print(pdf_name)  # shows which file was just parsed
# can see the dictionary's keys with dict.keys()
# can see content of the PDF with print(dict['jan_mar2015']['content'])
# Use regular expressions to find the following data from each report: the
# name of the student, the date of the incident, and the report description.
# Start with January-March 2015.
content = dict['jan_mar2015']['content']

# The leading "..." in the date pattern matches any three characters before
# "ident Date:" (presumably to tolerate OCR noise in "Incident" -- TODO confirm).
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
# One date is entered by hand; after this both lists hold 26 entries.
date.append('1/14/2015')
print(len(name))  # 26 names
print(len(date))  # 26 dates
print(name)
print(date)

# re.DOTALL lets "." cross line breaks, so each description runs from the
# label up to the first run of four consecutive newlines.
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(desc))  # 20 descriptions found - due to inconsistent formatting, this code could not find every description in the reports.
for description in desc:
    print(description)  # print descriptions

# I added rows to desc so it has the same number of rows as name and date.
# I then fixed the mismatch after I exported the dataframe to Excel, checking
# multiple times against the original copies of the reports.
desc.extend([""] * 6)

# None means "never truncate cell text". The old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
df
# April-June 2015
content = dict['apr_jun2015']['content']

# Same patterns as the other reporting periods: student name, incident date.
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
# One name is entered by hand; after this both lists hold 74 entries.
name.append('Shirley')
print(len(name))  # 74 names
print(len(date))  # 74 dates
print(name)
print(date)

# re.DOTALL lets "." cross line breaks, so each description runs from the
# label up to the first run of four consecutive newlines.
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(desc))  # 49 descriptions
for description in desc:
    print(description)
    print("πππππ")  # emojis added to clearly demarcate descriptions

# Pad with 25 empty descriptions so desc lines up row-for-row with name and date.
desc.extend([''] * 25)
print(len(desc))  # now there are 74 descriptions so it matches with name and date

df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
pd.options.display.max_rows = 100  # show up to 100 rows when the frame is displayed
df
# July-December 2015
content = dict['jul_dec2015']['content']

# Pull the three fields out of the parsed text. re.DOTALL lets the
# description span several lines, up to the first run of four newlines.
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)
print(len(name))  # 39 names
print(len(date))  # 38 dates, need to append 1 row
print(len(desc))  # 21 descriptions, need to append 18 rows

# Pad the shorter lists with empty strings so all three have 39 rows;
# zip() below would otherwise stop at the shortest list.
date.append("")
desc.extend([''] * 18)
print(len(name))  # 39 rows
print(len(date))  # 39 rows
print(len(desc))  # 39 rows

for description in desc:
    print(description)
    print("πππππ")  # separator printed after every description

df = pd.DataFrame(list(zip(name, date, desc)), columns=['name', 'date', 'desc'])
df
# January-May 2016
report_text = dict['jan_may2016']['content']

# Extract student names, incident dates and incident descriptions.
name = re.findall("Student Name: (.+) Gay", report_text)
date = re.findall("...ident Date: (.+) \n", report_text)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", report_text, re.DOTALL)
print(len(name))  # 43
print(len(date))  # 40
print(len(desc))  # 24

# Top up the shorter lists with blanks so every list has 43 entries.
date += [""] * 3
desc += [""] * 19
print(len(name))  # 43
print(len(date))  # 43
print(len(desc))  # 43

# One row per report: (name, date, desc).
rows = list(zip(name, date, desc))
df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment