# saveyak/restraint_scrape.ipynb

## restraint_scrape.ipynb
#In the midst of a two-year-long investigation into restraint and seclusion in private, special education schools, a source gave me a box filled with hundreds of pages of incident reports she had received over the course of five years whenever her children were restrained and secluded. I scanned the pages, converted them to PDF, and wrote this code in order to collect the data into an analyzable file.
#Read the full investigation here -- a partnership between USA TODAY and the Teacher Project at Columbia University: https://www.usatoday.com/story/news/education/2020/07/25/disability-special-education-private-school-restraint/4737971002/
#Below is an example of the code I wrote for the years 2015 and 2016.

#%%cmd
#pip install tika

from tika import parser #allows me to read PDFs and convert them to text strings
import pandas as pd #allows me to build dataframes
import re #allows me to use regular expressions

# Base names (without the .pdf extension) of the scanned incident-report files.
pdfs = [
    "apr_jun2015",
    "feb2011_june2013",
    "jan_jun2014",
    "jan_mar2015",
    "jan_may2016",
    "jul_dec2013",
    "jul_dec2014",
    "jul_dec2015",
]

# Path of every file inside the pdfs/ folder, in the same order as `pdfs`.
new_pdfs = ["pdfs/" + stem + ".pdf" for stem in pdfs]

# Run each PDF through tika's parser and store the result under the file's
# base name.
# NOTE: `dict` shadows the builtin type; the name is kept because the rest of
# the script looks the parsed results up through it.
dict = {}

for stem, pdf_path in zip(pdfs, new_pdfs):
    dict[stem] = parser.from_file(pdf_path)
    print(stem)  # show which file has just been parsed

#can see the dictionary's keys with dict.keys()
#can see content of the PDF with print(dict['jan_mar2015']['content'])

#Use regular expressions to find the following data from each report: the name of the student, the date of the incident, and the report description.

#Start with January-March 2015

# Text that tika extracted from the January-March 2015 PDF.
content = dict['jan_mar2015']['content']

# Student name: whatever sits between "Student Name: " and " Gay" on one line.
name = re.findall("Student Name: (.+) Gay", content)

# Incident date: the rest of the line after "...ident Date: ". The three
# leading wildcards accept any characters in front of "ident" (presumably to
# tolerate OCR errors in the word "Incident" -- confirm against the scans).
date = re.findall("...ident Date: (.+) \n", content)
date.append('1/14/2015')  # one date added by hand (presumably one the pattern did not capture)

print(len(name))  # 26 names
print(len(date))  # 26 dates
print(name)
print(date)

# Description: everything between the "Description of Incident:" label and
# the next run of four newlines (DOTALL lets "." cross line breaks).
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

print(len(desc))  # 20 descriptions found - due to inconsistent formatting, this code could not find every description in the reports.

for text in desc:
    print(text)  # print descriptions

# Pad desc with six blank rows so it has as many rows as name and date
# (zip below stops at the shortest list). The resulting mismatch between
# descriptions and rows was fixed by hand after exporting the dataframe to
# Excel, checking multiple times against the original copies of the reports.
desc.extend([""] * 6)

# Show full cell contents instead of truncating long descriptions. None is the
# documented "no limit" value; the former -1 is deprecated since pandas 1.0
# and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
rows = list(zip(name, date, desc))
df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
df

#April-June 2015

# Text that tika extracted from the April-June 2015 PDF.
content = dict['apr_jun2015']['content']

# Same patterns as for the other periods: name between "Student Name: " and
# " Gay"; date after "...ident Date: " up to the trailing space and newline.
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)

name.append('Shirley')  # one name added by hand

print(len(name))  # 74 names
print(len(date))  # 74 dates
print(name)
print(date)

# Description: text between the label and the next run of four newlines.
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

print(len(desc))  # 49 descriptions
for text in desc:
    print(text)
    print("🍌🍌🍌🍌🍌")  # emojis added to clearly demarcate descriptions

# Pad with 25 blanks so desc lines up with name and date.
desc.extend([''] * 25)

print(len(desc))  # now there are 74 descriptions so it matches with name and date

rows = list(zip(name, date, desc))
df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
pd.options.display.max_rows = 100  # let the notebook show every row
df

#July-December 2015

# Text that tika extracted from the July-December 2015 PDF.
content = dict['jul_dec2015']['content']

# Name, date and description are pulled with the same three patterns used for
# the other periods.
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

# Counts before padding: 39 names, 38 dates (need to append 1 row),
# 21 descriptions (need to append 18 rows).
for column in (name, date, desc):
    print(len(column))

date.append("")
desc.extend([''] * 18)

# Counts after padding: 39 rows each.
for column in (name, date, desc):
    print(len(column))

# Print every description (including the blank padding rows), each followed
# by an emoji separator line.
for text in desc:
    print(text)
    print("🍌🍌🍌🍌🍌")

rows = list(zip(name, date, desc))
df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
df

#January-May 2016

# Text that tika extracted from the January-May 2016 PDF.
content = dict['jan_may2016']['content']

# Name, date and description are pulled with the same three patterns used for
# the other periods.
name = re.findall("Student Name: (.+) Gay", content)
date = re.findall("...ident Date: (.+) \n", content)
desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

# Counts before padding: 43 names, 40 dates, 24 descriptions.
for column in (name, date, desc):
    print(len(column))

# Pad the shorter lists with blanks so all three have 43 rows.
date.extend([""] * 3)
desc.extend([""] * 19)

# Counts after padding: 43 each.
for column in (name, date, desc):
    print(len(column))

rows = list(zip(name, date, desc))
df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
df
	#In the midst of a two-year-long investigation into restraint and seclusion in private, special education schools, a source gave me a box filled with hundreds of pages of incident reports she had received over the course of five years whenever her children were restrained and secluded. I scanned the pages, converted them to PDF, and wrote this code in order to collect the data into an analyzable file.
	#Read the full investigation here -- a partnership between USA TODAY and the Teacher Project at Columbia University: https://www.usatoday.com/story/news/education/2020/07/25/disability-special-education-private-school-restraint/4737971002/
	#Below is an example of the code I wrote for the years 2015 and 2016.

	#%%cmd
	#pip install tika

	from tika import parser #allows me to read PDFs and convert them to text strings
	import pandas as pd #allows me to build dataframes
	import re #allows me to use regular expressions

	# Base names (without the .pdf extension) of the scanned incident-report files.
	pdfs = [
		"apr_jun2015",
		"feb2011_june2013",
		"jan_jun2014",
		"jan_mar2015",
		"jan_may2016",
		"jul_dec2013",
		"jul_dec2014",
		"jul_dec2015",
	]

	# Path of every file inside the pdfs/ folder, in the same order as `pdfs`.
	new_pdfs = ["pdfs/" + stem + ".pdf" for stem in pdfs]

	# Run each PDF through tika's parser and store the result under the file's
	# base name.
	# NOTE: `dict` shadows the builtin type; the name is kept because the rest
	# of the script looks the parsed results up through it.
	dict = {}

	# The loop body is indented under the `for` so that both statements run
	# once per file.
	for stem, pdf_path in zip(pdfs, new_pdfs):
		dict[stem] = parser.from_file(pdf_path)
		print(stem)  # show which file has just been parsed

	#can see the dictionary's keys with dict.keys()
	#can see content of the PDF with print(dict['jan_mar2015']['content'])

	#Use regular expressions to find the following data from each report: the name of the student, the date of the incident, and the report description.

	#Start with January-March 2015

	# Text that tika extracted from the January-March 2015 PDF.
	content = dict['jan_mar2015']['content']

	# Student name: whatever sits between "Student Name: " and " Gay" on one line.
	name = re.findall("Student Name: (.+) Gay", content)

	# Incident date: the rest of the line after "...ident Date: ". The three
	# leading wildcards accept any characters in front of "ident" (presumably
	# to tolerate OCR errors in the word "Incident" -- confirm against the scans).
	date = re.findall("...ident Date: (.+) \n", content)
	date.append('1/14/2015')  # one date added by hand (presumably one the pattern did not capture)

	print(len(name))  # 26 names
	print(len(date))  # 26 dates
	print(name)
	print(date)

	# Description: everything between the "Description of Incident:" label and
	# the next run of four newlines (DOTALL lets "." cross line breaks).
	desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

	print(len(desc))  # 20 descriptions found - due to inconsistent formatting, this code could not find every description in the reports.

	for text in desc:
		print(text)  # print descriptions

	# Pad desc with six blank rows so it has as many rows as name and date
	# (zip below stops at the shortest list). The resulting mismatch between
	# descriptions and rows was fixed by hand after exporting the dataframe to
	# Excel, checking multiple times against the original copies of the reports.
	desc.extend([""] * 6)

	# Show full cell contents instead of truncating long descriptions. None is
	# the documented "no limit" value; the former -1 is deprecated since
	# pandas 1.0 and is rejected by current pandas releases.
	pd.set_option('display.max_colwidth', None)
	rows = list(zip(name, date, desc))
	df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
	df

	#April-June 2015

	# Text that tika extracted from the April-June 2015 PDF.
	content = dict['apr_jun2015']['content']

	# Name sits between "Student Name: " and " Gay"; date follows
	# "...ident Date: " up to the trailing space and newline.
	name = re.findall("Student Name: (.+) Gay", content)
	date = re.findall("...ident Date: (.+) \n", content)

	name.append('Shirley')  # one name added by hand

	print(len(name))  # 74 names
	print(len(date))  # 74 dates
	print(name)
	print(date)

	# Description: text between the label and the next run of four newlines.
	desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

	print(len(desc))  # 49 descriptions
	# Both prints belong to the loop: each description is followed by a separator.
	for text in desc:
		print(text)
		print("🍌🍌🍌🍌🍌")  # emojis added to clearly demarcate descriptions

	# Pad with 25 blanks so desc lines up with name and date.
	desc.extend([''] * 25)

	print(len(desc))  # now there are 74 descriptions so it matches with name and date

	rows = list(zip(name, date, desc))
	df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
	pd.options.display.max_rows = 100  # let the notebook show every row
	df

	#July-December 2015

	# Text that tika extracted from the July-December 2015 PDF.
	content = dict['jul_dec2015']['content']

	# Name sits between "Student Name: " and " Gay"; date follows
	# "...ident Date: "; description runs from its label to four newlines.
	name = re.findall("Student Name: (.+) Gay", content)
	date = re.findall("...ident Date: (.+) \n", content)
	desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

	print(len(name))  # 39 names
	print(len(date))  # 38 dates, need to append 1 row
	print(len(desc))  # 21 descriptions, need to append 18 rows

	# Pad the shorter lists with blanks so all three have 39 rows.
	date.append("")
	desc.extend([''] * 18)

	print(len(name))  # 39 rows
	print(len(date))  # 39 rows
	print(len(desc))  # 39 rows

	# Both prints belong to the loop: each description (including the blank
	# padding rows) is followed by an emoji separator line.
	for text in desc:
		print(text)
		print("🍌🍌🍌🍌🍌")

	rows = list(zip(name, date, desc))
	df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
	df

	#January-May 2016

	# Text that tika extracted from the January-May 2016 PDF.
	content = dict['jan_may2016']['content']

	# Name sits between "Student Name: " and " Gay"; date follows
	# "...ident Date: "; description runs from its label to four newlines.
	name = re.findall("Student Name: (.+) Gay", content)
	date = re.findall("...ident Date: (.+) \n", content)
	desc = re.findall("Description of Incident:(.*?)\n\n\n\n", content, re.DOTALL)

	print(len(name))  # 43
	print(len(date))  # 40
	print(len(desc))  # 24

	# Pad the shorter lists with blanks so all three have 43 rows; the
	# appends that were meant to sit inside the two padding loops are
	# expressed here as single extend calls.
	date.extend([""] * 3)
	desc.extend([""] * 19)

	print(len(name))  # 43
	print(len(date))  # 43
	print(len(desc))  # 43

	rows = list(zip(name, date, desc))
	df = pd.DataFrame(rows, columns=['name', 'date', 'desc'])
	df