# jiobu1/great_schools_scraper.py

## great_schools_scraper.py
# Scrape the GreatSchools table-view listing for every city in csv/cities.csv
# and collect one record per school row; the result ends up in `school_df`.
cities = pd.read_csv('csv/cities.csv')

records = []

# Base URL for greatschools; each value of the 'city' column is URL-quoted and
# appended to it (presumably a "state/city" path -- confirm against the CSV).
url_pre = 'http://www.greatschools.org/'

# selenium driver
driver = webdriver.Chrome()

try:
    for city in cities['city']:
        fetching = True
        page = 0

        while fetching:
            page += 1
            url = url_pre + urllib.parse.quote(city) + '/schools/?page={}&tableView=Overview&view=table'.format(page)
            print("Fetching ", url)

            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # Check whether this is the last page. The pagination summary text
            # splits into e.g.
            # [Showing, 1, to, 25, of, 1,109, schools, found, in, New, York, NY]
            # so position 3 is the number of the last item on this page and
            # position 5 is the total number of schools.
            page_status = soup.find('div', {'class': 'pagination-summary'})
            try:
                page_status_list = page_status.text.strip().split()
                ending = page_status_list[3].replace(',', '')
                total = page_status_list[5].replace(',', '')
                fetching = int(ending) < int(total)
            except (AttributeError, IndexError, ValueError):
                # find() returns None when the summary is absent, and the text
                # may not have the expected shape. Parse whatever table this
                # page has, then move on to the next city instead of crashing.
                print("No usable pagination summary, treating as last page: ", url)
                fetching = False

            table = soup.find("table", {"class": ""})
            if table is None:
                # Nothing to parse here; stop paging this city so a broken
                # page cannot abort the run or keep the loop spinning.
                print("No results table found, skipping rest of: ", url)
                break

            for row in table.find_all("tr"):
                cells = row.find_all("td")
                # Only rows with exactly 7 cells are school rows.
                if len(cells) != 7:
                    continue

                school = row.find('a', {'class': 'name'}).text.strip()

                score_tag = row.find('div', {'class': 'circle-rating--small'})
                score = score_tag.text.strip() if score_tag is not None else '0/10'

                rating = row.find('div', {'class': 'scale'}).text.strip()

                address_tag = row.find('div', {'class': 'address'})
                address = address_tag.text.strip() if address_tag is not None else "Unavailable"

                school_type = cells[1].find(text=True)
                grade = cells[2].find(text=True)
                students = cells[3].find(text=True)
                student_teacher_ratio = cells[4].find(text=True)

                # find(text=True) returns None for an empty cell (it does not
                # raise), so the fallback needs an explicit None test.
                district = cells[6].find(text=True)
                if district is None:
                    district = 'Unavailable'

                records.append({
                    'School': school,
                    'Score': score,
                    'Rating': rating,
                    'Address': address,
                    'Type': school_type,
                    'Grades': grade,
                    'Total Students Enrolled': students,
                    'Students per teacher': student_teacher_ratio,
                    'District': district,
                })
finally:
    # Always end the browser session, even if scraping failed part-way;
    # quit() shuts down the whole driver, not just the current window.
    driver.quit()

school_df = pd.DataFrame.from_dict(records)
	# Looping through each city in the file
# NOTE: this copy of the script had lost all of its block indentation; the
# nesting below is restored so the loops and conditionals are valid Python.
cities = pd.read_csv('csv/cities.csv')

records = []

# url for greatschools; the URL-quoted 'city' value from the CSV is appended
# to it, followed by the table-view query string.
url_pre = 'http://www.greatschools.org/'

# selenium driver
driver = webdriver.Chrome()

try:
    for city in cities['city']:
        fetching = True
        page = 0

        while fetching:
            page += 1
            url = url_pre + urllib.parse.quote(city) + '/schools/?page={}&tableView=Overview&view=table'.format(page)
            print("Fetching ", url)

            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')

            # check if last page
            page_status = soup.find('div', {'class': 'pagination-summary'})
            try:
                # list of pagination summary words, e.g.
                # [Showing, 1, to, 25, of, 1,109, schools, found, in, New, York, NY]
                page_status_list = page_status.text.strip().split()
                # position 3 is the number of the last item on the page
                ending = page_status_list[3].replace(',', '')
                # position 5 is the total number of schools
                total = page_status_list[5].replace(',', '')
                fetching = int(ending) < int(total)
            except (AttributeError, IndexError, ValueError):
                # Missing or unexpected summary (find() returned None, or the
                # text had a different shape): parse this page, then stop
                # paging this city rather than aborting the whole scrape.
                print("No usable pagination summary, treating as last page: ", url)
                fetching = False

            table = soup.find("table", {"class": ""})
            if table is None:
                # No table to read; give up on this city and go to the next.
                print("No results table found, skipping rest of: ", url)
                break

            for row in table.find_all("tr"):
                cells = row.find_all("td")
                # rows that do not have exactly 7 cells are not school rows
                if len(cells) != 7:
                    continue

                school = row.find('a', {'class': 'name'}).text.strip()

                score_tag = row.find('div', {'class': 'circle-rating--small'})
                score = score_tag.text.strip() if score_tag is not None else '0/10'

                rating = row.find('div', {'class': 'scale'}).text.strip()

                address_tag = row.find('div', {'class': 'address'})
                address = address_tag.text.strip() if address_tag is not None else "Unavailable"

                school_type = cells[1].find(text=True)
                grade = cells[2].find(text=True)
                students = cells[3].find(text=True)
                student_teacher_ratio = cells[4].find(text=True)

                # An empty cell yields None from find(text=True) instead of an
                # exception, so check for None to apply the fallback.
                district = cells[6].find(text=True)
                if district is None:
                    district = 'Unavailable'

                records.append({
                    'School': school,
                    'Score': score,
                    'Rating': rating,
                    'Address': address,
                    'Type': school_type,
                    'Grades': grade,
                    'Total Students Enrolled': students,
                    'Students per teacher': student_teacher_ratio,
                    'District': district,
                })
finally:
    # Release the browser session no matter how the scrape ended; quit()
    # terminates the driver rather than only closing the current window.
    driver.quit()

school_df = pd.DataFrame.from_dict(records)