Skip to content

Instantly share code, notes, and snippets.

@jiobu1
Created April 26, 2021 03:03
Show Gist options
  • Save jiobu1/e0d388692cab90cb9e0ea82acc79c215 to your computer and use it in GitHub Desktop.
Save jiobu1/e0d388692cab90cb9e0ea82acc79c215 to your computer and use it in GitHub Desktop.
Web scraping with Selenium and Beautiful Soup
# Scrape the school listing tables on greatschools.org for every city listed
# in csv/cities.csv and collect one record per school into `school_df`.


def _tag_text(tag, default):
    """Return the stripped text of *tag*, or *default* when *tag* is None."""
    return tag.text.strip() if tag is not None else default


def _first_text(cell, default=None):
    """Return the first text node inside *cell* as a plain str, or *default*.

    The node is converted with str() because a NavigableString keeps a
    reference to the whole parse tree; storing it in `records` would keep
    every fetched page alive in memory.
    """
    node = cell.find(string=True)
    return str(node) if node is not None else default


def _is_last_page(page_status):
    """Tell whether the pagination summary describes the final page.

    The summary text is expected to look like
    "Showing 1 to 25 of 1,109 schools found in New York NY".
    A missing or unparsable summary is treated as the last page so that the
    scrape moves on to the next city instead of crashing or looping forever.
    """
    if page_status is None:
        return True
    # [Showing, 1, to, 25, of, 1,109, schools, found, in, New, York, NY]
    tokens = page_status.text.strip().split()
    try:
        ending = int(tokens[3].replace(',', ''))  # position 3: last item shown on this page
        total = int(tokens[5].replace(',', ''))  # position 5: total number of schools
    except (IndexError, ValueError):
        return True
    return ending >= total


# Looping through each city in the file
cities = pd.read_csv('csv/cities.csv')
records = []
# selenium driver
driver = webdriver.Chrome()
# base url for greatschools; the state/city path and query string are appended
url_pre = 'http://www.greatschools.org/'
try:
    # dropna(): a blank CSV row is read as NaN, which urllib.parse.quote rejects
    for city in cities['city'].dropna():
        page = 0
        while True:
            page += 1
            url = (
                f'{url_pre}{urllib.parse.quote(city)}'
                f'/schools/?page={page}&tableView=Overview&view=table'
            )
            print("Fetching ", url)
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            # check if last page (decided before parsing so the final page is still scraped)
            last_page = _is_last_page(
                soup.find('div', {'class': 'pagination-summary'})
            )
            table = soup.find("table", {"class": ""})
            rows = table.find_all("tr") if table is not None else []
            for row in rows:
                cell = row.find_all("td")
                name_link = row.find('a', {'class': 'name'})
                # Only rows with 7 cells and a school name link are school rows
                if len(cell) != 7 or name_link is None:
                    continue
                records.append({
                    'School': name_link.text.strip(),
                    'Score': _tag_text(
                        row.find('div', {'class': 'circle-rating--small'}), '0/10'
                    ),
                    'Rating': _tag_text(
                        row.find('div', {'class': 'scale'}), 'Unavailable'
                    ),
                    'Address': _tag_text(
                        row.find('div', {'class': 'address'}), 'Unavailable'
                    ),
                    'Type': _first_text(cell[1]),
                    'Grades': _first_text(cell[2]),
                    'Total Students Enrolled': _first_text(cell[3]),
                    'Students per teacher': _first_text(cell[4]),
                    'District': _first_text(cell[6], 'Unavailable'),
                })
            if last_page:
                break
finally:
    # quit() (not close()) ends the whole WebDriver session, and running it in
    # `finally` releases the browser even when a page fails mid-scrape.
    driver.quit()
school_df = pd.DataFrame.from_dict(records)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment