Skip to content

Instantly share code, notes, and snippets.

@MrN00b0t
Last active May 19, 2020 09:54
Show Gist options
  • Save MrN00b0t/617f077e460e98f61c4889c0f709b88f to your computer and use it in GitHub Desktop.
Save MrN00b0t/617f077e460e98f61c4889c0f709b88f to your computer and use it in GitHub Desktop.
Codecademy: Beautiful Soup Review
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Base URL of the course site; the index page and every link scraped from it
# are resolved against this prefix.
prefix = "https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/"
# Fetch the shelter index page. The timeout keeps an unresponsive server from
# hanging the script indefinitely.
webpage_response = requests.get(prefix + "shellter.html", timeout=30)
# Fail loudly on an HTTP error instead of silently scraping an error page.
webpage_response.raise_for_status()
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")
# Only anchors that actually carry an href can be followed; filtering here
# avoids a KeyError on bare <a> tags.
turtle_links = soup.find_all("a", href=True)
# Each href is treated as relative to the prefix, so prepend it to obtain the
# absolute URL of the linked page.
links = [prefix + a["href"] for a in turtle_links]
# Maps each turtle's name to the list of text fragments scraped from the
# first <ul> on its page.
turtle_data = {}
# Follow each link and scrape the page it points to.
for link in links:
    # The timeout keeps one unresponsive page from stalling the whole run.
    webpage = requests.get(link, timeout=30)
    # Stop on an HTTP error rather than parsing an error page as a turtle.
    webpage.raise_for_status()
    turtle = BeautifulSoup(webpage.content, "html.parser")
    # The first element with class "name" supplies the dictionary key.
    turtle_name = turtle.select(".name")[0].get_text()
    stats = turtle.find("ul")
    # Join the list's text nodes with "|" so they can be split back into
    # individual fragments on the next line.
    stats_text = stats.get_text("|")
    turtle_data[turtle_name] = stats_text.split("|")
# One row per turtle (indexed by name), one column per scraped text fragment.
turtle_df = pd.DataFrame.from_dict(turtle_data, orient="index")
# Per the original author's note, the even-numbered columns (0, 2, ..., 10)
# contain only newlines, so drop them.
turtle_df = turtle_df.drop(columns=list(range(0, 12, 2)))
# Move the turtle name out of the index so rows get a plain numerical index.
turtle_df = turtle_df.reset_index()
# Give appropriate names to the remaining columns.
turtle_df = turtle_df.rename(
    columns={
        "index": "name",
        1: "years_old",
        3: "weight_lbs",
        5: "gender",
        7: "breed",
        9: "source",
    }
)
# Remove the leading label text from the purely textual columns.
for column, label in (
    ("source", "SOURCE: "),
    ("breed", "BREED: "),
    ("gender", "SEX: "),
):
    turtle_df[column] = turtle_df[column].replace(label, "", regex=True)
# Age and weight are more useful as numbers: keep only the second
# space-separated token of each value (presumably the figure that follows the
# label) and convert it to a numeric dtype.
for column in ("weight_lbs", "years_old"):
    tokens = turtle_df[column].str.split(" ", expand=True)
    turtle_df[column] = pd.to_numeric(tokens[1])
# Aggregate functions can now be performed on age/weight!
print(turtle_df.years_old.mean())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment