Last active
May 19, 2020 09:54
-
-
Save MrN00b0t/617f077e460e98f61c4889c0f709b88f to your computer and use it in GitHub Desktop.
Codecademy: Beautiful Soup Review
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape the Codecademy "shellter" Beautiful Soup exercise site: follow every
# link on the index page, collect each turtle's stats, and load them into a
# pandas DataFrame with numeric age and weight columns.
import requests
from bs4 import BeautifulSoup
import pandas as pd

prefix = "https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/"
# Seconds to wait on each HTTP request; without a timeout requests.get() can
# block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10

# Fetch the index page and fail loudly on an HTTP error instead of parsing
# an error document.
webpage_response = requests.get(prefix + "shellter.html", timeout=REQUEST_TIMEOUT)
webpage_response.raise_for_status()
webpage = webpage_response.content
soup = BeautifulSoup(webpage, "html.parser")

# Only anchors that actually carry an href can be followed; indexing a bare
# <a> with ["href"] would raise KeyError.
turtle_links = soup.find_all("a", href=True)
# Each href is joined onto prefix to form the URL of a turtle's own page.
links = [prefix + a["href"] for a in turtle_links]

# Define turtle_data: maps each turtle's name to its list of stat strings.
turtle_data = {}
# Follow each link:
for link in links:
    turtle_response = requests.get(link, timeout=REQUEST_TIMEOUT)
    turtle_response.raise_for_status()
    turtle = BeautifulSoup(turtle_response.content, "html.parser")
    turtle_name = turtle.select(".name")[0].get_text()
    stats = turtle.find("ul")
    # get_text("|") also emits the whitespace-only text between the list
    # entries. Keep only fragments that carry text, so the column positions
    # below do not depend on how many newlines the markup contains.
    turtle_data[turtle_name] = [
        fragment.strip()
        for fragment in stats.get_text("|").split("|")
        if fragment.strip()
    ]

turtle_df = pd.DataFrame.from_dict(turtle_data, orient="index")
# Provide numerical index instead of name:
turtle_df = turtle_df.reset_index()
# Give appropriate names to the columns (stats appear in page order):
turtle_df = turtle_df.rename(
    columns={
        "index": "name",
        0: "years_old",
        1: "weight_lbs",
        2: "gender",
        3: "breed",
        4: "source",
    }
)

# Remove extraneous data: strip the label text from the text columns.
label_by_column = {"source": "SOURCE: ", "breed": "BREED: ", "gender": "SEX: "}
for column, label in label_by_column.items():
    turtle_df[column] = turtle_df[column].replace(label, "", regex=True)

# Age and weight likely to benefit from being numerical. The value is taken
# from the second space-separated token (presumably the number that follows
# the label), then converted; to_numeric raises if that token is not a number.
for column in ("weight_lbs", "years_old"):
    turtle_df[column] = pd.to_numeric(turtle_df[column].str.split(" ").str[1])

# Aggregate functions can now be performed on age/weight!
print(turtle_df["years_old"].mean())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment