Skip to content

Instantly share code, notes, and snippets.

@dubpirate
Last active February 13, 2020 03:42
Show Gist options
  • Save dubpirate/144b5f877e70d5f1f88bbd433f5ead69 to your computer and use it in GitHub Desktop.
Save dubpirate/144b5f877e70d5f1f88bbd433f5ead69 to your computer and use it in GitHub Desktop.
I'm sure we've all heard the joke about "eating the rich", so here's the menu! It scrapes the Wikipedia page of the richest Americans, then scrapes their Wikipedia pages for their net worth, then saves it all as a CSV. Please note that this is NOT how I actually name variables, I just thought it was funny :). Something I whipped up for a job application at…
import csv
import re
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
# Fetch the Wikipedia list page and collect the article paths linked from it.
url = "https://en.wikipedia.org/wiki/List_of_Americans_by_net_worth"
# A timeout keeps the script from hanging forever on a stalled connection.
req = requests.get(url, timeout=30)
soup = BeautifulSoup(req.content, 'html5lib')
tables = soup.find("div", attrs={"id": "mw-content-text"})

# hrefs taken from the first link of every <li> in the article body,
# skipping "List ..." pages and links that contain a "#" fragment.
edible = []
for row in tables.find_all("li"):
    anchor = row.a
    # `row.a` is None for a list item without any link; subscripting it would
    # raise TypeError (not KeyError), so check explicitly instead.
    if anchor is None:
        continue
    ref = anchor.get("href")
    # An <a> tag without an href attribute has nothing to follow.
    if ref is None:
        continue
    if "List" not in ref and "#" not in ref:
        edible.append(ref)

# Only the first 146 collected links are scraped.
# NOTE(review): 146 presumably matched the number of people on the page when
# this was written -- confirm against the current page layout.
bills = edible[0:146]
def clean_name(url):
    """Turn a Wikipedia article path into a human-readable name.

    Args:
        url: Site-relative article path such as "/wiki/Jeff_Bezos".

    Returns:
        The article title with the "/wiki/" prefix removed, percent-escapes
        decoded (so "%C3%A7" becomes "ç") and underscores replaced by spaces.
    """
    prefix = "/wiki/"
    # Only strip the prefix when it is really there, rather than blindly
    # chopping the first six characters off an unexpected value.
    title = url[len(prefix):] if url.startswith(prefix) else url
    # hrefs are percent-encoded; decode them so non-ASCII names read correctly.
    return unquote(title).replace("_", " ")
# A number (optional thousands separators and decimals) followed by a
# magnitude word, e.g. "113.0 billion" or "900 million".
_NET_WORTH_RE = re.compile(
    r"(\d[\d,]*(?:\.\d+)?)\s*(trillion|billion|million)",
    re.IGNORECASE,
)
# Fallback pattern: the first bare number anywhere in the text.
_NUMBER_RE = re.compile(r"\d[\d,]*(?:\.\d+)?")
# Multipliers that convert each magnitude word into billions.
_TO_BILLIONS = {"trillion": 1000.0, "billion": 1.0, "million": 0.001}


def clean_money(net_worth):
    """Parse a scraped "Net worth" row into a figure expressed in billions.

    Args:
        net_worth: Raw row text (e.g. "Net worthUS$113.0 billion (2020)[1]"),
            or the sentinel "Not Found".

    Returns:
        The amount in billions as a float rounded to one decimal place.
        Returns 0.0 (after printing a notice) for the "Not Found" sentinel or
        for text that contains no number at all.

    The amount is taken from the first number that is directly followed by
    "million", "billion" or "trillion", so stray digits from dates, footnote
    markers or abbreviations like "U.S." cannot leak into the value. When no
    magnitude word is present, the first number in the text is used and
    assumed to be in billions.
    """
    if net_worth == "Not Found":
        print("Net worth not found")
        return 0.0

    match = _NET_WORTH_RE.search(net_worth)
    if match:
        amount, unit = match.group(1), match.group(2).lower()
    else:
        match = _NUMBER_RE.search(net_worth)
        if match is None:
            # Nothing numeric to parse: report it like a missing row rather
            # than crashing in float("").
            print("Net worth not found")
            return 0.0
        amount, unit = match.group(0), "billion"

    # Round instead of truncating the digit string, so large or precise
    # values are not silently cut short.
    return round(float(amount.replace(",", "")) * _TO_BILLIONS[unit], 1)
def scrape_net_worth(wiki_url):
    """Fetch a Wikipedia article and return the text of its "Net worth" row.

    Args:
        wiki_url: Site-relative article path, e.g. "/wiki/Jeff_Bezos".

    Returns:
        The text of the last table row in the bio card whose text starts with
        "Net", or the sentinel string "Not Found" when there is no such row
        or the page has no table at all.
    """
    # A timeout keeps one stalled request from hanging the whole run.
    req = requests.get("https://en.wikipedia.org" + wiki_url, timeout=30)
    soup = BeautifulSoup(req.content, 'html5lib')

    # The card is the info brief below an individual's picture.
    # NOTE(review): assumes Wikipedia marks that card up as a table with the
    # "infobox" class; preferring it avoids picking up a banner table that
    # precedes it. Falls back to the first <tbody> on the page otherwise.
    card = soup.find("table", class_="infobox")
    if card is None:
        card = soup.find("tbody")
    if card is None:
        # No table on the page: nothing to read a net worth from.
        return "Not Found"

    net_worth = "Not Found"
    for row in card.find_all("tr"):
        text = row.text
        # Deliberately no break: the last matching row wins.
        if text.startswith("Net"):
            net_worth = text
    return net_worth
# Write one CSV row per person: display name and net worth in billions.
# newline="" lets the csv module control line endings itself (otherwise
# Windows output gets blank lines between rows), and an explicit UTF-8
# encoding keeps names with non-ASCII characters writable on every platform.
with open("the_menu.csv", "w", newline="", encoding="utf-8") as menu:
    writer = csv.writer(menu)
    writer.writerow(["Name", "Net Worth"])
    for bill in bills:
        name = clean_name(bill)
        value = clean_money(scrape_net_worth(bill))
        writer.writerow([name, value])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment