Skip to content

Instantly share code, notes, and snippets.

@gareththomasnz
Forked from nhomble/scraper.py
Created August 10, 2019 12:50
Show Gist options
  • Save gareththomasnz/218cbe880f53a6e3cc4be15714d38936 to your computer and use it in GitHub Desktop.
Save gareththomasnz/218cbe880f53a6e3cc4be15714d38936 to your computer and use it in GitHub Desktop.
Playing with a web scraper (Beautiful Soup) to expose BodyBuilding.com's exercise database (to later be turned into a RESTful service, maybe).
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import pycurl
from io import BytesIO
import re
import sys
END = "abcdefghijklmnopqrstuvwxyz0123456789"
BB_URL = "http://www.bodybuilding.com/exercises/list/index/selected/"
class Exercise():
    """Container for a single BodyBuilding.com exercise's detail fields.

    Built from the dict assembled by ``_get_exercise_details``; the dict
    keys mirror the row labels shown on the exercise detail page
    ("Type", "Main Muscle Worked", "Equipment", ...).
    """

    def __init__(self, kwargs):
        """Populate attributes from *kwargs* (a plain dict, passed positionally).

        BUG FIX: the original indexed ``kwargs[...]`` directly, so any
        missing label raised KeyError. Missing keys now fall back to the
        same benign defaults ``_get_exercise_details`` seeds, so a detail
        page that omits a row still parses.
        """
        self.name = kwargs['name']  # name is mandatory — no sensible default
        self.aka = kwargs.get('Also Known As', [])
        self.type = kwargs.get('Type', '')
        self.main_muscle = kwargs.get('Main Muscle Worked', '')
        self.other_muscles = kwargs.get('Other Muscles', '')
        self.equipment = kwargs.get('Equipment', '')
        self.mechanics = kwargs.get('Mechanics Type', '')
        self.level = kwargs.get('Level', '')
        self.sport = kwargs.get('Sport', '')
        self.force = kwargs.get('Force', '')

    def __str__(self):
        """Render every attribute for quick inspection/printing."""
        return str(self.__dict__)
## extract html contents
def get_html(url):
    """Fetch *url* with pycurl and return the body decoded as UTF-8.

    Returns None when the response bytes are not valid UTF-8. Network
    errors raised by pycurl still propagate to the caller, as before —
    but the curl handle is now always released.
    """
    print(url)  # simple progress trace
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(c.URL, url)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
    finally:
        # BUG FIX: the original leaked the handle if perform() raised
        c.close()
    try:
        return buffer.getvalue().decode('UTF-8')
    except UnicodeDecodeError:
        # narrowed from a bare except: only decoding can fail here
        print(sys.exc_info()[0])
        return None
## helper method for listing page
def _get_list_of_exercises(listing_html):
    """Extract exercise detail-page URLs from one listing page.

    Returns a list of href strings, or None when *listing_html* is None
    or the expected #listResults markup is missing.
    """
    if listing_html is None:
        return None
    soup = BeautifulSoup(listing_html, "html.parser")
    try:
        results = soup.find("div", {"id": "listResults"})
        names = results.find_all("div", {"class": "exerciseName"})
        # each exerciseName div wraps a single <a href=...> to the detail page
        return [div.find("a")["href"] for div in names]
    except (AttributeError, TypeError, KeyError):
        # narrowed from a bare except: these are the failures the
        # find/find_all/["href"] chain can actually produce on bad markup
        print(sys.exc_info()[0])
        return None
## helper method for detail page
def _get_exercise_details(details_html):
    """Parse one exercise detail page into an Exercise instance.

    Returns None when the page lacks the expected #exerciseDetails markup.
    """
    try:
        soup = BeautifulSoup(details_html, "html.parser")
        details = soup.find("div", {"id": "exerciseDetails"})
        rows = details.find_all("span", {"class": "row"})
    except (AttributeError, TypeError):
        # narrowed from a bare except: None input or missing div
        print(sys.exc_info()[0])
        return None
    # slugify the page heading (spaces -> hyphens, strip embedded newlines)
    name = details.h1.text.strip().replace(" ", "-").replace('\n', "")
    args = {
        "name": name,
        "Also Known As": [],
        "Type": "",
        # BUG FIX: was "Main Muscle Worker", which never matched the
        # "Main Muscle Worked" key that Exercise.__init__ reads
        "Main Muscle Worked": "",
        "Other Muscles": "",
        "Equipment": "",
        "Mechanics Type": "",
        "Level": "",
        "Sport": "",
        "Force": ""
    }
    for row in rows:
        groups = re.search('(.+):(.+)', row.text.replace('\n', '').strip())
        if groups is None:
            # BUG FIX: a row without a "Label: value" shape previously
            # raised an uncaught AttributeError — skip it instead
            continue
        key = groups.group(1).strip()
        args[key] = groups.group(2).strip()
        if key == "Also Known As":
            # BUG FIX: the original called the undefined name `replace`
            # inside map() (NameError) and never materialized the result.
            # Hyphenate each alias to match the slugified name style.
            args["Also Known As"] = [
                alias.replace(" ", "-")
                for alias in args["Also Known As"].split(', ')
            ]
    return Exercise(args)
def main():
    """Crawl every listing page (a-z, 0-9) and print each exercise's details."""
    # phase 1: collect detail-page URLs from all listing pages
    exercises = []
    for letter in END:
        found = _get_list_of_exercises(get_html(BB_URL + letter))
        if found is not None:
            exercises.extend(found)
    # phase 2: fetch and print each detail page that parses cleanly
    for href in exercises:
        details = _get_exercise_details(get_html(href))
        if details is not None:
            print(details)


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment