Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape ballroom dance competition results from the HTML result pages generated by Skating System Software.
#!/usr/local/bin/python
from bs4 import BeautifulSoup
import urllib2
import re
import sys
import codecs
def _csv_field(text):
    """Return *text* as a single CSV field, quoted only when required.

    Text containing a comma, a double quote or a line break is wrapped in
    double quotes with embedded quotes doubled; any other text is returned
    unchanged, so output for plain values stays exactly as before.
    """
    if any(ch in text for ch in ',"\r\n'):
        return '"' + text.replace('"', '""') + '"'
    return text


def process_folder(folder_url):
    """Print one event's result page to stdout as CSV lines.

    Output is: a quoted event-name line, one ``rank,bib,name`` line per
    matching table row, then a ``,,`` separator line.

    Args:
        folder_url: URL of the event's result page.

    Raises:
        AttributeError: if the page has no ``span.competition`` element or a
            matched row lacks one of the three expected cells (``find``
            returns ``None`` in that case).
    """
    page = urllib2.urlopen(folder_url)
    try:
        # The soup is built while the response is still open; afterwards the
        # connection is no longer needed.
        soup = BeautifulSoup(page, "html.parser")
    finally:
        page.close()

    # Print event (folder) name; embedded quotes are doubled so the quoted
    # field stays well-formed.
    event_name = soup.find("span", class_="competition").get_text().strip()
    print('"' + event_name.replace('"', '""') + '",,')

    # NOTE(review): Beautiful Soup applies a regex filter with search(), so
    # the "row" alternative alone already matches every class containing
    # "row"; "row0dd" (with a zero) looks like a typo for "rowodd" but is
    # harmless. Pattern left untouched.
    rows = soup.find_all("tr", class_=re.compile("(row|row0dd)"))
    for row in rows:
        # Print ranking, competitor bib number, competitor name
        print('%s,%s,%s' % (
            _csv_field(row.find("td", class_="center").get_text()),
            _csv_field(row.find("td", class_="centerHeader").get_text()),
            _csv_field(row.find("td", class_="left").get_text()),
        ))

    # NOTE(review): separator is emitted once per event, after all rows --
    # confirm this matches the intended layout.
    print(",,")
# TODO: Set your URL path here (excluding "index.html").
DEFAULT_MAIN_PAGE = "http://www.danceoption.com/results/180811%20johor%202018/"


def main(main_page=DEFAULT_MAIN_PAGE):
    """Scrape every event linked from a competition index page.

    Fetches *main_page*, looks at each ``<a class="nav1">`` link whose href
    contains "folder", rewrites ``.html`` to ``.res.html`` and hands the
    resulting URL to ``process_folder``.

    Args:
        main_page: Base URL of the competition results (excluding
            "index.html"). Defaults to ``DEFAULT_MAIN_PAGE``.
    """
    # Link targets are appended by plain concatenation, so the base URL must
    # end with a slash.
    if not main_page.endswith("/"):
        main_page += "/"

    page = urllib2.urlopen(main_page)
    try:
        soup = BeautifulSoup(page, "html.parser")
    finally:
        page.close()

    for folder in soup.find_all("a", class_="nav1"):
        path = folder.get("href")
        # Anchors without an href yield None; skip them together with links
        # that do not point at an event folder.
        if not path or "folder" not in path:
            continue
        process_folder(main_page + path.replace(".html", ".res.html"))
if __name__ == "__main__":
    # Wrap both standard streams in UTF-8 writers so non-ASCII names do not
    # raise `UnicodeEncodeError` when the output is piped.
    # Ref: https://stackoverflow.com/a/1169209/58542
    utf8_writer = codecs.getwriter('utf8')
    sys.stdout = utf8_writer(sys.stdout)
    sys.stderr = utf8_writer(sys.stderr)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment