Skip to content

Instantly share code, notes, and snippets.

@lanky
Created May 20, 2021 20:29
Show Gist options
  • Save lanky/598f77cb92c61e256821dfef83f5760b to your computer and use it in GitHub Desktop.
Save lanky/598f77cb92c61e256821dfef83f5760b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#!/usr/bin/env python3
# pattern matching
import re
# HTML fetching
import requests
# HTML parsing
from bs4 import BeautifulSoup
# for CSV output
from csv import DictWriter
# Page that is scraped for its HTML tables.
CHART_URL = "https://www.officialcharts.com/chart-news/all-the-number-1-singles__7931/"

# Column titles for the CSV output; cell texts are paired with these in order.
FIELDS = ['date', 'artist', 'title', 'wks']

# Rows are kept only if their first cell starts with a DD/MM/YYYY date.
# Compiled once here instead of being looked up for every table row.
_DATE_RE = re.compile(r'\d{2}/\d{2}/\d{4}')


def cell_text(td):
    """Return the first text node inside *td* in title case.

    An empty cell yields '' instead of raising IndexError, so a single blank
    cell cannot abort the whole scrape.

    NOTE: str.title() also capitalises the letter after an apostrophe
    ("can't" -> "Can'T"); that behaviour is kept as-is here.
    """
    first = td.find(string=True)
    return "" if first is None else first.title()


def row_to_record(cells, fields):
    """Turn the cell texts of one table row into a record dict.

    Args:
        cells: list of strings, one per ``td`` of the row (may be empty).
        fields: column titles, paired positionally with *cells*.

    Returns:
        A dict mapping field title to cell text, or None when the row has no
        cells or its first cell does not start with a DD/MM/YYYY date.
        Surplus cells are dropped; missing ones are simply absent from the
        dict (the CSV writer fills them via ``restval``).
    """
    # Rows without any 'td' (e.g. header rows made of 'th') give an empty
    # list; check that before indexing cells[0].
    if not cells or not _DATE_RE.match(cells[0]):
        return None
    return dict(zip(fields, cells))


def main():
    """Fetch the chart page, collect its dated table rows, write numberones.csv."""
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(CHART_URL, timeout=30)
    if not response.ok:
        # Nothing to parse: report the failure instead of ending silently.
        raise SystemExit(
            "could not fetch {}: HTTP {}".format(CHART_URL, response.status_code)
        )

    # we managed to fetch the webpage, now parse it
    soup = BeautifulSoup(response.content, features="lxml")

    # a list to hold our matched table rows
    records = []
    for table in soup.find_all('table'):
        for tr in table.find_all('tr'):
            cells = [cell_text(td) for td in tr.find_all('td')]
            record = row_to_record(cells, FIELDS)
            if record is not None:
                records.append(record)

    # newline='' lets the csv module control line endings itself (otherwise
    # blank lines appear between rows on Windows); utf-8 keeps non-ASCII
    # names intact whatever the platform default encoding is.
    with open("numberones.csv", "w", newline="", encoding="utf-8") as out:
        writer = DictWriter(out, fieldnames=FIELDS, restval='')
        writer.writeheader()
        writer.writerows(records)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment