Created
August 22, 2016 19:43
-
-
Save lewiseason/e0f0b6b61f30e3cc0a23abbbcee9c551 to your computer and use it in GitHub Desktop.
Crawl the Official UK Singles Chart page by page from a known starting point and write out a CSV of artist, title and the highest position each track achieved
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys

from lxml.html import parse
# Sentinel that compares greater than any chart position; used as the
# "not seen yet" default when looking a track up in `positions`.
inf = float('inf')

# Site root; page paths are appended to this to form the URL to fetch.
base_uri = 'http://www.officialcharts.com'

# Site-relative path of the first chart page to scrape.
start_page = '/charts/singles-chart/20150828/7501/'

# Maps (title, artist) -> numerically lowest position seen so far.
positions = {}
def parse_page(page):
    """Scrape one chart page and record each track's best position so far.

    Parses the document at ``base_uri + page``, walks the rows of the
    ``chart-positions`` table and updates the module-level ``positions``
    dict, keyed by ``(title, artist)``, whenever a row carries a
    numerically lower position than the one already stored.

    Args:
        page: Site-relative path of the chart page (for example
            ``start_page``).

    Returns:
        The ``href`` of the first "next" link found on the page, or
        ``None`` when the page has no such link.

    Raises:
        IndexError: If a track row lacks the position, title or artist
            node that the XPath expressions expect.
        ValueError: If the text of the position cell is not an integer.
    """
    tree = parse(base_uri + page)

    for row in tree.xpath('//table[@class="chart-positions"]/tr'):
        # Only rows containing a "track" div are chart entries; skip
        # everything else in the table.
        if not row.xpath('./td/div[@class="track"]'):
            continue

        # NOTE(review): the position is read from the fourth cell; confirm
        # against the live markup that this is the intended column.
        position = int(row.xpath('./td[4]/text()')[0])
        title = row.xpath('./td[3]/div/div[2]/div[1]/a/text()')[0]
        artist = row.xpath('./td[3]/div/div[2]/div[2]/a/text()')[0]
        track = (title, artist)

        # `inf` stands in for "never seen", so a first sighting always
        # gets stored; afterwards only a strictly lower number replaces it.
        if position < positions.get(track, inf):
            positions[track] = position

    next_links = tree.xpath('//a[@class="next chart-date-directions"]/@href')
    # No next link means there is nothing further to crawl.
    return next_links[0] if next_links else None
# Follow the "next" links from the starting page until parse_page() returns
# None, i.e. until a page has no next link.
page = start_page
count = 0  # number of pages processed
while page:
    page = parse_page(page)
    count += 1

# csv.writer needs a binary file on Python 2 but a text file opened with
# newline='' on Python 3; passing a 'wb' file on Python 3 makes writerow()
# raise TypeError. Pick the mode that matches the running interpreter.
if sys.version_info[0] >= 3:
    open_kwargs = {'mode': 'w', 'newline': '', 'encoding': 'utf-8'}
else:
    open_kwargs = {'mode': 'wb'}

with open('/tmp/chart.csv', **open_kwargs) as csvfile:
    w = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for track, position in positions.items():
        # `track` is (title, artist); the row is written as
        # artist, title, position.
        w.writerow([track[1], track[0], position])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment