Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scraping Prog Archives for top albums
import dataset
import requests
import sys
import re
from lxml import html
# Scrape the Prog Archives "top prog albums" listing for each year in the
# range below and store one row per album in the ``albums`` table of the
# local SQLite database ``prog.db``.
db = dataset.connect('sqlite:///prog.db')
albums_table = db['albums']

# Loop-invariant values, built once instead of on every iteration.
BASE_URL = "http://www.progarchives.com/top-prog-albums.asp?syears={0}"
ALBUM_ID_RE = re.compile(r'\?id=(\d+)$')
# Seconds to wait for the server. Without an explicit timeout, requests can
# block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

for year in range(2005, 2015):  # 2005 through 2014 inclusive
    response = requests.get(BASE_URL.format(year), timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        sys.exit('Non 200 status code received')

    parsed = html.fromstring(response.text)
    for album in parsed.xpath('//table[2]/tr'):
        # The first link in the row is taken as the album URL; its trailing
        # "?id=<digits>" query supplies the numeric album id.
        album_url = album.xpath('*//@href')[0]
        album_id = int(ALBUM_ID_RE.search(album_url).group(1))

        # Evaluate the text-node query once per row. The fields are picked
        # by fixed offsets into the flattened list of text nodes, so they
        # depend on the page markup staying the same.
        texts = album.xpath('*//text()')
        albums_table.insert(dict(
            id=album_id,
            position=int(texts[0]),
            rating=float(texts[5]),
            title=texts[14],
            artist=texts[15],
            genre=texts[16],
            year=year,
        ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment