Skip to content

Instantly share code, notes, and snippets.

@arturhoo
Created July 25, 2015 15:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save arturhoo/a4c47c26e32f2f1bf49f to your computer and use it in GitHub Desktop.
Save arturhoo/a4c47c26e32f2f1bf49f to your computer and use it in GitHub Desktop.
Scraping Prog Archives for top albums
import dataset
import requests
import sys
import re
from lxml import html
# Scrape the Prog Archives "top prog albums" chart for each year and store one
# record per album in the ``albums`` table of a local SQLite file (prog.db).

BASE_URL = "http://www.progarchives.com/top-prog-albums.asp?syears={0}"
FIRST_YEAR = 2005
LAST_YEAR = 2014  # inclusive
# requests has no default timeout; without one a stalled connection blocks
# the script indefinitely.
REQUEST_TIMEOUT = 30  # seconds

# The album id is the trailing ``?id=<digits>`` of the album link.
ALBUM_ID_RE = re.compile(r'\?id=(\d+)$')

# Offsets into the flat list of text nodes of one chart row.
# NOTE(review): these mirror the page layout at the time of writing and will
# need updating if the site's markup changes.
POSITION_IDX = 0
RATING_IDX = 5
TITLE_IDX = 14
ARTIST_IDX = 15
GENRE_IDX = 16


def parse_album_id(album_url):
    """Return the integer album id encoded at the end of *album_url*.

    Raises:
        ValueError: if the URL does not end in ``?id=<digits>``.
    """
    match = ALBUM_ID_RE.search(album_url)
    if match is None:
        raise ValueError(
            'No album id found in URL: {0!r}'.format(album_url))
    return int(match.group(1))


def build_album_record(album_url, texts, year):
    """Build the database record for one chart row.

    Args:
        album_url: first link found in the row; carries the album id.
        texts: all text nodes of the row, in document order.
        year: chart year the row was scraped for.

    Returns:
        A dict with the keys ``id``, ``position``, ``rating``, ``title``,
        ``artist``, ``genre`` and ``year``.

    Raises:
        ValueError: if the URL has no album id, the row has too few text
            nodes, or the position/rating cells are not numeric.
    """
    if len(texts) <= GENRE_IDX:
        raise ValueError(
            'Expected at least {0} text nodes in album row, got {1}'.format(
                GENRE_IDX + 1, len(texts)))
    return dict(
        id=parse_album_id(album_url),
        position=int(texts[POSITION_IDX]),
        rating=float(texts[RATING_IDX]),
        title=texts[TITLE_IDX],
        artist=texts[ARTIST_IDX],
        genre=texts[GENRE_IDX],
        year=year,
    )


def scrape_year(year):
    """Download the chart page for *year* and return its album records.

    Exits the process (via ``sys.exit``) when the server answers with a
    non-200 status code.
    """
    url = BASE_URL.format(year)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        sys.exit('Non 200 status code received')

    parsed = html.fromstring(response.text)
    records = []
    for row in parsed.xpath('//table[2]/tr'):
        hrefs = row.xpath('*//@href')
        if not hrefs:
            # A row without any link cannot be an album entry (no id to
            # extract), so skip it instead of crashing on the lookup.
            continue
        # Evaluate the text-node query once per row rather than per field.
        texts = row.xpath('*//text()')
        records.append(build_album_record(hrefs[0], texts, year))
    return records


def main():
    """Scrape every configured year and persist the albums to prog.db."""
    db = dataset.connect('sqlite:///prog.db')
    albums_table = db['albums']
    for year in range(FIRST_YEAR, LAST_YEAR + 1):
        for record in scrape_year(year):
            # Upsert keyed on the album id so that re-running the script
            # refreshes existing rows instead of failing on a duplicate id.
            albums_table.upsert(record, ['id'])


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment