# Scrape Prog Archives for the top-rated albums of each year.
import re
import sys

import dataset
import requests
from lxml import html
# Listing page for one year's top albums; "{0}" is replaced by the year.
BASE_URL = "http://www.progarchives.com/top-prog-albums.asp?syears={0}"

# Album links end in "?id=<digits>"; the digits become the stored row id.
ALBUM_ID_RE = re.compile(r'\?id=(\d+)$')

# Seconds to wait on the server before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Offsets of each field within the list of text nodes of one table row.
POSITION_INDEX = 0
RATING_INDEX = 5
TITLE_INDEX = 14
ARTIST_INDEX = 15
GENRE_INDEX = 16


def album_id_from_url(album_url):
    """Return the integer album id taken from the end of *album_url*.

    Raises ValueError when the URL does not end in "?id=<digits>".
    """
    match = ALBUM_ID_RE.search(album_url)
    if match is None:
        raise ValueError('No album id found in URL: {0!r}'.format(album_url))
    return int(match.group(1))


def build_record(album_url, texts, year):
    """Build the dict stored for one album row.

    album_url -- first link found in the row; supplies the album id.
    texts     -- text nodes of the row, in document order.
    year      -- year of the listing the row was scraped from.

    Raises ValueError when the row has fewer text nodes than the field
    offsets require, or when the link carries no album id.
    """
    if len(texts) <= GENRE_INDEX:
        raise ValueError(
            'Expected at least {0} text nodes in album row, got {1}'.format(
                GENRE_INDEX + 1, len(texts)))
    return dict(
        id=album_id_from_url(album_url),
        position=int(texts[POSITION_INDEX]),
        rating=float(texts[RATING_INDEX]),
        title=texts[TITLE_INDEX],
        artist=texts[ARTIST_INDEX],
        genre=texts[GENRE_INDEX],
        year=year,
    )


def fetch_album_rows(year):
    """Download the listing page for *year* and return its table rows.

    Exits the program when the server answers with anything but 200.
    """
    response = requests.get(BASE_URL.format(year), timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        sys.exit('Non 200 status code received')
    parsed = html.fromstring(response.text)
    return parsed.xpath('//table[2]/tr')


def main():
    """Scrape the listings for 2005-2014 into the 'albums' table of prog.db."""
    db = dataset.connect('sqlite:///prog.db')
    albums_table = db['albums']
    for year in range(2005, 2015):
        for row in fetch_album_rows(year):
            links = row.xpath('*//@href')
            if not links:
                # A row without any link cannot be an album entry; skip it
                # instead of failing on the missing href.
                continue
            # Evaluate the text XPath once per row and index into the result.
            texts = row.xpath('*//text()')
            albums_table.insert(build_record(links[0], texts, year))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment