#! /usr/bin/env python
#
# Script for scraping band names and basic associated information from
# http://www.metal-archives.com
#
# Author: Jon Charest (http://github.com/jonchar)
# Year: 2016
#
# Approach:
# For each {NBR, A-Z}
# Read number of entries for given letter using result from `get_url`
# Determine how many requests of 500 entries are required, issue requests
# Parse the JSON in the `Response` returned by `get_url` using `json_fix_and_get`
# Read contents in 'aaData' key into a pandas `DataFrame`
# Set column names to `column_names`
# Clean up columns
# Concatenate & store outputs in a DataFrame
# Save final DataFrame to csv
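#
# The request parameters and response keys used below follow the jQuery
# DataTables server-side conventions, so a response looks roughly like the
# sketch below (the field names are the keys this script actually reads;
# the values are illustrative only):
#
#     {"iTotalRecords": 12345,
#      "sEcho": 0,
#      "aaData": [["<a href='...'>Band</a>", "Country", "Genre", "Status"],
#                 ...]}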
import time
import datetime
import requests
import json
from pandas import DataFrame, concat
BASEURL = 'http://www.metal-archives.com'
RELURL = '/browse/ajax-letter/json/1/l/'
response_len = 500
def get_url(letter='A', start=0, length=500):
    """Gets the listings displayed as alphabetical tables on M-A for input
    `letter`, starting at `start` and ending at `start` + `length`.
    Returns a `Response` object. Because the site returns slightly invalid
    JSON, the response text is parsed with `json_fix_and_get` below rather
    than by calling the `json()` method of the `Response` directly."""
    payload = {'sEcho': 0,  # if not set, response text is not valid JSON
               'iDisplayStart': start,  # set start index of band names returned
               'iDisplayLength': length}  # only response lengths of 500 work
    r = requests.get(BASEURL + RELURL + letter, params=payload)
    return r
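# Example (illustrative values): requesting the first 500 'B' entries.
# The request URL resolves to BASEURL + RELURL + 'B' with the payload
# encoded as query parameters:
#
#     r = get_url(letter='B', start=0, length=500)
#     r.url  # 'http://www.metal-archives.com/browse/ajax-letter/json/1/l/B?sEcho=0&...'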
def json_fix_and_get(txt):
    """Adds a 0 at the 11th position of the 4th row of `txt`, because the
    site returns invalid JSON data, then parses the patched text.
    """
    lines = txt.split('\n')
    lines[3] = lines[3][:10] + '0,'
    return json.loads('\n'.join(lines))
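# Example (illustrative): parsing a fixed-up response; each entry of
# 'aaData' is one band row matching the `column_names` set below:
#
#     r = get_url(letter='A')
#     js = json_fix_and_get(r.text)
#     js['aaData'][0]  # ['<a href="...">...</a>', 'Country', 'Genre', 'Status']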
# Data columns returned in the JSON object
column_names = ['NameLink', 'Country', 'Genre', 'Status']
frames = []  # collect per-chunk DataFrames; concatenated after the loop
# Valid inputs for the `letter` parameter of the URL are A through Z, NBR, or ~
letters = 'A B C D E F G H I J K L M N O P Q R S T U V W X Y Z NBR ~'.split()
date_of_scraping = datetime.datetime.utcnow().strftime('%Y-%m-%d')
# Retrieve the data
for letter in letters:
    # Get total records for given letter & calculate number of chunks
    print('Current letter = ', letter)
    r = get_url(letter=letter, start=0, length=response_len)
    js = json_fix_and_get(r.text)
    n_records = js['iTotalRecords']
    # Ceiling division, so an exact multiple of `response_len` does not
    # produce a trailing empty request
    n_chunks = (n_records + response_len - 1) // response_len
    print('Total records = ', n_records)
    # Retrieve chunks
    for i in range(n_chunks):
        start = response_len * i
        end = min(start + response_len, n_records)
        print('Fetching band entries ', start, 'to ', end)
        for attempt in range(10):
            time.sleep(3)  # Obeying their robots.txt "Crawl-delay: 3"
            try:
                r = get_url(letter=letter, start=start, length=response_len)
                js = json_fix_and_get(r.text)
                # Store response
                frames.append(DataFrame(js['aaData']))
            # If the response is malformed, `json.loads` inside
            # `json_fix_and_get` raises a JSONDecodeError, so retry
            except json.JSONDecodeError:
                print('JSONDecodeError on attempt ', attempt, ' of 10.')
                print('Retrying...')
                continue
            break
# Combine the chunks into one DataFrame with informative column names and
# an index that runs from 0 to the number of bands
data = concat(frames, ignore_index=True)
data.columns = column_names
# Save to CSV
f_name = 'MA-band-names_{}.csv'.format(date_of_scraping)
print('Writing band data to csv file:', f_name)
data.to_csv(f_name)
print('Complete!')
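# The saved CSV (index in the first column) can be loaded back with, e.g.:
#
#     from pandas import read_csv
#     bands = read_csv(f_name, index_col=0)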