Created
May 9, 2020 22:08
-
-
Save ramskyi/8d831e561d835ef0659bcfb8788ca4e0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# | |
# Script for scraping band names and basic associated information from | |
# http://www.metal-archives.com | |
# | |
# Author: Jon Charest (http://github.com/jonchar) | |
# Year: 2016 | |
# | |
# Approach: | |
# For each {NBR, A-Z} | |
# Read number of entries for given letter using result from `get_url` | |
# Determine how many requests of 500 entries are required, issue requests | |
# Read JSON from the `Response` object returned by `get_url` using `json_fix_and_get(r.text)` | |
# Read contents in 'aaData' key into a pandas `DataFrame` | |
# Set column names to `column_names` | |
# Clean up columns | |
# Concatenate & store outputs in a DataFrame | |
# Save final DataFrame to csv | |
import time | |
import datetime | |
import requests | |
import json | |
from pandas import DataFrame | |
BASEURL = 'http://www.metal-archives.com' | |
RELURL = '/browse/ajax-letter/json/1/l/' | |
response_len = 500 | |
def get_url(letter='A', start=0, length=500):
    """Request one page of the alphabetical band table from M-A.

    The page covers the bands listed under `letter`, beginning at index
    `start` and spanning `length` entries.

    Returns the `Response` object produced by `requests.get`; the body is
    available through its `text` attribute or its `json()` method.
    """
    query = dict(
        sEcho=0,                # if not set, response text is not valid JSON
        iDisplayStart=start,    # index of the first band name returned
        iDisplayLength=length,  # only response lengths of 500 work
    )
    endpoint = BASEURL + RELURL
    return requests.get(endpoint + letter, params=query)
def json_fix_and_get(txt):
    """Repair the site's invalid JSON text and return the decoded object.

    The 4th line of the response is cut after its first 10 characters and
    `0,` is appended, i.e. a 0 is written at the 11th position of the 4th
    row, because the site returns invalid JSON data there.

    Parameters:
        txt: raw response body as a string (e.g. `Response.text`).

    Returns:
        The object decoded by `json.loads` from the repaired text.

    Raises:
        json.JSONDecodeError: if the body has fewer than four lines or is
            still not valid JSON after the repair.
    """
    # Work on the argument itself rather than on a global response object.
    lines = txt.split('\n')
    if len(lines) < 4:
        # Too short to be the expected payload; report it as a decode error
        # so callers can treat it like any other malformed response.
        raise json.JSONDecodeError('response has fewer than 4 lines', txt, 0)
    lines[3] = lines[3][:10] + '0,'
    return json.loads('\n'.join(lines))
# Data columns returned in the JSON object
column_names = ['NameLink', 'Country', 'Genre', 'Status']

# Raw rows gathered from every response. The DataFrame is built once at the
# end: `DataFrame.append` was removed in pandas 2.0 and copied the whole
# frame on every call.
rows = []

# Valid inputs for the `letter` parameter of the URL are NBR or A through Z;
# '~' is requested as well (presumably the bucket for names starting with
# other symbols -- TODO confirm).
letters = 'A B C D E F G H I J K L M N O P Q R S T U V W X Y Z NBR ~'.split()

# Timezone-aware replacement for the deprecated `datetime.utcnow()`.
date_of_scraping = (
    datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%d')
)

# Retrieve the data
for letter in letters:
    # Get total records for given letter & calculate number of chunks
    print('Current letter = ', letter)
    r = get_url(letter=letter, start=0, length=response_len)
    js = json_fix_and_get(r.text)
    n_records = js['iTotalRecords']
    # Ceiling division: an exact multiple of `response_len` (or zero records)
    # must not trigger an extra, empty request.
    n_chunks = (n_records + response_len - 1) // response_len
    print('Total records = ', n_records)

    # Retrieve chunks
    for i in range(n_chunks):
        start = response_len * i
        end = min(start + response_len, n_records)
        print('Fetching band entries ', start, 'to ', end)

        for attempt in range(10):
            time.sleep(3)  # Obeying their robots.txt "Crawl-delay: 3"
            try:
                r = get_url(letter=letter, start=start, length=response_len)
                js = json_fix_and_get(r.text)
            except (json.JSONDecodeError, IndexError) as err:
                # A malformed body fails to decode, and one with fewer than
                # four lines fails while being repaired; retry either way.
                print(type(err).__name__ + ' on attempt ', attempt + 1, ' of 10.')
                print('Retrying...')
                continue
            # Store response
            rows.extend(js['aaData'])
            break
        else:
            # Every attempt failed: say so instead of silently losing data.
            print('Giving up on band entries ', start, 'to ', end,
                  ' after 10 attempts.')

# Build the table with informative column names. Assumes every row in
# 'aaData' has exactly four fields -- TODO confirm against a live response.
# The index already runs from 0 to the number of bands, so no reset is needed.
data = DataFrame(rows, columns=column_names)

# Save to CSV
f_name = 'MA-band-names_{}.csv'.format(date_of_scraping)
print('Writing band data to csv file:', f_name)
data.to_csv(f_name)
print('Complete!')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment