@kinow, forked from mynameisfiber/thesession.py, created March 25, 2016
thesession.org ABC file scraper
#!/usr/bin/env python2.7
"""
Scrape thesession.org for all the yummy ABC files
"""
import os
import itertools as IT
from collections import deque
from operator import itemgetter
from urlparse import urljoin
import grequests  # third-party; install with `pip install grequests`
CHUNK_SIZE = 5
def chunk(_iter, chunk_size=CHUNK_SIZE):
    """
    Chunks up the given iterator into `chunk_size` lists to make batching
    easier
    """
    buf = []
    for item in _iter:
        buf.append(item)
        if len(buf) == chunk_size:
            yield buf
            buf = []
    if buf:  # only yield the final partial batch if it is non-empty
        yield buf
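# An illustrative run of chunk() (not part of the original script): batching a
# seven-item iterator in threes yields two full batches plus the remainder.
#
#   >>> list(chunk(iter(range(7)), 3))
#   [[0, 1, 2], [3, 4, 5], [6]]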
def find_best_abc(content, _id):
    """
    Find the best ABC file referenced in the given thesession.org tune's page.
    I was going to use a regex for this, but this explicit find method is
    actually substantially faster. It works on the premise that the best ABC
    file will be closest to the bottom of the page.
    """
    start = content.rfind("/tunes/{}/abc/".format(_id))
    if start < 0:  # no ABC link found on this page
        return None
    end = content[start:].find('"')
    return content[start:start + end]
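# Illustration, assuming (hypothetically) that the page links each ABC setting
# as href="/tunes/<id>/abc/<n>":
#
#   content = '... href="/tunes/123/abc/1" ... href="/tunes/123/abc/2" ...'
#   find_best_abc(content, 123)  # -> '/tunes/123/abc/2', the last setting
#
# rfind() locates the final occurrence, so the link nearest the bottom of the
# page wins, per the docstring's premise.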
def get_abc_files(chunk_size=CHUNK_SIZE):
    """
    Iterate through thesession.org tune pages sequentially and extract the
    best ABC file from each. We only look at pages that we haven't already
    scraped, and we batch our requests in chunks
    """
    collect = deque()
    for i in IT.count(1):
        if not os.path.exists("data/{:08d}.abc".format(i)):
            print "Adding request for: ", i
            request = grequests.request("GET", "https://thesession.org/tunes/{}".format(i))
            collect.append((i, request))
        if len(collect) == chunk_size:
            print "Issuing requests"
            pages = grequests.map(IT.imap(itemgetter(1), collect))
            for (_id, _), page in IT.izip(collect, pages):
                if page is None:  # request failed; try again on a later run
                    continue
                abc = find_best_abc(page.content, _id)
                if abc is None:
                    continue
                print "Parsing request for id: ", _id, abc
                yield (_id, urljoin("https://thesession.org/", abc))
            collect.clear()
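# Note: grequests.map() fires the batched requests concurrently via gevent and
# returns the responses in the same order they were submitted, which is why the
# IT.izip(collect, pages) pairing above matches each tune id to its page.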
def main():
    """
    Perform the scrape on thesession.org
    """
    if not os.path.isdir("data"):
        os.makedirs("data")  # all tunes are written into data/
    abc_list = get_abc_files()
    for abc_files in chunk(abc_list):
        print "Downloading abc files: ", len(abc_files)
        data = grequests.map(grequests.get(u[1]) for u in abc_files)
        for (_id, _), abc in IT.izip(abc_files, data):
            if abc is None:  # download failed; skip and retry on the next run
                continue
            print "Writing: ", _id
            with open("data/{:08d}.abc".format(_id), "wb+") as fh:
                fh.write(abc.content)


if __name__ == "__main__":
    main()
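To run the scraper (a sketch, assuming Python 2.7 and the third-party grequests package): install the dependency with `pip install grequests`, then run `python2.7 thesession.py`. Downloaded tunes accumulate in `data/` as zero-padded `.abc` files, and because existing files are skipped, re-running the script resumes where it left off.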