@mynameisfiber
Last active March 25, 2016 10:23
thesession.org ABC file scraper
#!/usr/bin/env python2.7
"""
Scrape thesession.org for all the yummy ABC files
"""
import os
import itertools as IT
from collections import deque
from operator import itemgetter
from urlparse import urljoin
import grequests

CHUNK_SIZE = 5


def chunk(_iter, chunk_size=CHUNK_SIZE):
    """
    Chunk the given iterable into `chunk_size` lists to make batching
    easier.
    """
    buf = []
    for item in _iter:
        buf.append(item)
        if len(buf) == chunk_size:
            yield buf
            buf = []
    if buf:
        yield buf
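
# A doctest-style sketch of the chunking behavior (hypothetical values):
# full chunks are yielded as they fill up, and the final partial chunk,
# if any, is yielded last.
#
#   >>> list(chunk(iter(range(7)), 3))
#   [[0, 1, 2], [3, 4, 5], [6]]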


def find_best_abc(content, _id):
    """
    Find the best ABC file referenced in the given thesession.org tune's
    page. I was going to use a regex for this, but this explicit find
    method is actually substantially faster. It works on the premise that
    the best ABC file will be closer to the bottom of the page.
    """
    start = content.rfind("/tunes/{}/abc/".format(_id))
    if start == -1:
        return None
    end = content[start:].find('"')
    return content[start:start + end]
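
# A doctest-style sketch of the link extraction (hypothetical HTML): the
# rfind means the *last* ABC link on the page wins.
#
#   >>> html = '<a href="/tunes/7/abc/1">x</a> <a href="/tunes/7/abc/3">y</a>'
#   >>> find_best_abc(html, 7)
#   '/tunes/7/abc/3'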


def get_abc_files(chunk_size=CHUNK_SIZE):
    """
    Iterate through thesession.org tune pages sequentially and extract the
    best ABC file from each. We only look at pages that we haven't already
    scraped, and we batch our requests in chunks of `chunk_size`.
    """
    collect = deque()
    for i in IT.count(1):
        if not os.path.exists("data/{:08d}.abc".format(i)):
            print "Adding request for: ", i
            request = grequests.request("GET", "https://thesession.org/tunes/{}".format(i))
            collect.append((i, request))
        if len(collect) == chunk_size:
            print "Issuing requests"
            pages = grequests.map(IT.imap(itemgetter(1), collect))
            for (_id, _), page in IT.izip(collect, pages):
                if page is None:
                    # grequests.map yields None for requests that failed
                    continue
                abc = find_best_abc(page.content, _id)
                if abc is None:
                    # page had no ABC link (e.g. a missing/404 tune)
                    continue
                print "Parsing request for id: ", _id, abc
                yield (_id, urljoin("https://thesession.org/", abc))
            collect.clear()
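
# The batching pattern above, in isolation: a minimal, hypothetical sketch
# of issuing a chunk of requests concurrently with grequests (the example
# URL is an assumption, not part of the scraper):
#
#   reqs = [grequests.get("https://example.com/item/{}".format(i))
#           for i in range(5)]
#   for resp in grequests.map(reqs):
#       if resp is not None:  # map() yields None for failed requests
#           print resp.status_code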


def main():
    """
    Perform the scrape of thesession.org.
    """
    abc_list = get_abc_files()
    for abc_files in chunk(abc_list):
        print "Downloading abc files: ", len(abc_files)
        data = grequests.map(grequests.get(url) for _, url in abc_files)
        for (_id, _), abc in IT.izip(abc_files, data):
            if abc is None:
                # skip downloads that failed outright
                continue
            print "Writing: ", _id
            with open("data/{:08d}.abc".format(_id), "wb") as abc_file:
                abc_file.write(abc.content)


if __name__ == "__main__":
    main()
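
# Usage note: the script writes each tune into a local data/ directory
# (e.g. data/00000042.abc), so create it first with `mkdir data` or the
# open() calls will fail. The ID counter runs forever, so stop the scrape
# with Ctrl-C; already-downloaded IDs are skipped on the next run.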