@kinow, forked from mynameisfiber/thesession.py, created March 25, 2016
thesession.org ABC file scraper
#!/usr/bin/env python2.7
"""
Scrape thesession.org for all the yummy ABC files
"""
import os
import itertools as IT
from collections import deque
from operator import itemgetter
from urlparse import urljoin
import grequests  # third-party; install with `pip install grequests`
CHUNK_SIZE = 5
def chunk(_iter, chunk_size=CHUNK_SIZE):
    """
    Chunks up the given iterator into `chunk_size` lists to make batching
    easier
    """
    buf = []
    for item in _iter:
        buf.append(item)
        if len(buf) == chunk_size:
            yield buf
            buf = []
    if buf:  # only yield the final partial batch if it is non-empty
        yield buf
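# An illustrative run of chunk() (not part of the original script): batching a
# seven-item iterator in threes yields two full batches plus the remainder.
#
#   >>> list(chunk(iter(range(7)), 3))
#   [[0, 1, 2], [3, 4, 5], [6]]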
def find_best_abc(content, _id):
    """
    Find the best ABC file referenced in the given thesession.org tune's page.
    I was going to use a regex for this, but this explicit find method is
    actually substantially faster. It works on the premise that the best ABC
    file will be closest to the bottom of the page.
    """
    start = content.rfind("/tunes/{}/abc/".format(_id))
    if start < 0:  # no ABC link found on this page
        return None
    end = content[start:].find('"')
    return content[start:start + end]
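# Illustration, assuming (hypothetically) that the page links each ABC setting
# as href="/tunes/<id>/abc/<n>":
#
#   content = '... href="/tunes/123/abc/1" ... href="/tunes/123/abc/2" ...'
#   find_best_abc(content, 123)  # -> '/tunes/123/abc/2', the last setting
#
# rfind() locates the final occurrence, so the link nearest the bottom of the
# page wins, per the docstring's premise.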
def get_abc_files(chunk_size=CHUNK_SIZE):
    """
    Iterate through thesession.org tune pages sequentially and extract the
    best ABC file from each. We only look at pages that we haven't already
    scraped, and we batch our requests in chunks
    """
    collect = deque()
    for i in IT.count(1):
        if not os.path.exists("data/{:08d}.abc".format(i)):
            print "Adding request for: ", i
            request = grequests.request("GET", "https://thesession.org/tunes/{}".format(i))
            collect.append((i, request))
        if len(collect) == chunk_size:
            print "Issuing requests"
            pages = grequests.map(IT.imap(itemgetter(1), collect))
            for (_id, _), page in IT.izip(collect, pages):
                if page is None:  # request failed; try again on a later run
                    continue
                abc = find_best_abc(page.content, _id)
                if abc is None:
                    continue
                print "Parsing request for id: ", _id, abc
                yield (_id, urljoin("https://thesession.org/", abc))
            collect.clear()
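# Note: grequests.map() fires the batched requests concurrently via gevent and
# returns the responses in the same order they were submitted, which is why the
# IT.izip(collect, pages) pairing above matches each tune id to its page.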
def main():
    """
    Perform the scrape on thesession.org
    """
    if not os.path.isdir("data"):
        os.makedirs("data")  # all tunes are written into data/
    abc_list = get_abc_files()
    for abc_files in chunk(abc_list):
        print "Downloading abc files: ", len(abc_files)
        data = grequests.map(grequests.get(u[1]) for u in abc_files)
        for (_id, _), abc in IT.izip(abc_files, data):
            if abc is None:  # download failed; skip and retry on the next run
                continue
            print "Writing: ", _id
            with open("data/{:08d}.abc".format(_id), "wb+") as fh:
                fh.write(abc.content)


if __name__ == "__main__":
    main()
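To run the scraper (a sketch, assuming Python 2.7 and the third-party grequests package): install the dependency with `pip install grequests`, then run `python2.7 thesession.py`. Downloaded tunes accumulate in `data/` as zero-padded `.abc` files, and because existing files are skipped, re-running the script resumes where it left off.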