loadletter/.gitignore Secret

## .gitignore
*.sqlite3
*.pyc

## sqlitedict.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>

# Hacked together from:
#  * http://code.activestate.com/recipes/576638-draft-for-an-sqlite3-based-dbm/
#  * http://code.activestate.com/recipes/526618/
#
# Use the code in any way you like (at your own risk), it's public domain.

"""
A lightweight wrapper around Python's sqlite3 database, with a dict-like interface
and multi-thread access support::

>>> mydict = SqliteDict('some.db', autocommit=True) # the mapping will be persisted to file `some.db`
>>> mydict['some_key'] = any_jsonable_object
>>> print mydict['some_key']
>>> print len(mydict) # etc... all dict functions work

JSON (using the json module) is used internally to serialize the values. Keys are strings.

If you don't use autocommit (default is no autocommit for performance), then
don't forget to call `mydict.commit()` when done with a transaction.

"""


import sqlite3
import os
import tempfile
import random
import logging
from json import dumps, loads
from UserDict import DictMixin
from Queue import Queue
from threading import Thread


logger = logging.getLogger('sqlitedict')


def open(*args, **kwargs):
    """See documentation of the SqlDict class."""
    return SqliteDict(*args, **kwargs)


def encode(obj):
    """Serialize an object using JSON to a text format accepted by SQLite."""
    return dumps(obj)


def decode(obj):
    """Deserialize objects retrieved from SQLite."""
    return loads(obj)


class SqliteDict(object, DictMixin):
    def __init__(self, filename=None, tablename='unnamed', flag='c',
                 autocommit=False, journal_mode="DELETE"):
        """
        Initialize a thread-safe sqlite-backed dictionary. The dictionary will
        be a table `tablename` in database file `filename`. A single file (=database)
        may contain multiple tables.

        If no `filename` is given, a random file in temp will be used (and deleted
        from temp once the dict is closed/deleted).

        If you enable `autocommit`, changes will be committed after each operation
        (more inefficient but safer). Otherwise, changes are committed on `self.commit()`,
        `self.clear()` and `self.close()`.

        Set `journal_mode` to 'OFF' if you're experiencing sqlite I/O problems
        or if you need performance and don't care about crash-consistency.

        The `flag` parameter:
          'c': default mode, open for read/write, creating the db/table if necessary.
          'w': open for r/w, but drop `tablename` contents first (start with empty table)
          'n': create a new database (erasing any existing tables, not just `tablename`!).

        """
        self.in_temp = filename is None
        if self.in_temp:
            randpart = hex(random.randint(0, 0xffffff))[2:]
            filename = os.path.join(tempfile.gettempdir(), 'sqldict' + randpart)
        if flag == 'n':
            if os.path.exists(filename):
                os.remove(filename)
        self.filename = filename
        self.tablename = tablename

        logger.info("opening Sqlite table %r in %s" % (tablename, filename))
        MAKE_TABLE = 'CREATE TABLE IF NOT EXISTS %s (key TEXT PRIMARY KEY, value TEXT)' % self.tablename
        self.conn = SqliteMultithread(filename, autocommit=autocommit, journal_mode=journal_mode)
        self.conn.execute(MAKE_TABLE)
        self.conn.commit()
        if flag == 'w':
            self.clear()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def __str__(self):
#        return "SqliteDict(%i items in %s)" % (len(self), self.conn.filename)
        return "SqliteDict(%s)" % (self.conn.filename)

    def __len__(self):
        # `select count (*)` is super slow in sqlite (does a linear scan!!)
        # As a result, len() is very slow too once the table size grows beyond trivial.
        # We could keep the total count of rows ourselves, by means of triggers,
        # but that seems too complicated and would slow down normal operation
        # (insert/delete etc).
        GET_LEN = 'SELECT COUNT(*) FROM %s' % self.tablename
        rows = self.conn.select_one(GET_LEN)[0]
        return rows if rows is not None else 0

    def __bool__(self):
        GET_LEN = 'SELECT MAX(ROWID) FROM %s' % self.tablename
        return self.conn.select_one(GET_LEN) is not None

    def iterkeys(self):
        GET_KEYS = 'SELECT key FROM %s ORDER BY rowid' % self.tablename
        for key in self.conn.select(GET_KEYS):
            yield key[0]

    def itervalues(self):
        GET_VALUES = 'SELECT value FROM %s ORDER BY rowid' % self.tablename
        for value in self.conn.select(GET_VALUES):
            yield decode(value[0])

    def iteritems(self):
        GET_ITEMS = 'SELECT key, value FROM %s ORDER BY rowid' % self.tablename
        for key, value in self.conn.select(GET_ITEMS):
            yield key, decode(value)

    def __contains__(self, key):
        HAS_ITEM = 'SELECT 1 FROM %s WHERE key = ?' % self.tablename
        return self.conn.select_one(HAS_ITEM, (key,)) is not None

    def __getitem__(self, key):
        GET_ITEM = 'SELECT value FROM %s WHERE key = ?' % self.tablename
        item = self.conn.select_one(GET_ITEM, (key,))
        if item is None:
            raise KeyError(key)

        return decode(item[0])

    def __setitem__(self, key, value):
        ADD_ITEM = 'REPLACE INTO %s (key, value) VALUES (?,?)' % self.tablename
        self.conn.execute(ADD_ITEM, (key, encode(value)))

    def __delitem__(self, key):
        if key not in self:
            raise KeyError(key)
        DEL_ITEM = 'DELETE FROM %s WHERE key = ?' % self.tablename
        self.conn.execute(DEL_ITEM, (key,))

    def update(self, items=(), **kwds):
        try:
            items = [(k, encode(v)) for k, v in items.iteritems()]
        except AttributeError:
            pass

        UPDATE_ITEMS = 'REPLACE INTO %s (key, value) VALUES (?, ?)' % self.tablename
        self.conn.executemany(UPDATE_ITEMS, items)
        if kwds:
            self.update(kwds)

    def keys(self):
        return list(self.iterkeys())

    def values(self):
        return list(self.itervalues())

    def items(self):
        return list(self.iteritems())

    def __iter__(self):
        return self.iterkeys()

    def clear(self):
        CLEAR_ALL = 'DELETE FROM %s;' % self.tablename # avoid VACUUM, as it gives "OperationalError: database schema has changed"
        self.conn.commit()
        self.conn.execute(CLEAR_ALL)
        self.conn.commit()

    def commit(self):
        if self.conn is not None:
            self.conn.commit()
    sync = commit

    def close(self):
        logger.debug("closing %s" % self)
        if self.conn is not None:
            if self.conn.autocommit:
                self.conn.commit()
            self.conn.close()
        if self.in_temp:
            try:
                os.remove(self.filename)
            except:
                pass

    def terminate(self):
        """Delete the underlying database file. Use with care."""
        self.close()
        logger.info("deleting %s" % self.filename)
        try:
            os.remove(self.filename)
        except IOError, e:
            logger.warning("failed to delete %s: %s" % (self.filename, e))

    def __del__(self):
        # like close(), but assume globals are gone by now (such as the logger)
        try:
            if self.conn is not None:
                if self.conn.autocommit:
                    self.conn.commit()
                self.conn.close()
                self.conn = None
            if self.in_temp:
                os.remove(self.filename)
        except:
            pass
#endclass SqliteDict


class SqliteMultithread(Thread):
    """
    Wrap sqlite connection in a way that allows concurrent requests from multiple threads.

    This is done by internally queueing the requests and processing them sequentially
    in a separate thread (in the same order they arrived).

    """
    def __init__(self, filename, autocommit, journal_mode):
        super(SqliteMultithread, self).__init__()
        self.filename = filename
        self.autocommit = autocommit
        self.journal_mode = journal_mode
        self.reqs = Queue() # use request queue of unlimited size
        self.setDaemon(True) # python2.5-compatible
        self.start()

    def run(self):
        if self.autocommit:
            conn = sqlite3.connect(self.filename, isolation_level=None, check_same_thread=False)
        else:
            conn = sqlite3.connect(self.filename, check_same_thread=False)
        conn.execute('PRAGMA journal_mode = %s' % self.journal_mode)
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute('PRAGMA synchronous=NORMAL')
        while True:
            req, arg, res = self.reqs.get()
            if req == '--close--':
                break
            elif req == '--commit--':
                conn.commit()
            else:
                cursor.execute(req, arg)
                if res:
                    for rec in cursor:
                        res.put(rec)
                    res.put('--no more--')
                if self.autocommit:
                    conn.commit()
        conn.close()

    def execute(self, req, arg=None, res=None):
        """
        `execute` calls are non-blocking: just queue up the request and return immediately.

        """
        self.reqs.put((req, arg or tuple(), res))

    def executemany(self, req, items):
        for item in items:
            self.execute(req, item)

    def select(self, req, arg=None):
        """
        Unlike sqlite's native select, this select doesn't handle iteration efficiently.

        The result of `select` starts filling up with values as soon as the
        request is dequeued, and although you can iterate over the result normally
        (`for res in self.select(): ...`), the entire result will be in memory.

        """
        res = Queue() # results of the select will appear as items in this queue
        self.execute(req, arg, res)
        while True:
            rec = res.get()
            if rec == '--no more--':
                break
            yield rec

    def select_one(self, req, arg=None):
        """Return only the first row of the SELECT, or None if there are no matching rows."""
        try:
            return iter(self.select(req, arg)).next()
        except StopIteration:
            return None

    def commit(self):
        self.execute('--commit--')

    def close(self):
        self.execute('--close--')
#endclass SqliteMultithread


# running sqlitedict.py as script will perform a simple unit test
if __name__ in '__main___':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s:%(lineno)d : %(funcName)s(%(threadName)s) : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    for d in SqliteDict(), SqliteDict('example', flag='n'):
        assert list(d) == []
        assert len(d) == 0
        assert not d
        d['abc'] = 'rsvp' * 100
        assert d['abc'] == 'rsvp' * 100
        assert len(d) == 1
        d['abc'] = 'lmno'
        assert d['abc'] == 'lmno'
        assert len(d) == 1
        del d['abc']
        assert not d
        assert len(d) == 0
        d['abc'] = 'lmno'
        d['xyz'] = 'pdq'
        assert len(d) == 2
        assert list(d.iteritems()) == [('abc', 'lmno'), ('xyz', 'pdq')]
        assert d.items() == [('abc', 'lmno'), ('xyz', 'pdq')]
        assert d.values() == ['lmno', 'pdq']
        assert d.keys() == ['abc', 'xyz']
        assert list(d) == ['abc', 'xyz']
        d.update(p='x', q='y', r='z')
        assert len(d) == 5
        assert d.items() == [('abc', 'lmno'), ('xyz', 'pdq'), ('q', 'y'), ('p', 'x'), ('r', 'z')]
        del d['abc']
        try:
            error = d['abc']
        except KeyError:
            pass
        else:
            assert False
        try:
            del d['abc']
        except KeyError:
            pass
        else:
            assert False
        assert list(d) == ['xyz', 'q', 'p', 'r']
        assert d
        d.clear()
        assert not d
        assert list(d) == []
        d.update(p='x', q='y', r='z')
        assert list(d) == ['q', 'p', 'r']
        d.clear()
        assert not d
        d.close()
    print 'all tests passed :-)'

## torrentscraper.py
import sys, json, re, os, calendar, time, sqlite3, zlib
import requests

NYAA_URL = "https://www.nyaa.se"
COMMIT_CHUNK = 200
FILENAME_REGEX=re.compile('^inline; filename="(.+)"$')

#CREATE TABLE IF NOT EXISTS torrents (tid INTEGER PRIMARY KEY, mtime INTEGER, filename TEXT, alt INTEGER, data BLOB)

def time_http2unix(http_time_string):
	time_tuple = time.strptime(http_time_string, '%a, %d %b %Y %H:%M:%S GMT')
	return calendar.timegm(time_tuple)

class TorrentStore:
	def __init__(self, destdb):
		self.destdb = sqlite3.connect(destdb)
		self.destdb.execute('PRAGMA journal_mode=TRUNCATE')

	def add(self, tid, filename, mtime, data):
		zdata = zlib.compress(data, 9)
		self.destdb.execute('INSERT OR IGNORE INTO torrents (tid, mtime, filename, data) VALUES (?,?,?,?)', (tid, time_http2unix(mtime), filename.decode('utf-8'), sqlite3.Binary(zdata)))

	def alt(self, tid, alt):
		self.destdb.execute('INSERT OR IGNORE INTO torrents (tid, alt) VALUES (?,?)', (tid, alt))

	def commit(self):
		self.destdb.commit()

	def exists(self, tid):
		cur = self.destdb.execute('SELECT 1 FROM torrents WHERE tid = ?', (tid,))
		return cur.fetchone() is not None

	def close(self):
		self.destdb.close()

class NyaaScraper:
	def __init__(self, destdb, viewdb):
		self.store = TorrentStore(destdb)
		self.viewdb = sqlite3.connect(viewdb)
		self.session = requests.Session()

	def crawl(self):
		cur = self.viewdb.execute('SELECT tid FROM nyaa')
		i = 0
		for row in cur:
			if self.downloader(row[0]):
				i += 1
			if i >= COMMIT_CHUNK:
				print "---COMMIT---"
				self.store.commit()
				i = 0
		self.store.commit()
		self.store.close()

	def downloader(self, tid):
		if self.store.exists(tid):
			return False
		try:
			req = self.session.get("%s/?page=download&tid=%i" % (NYAA_URL, tid), timeout=120, allow_redirects=False)
			if req.status_code == 200:
				try:
					filename = re.findall(FILENAME_REGEX, req.headers['content-disposition'])[0]
				except KeyError:
					alt = int(req.headers['refresh'].split('=')[-1])
					self.store.alt(tid, alt)
					print "found-alt:", tid
				else:
					self.store.add(tid, filename, req.headers['Last-Modified'], req.content)
					print "found:", tid
			elif req.status_code == 404 or req.status_code in [302, 303]:
				print "torrent/%i:NotFound" % tid
			else:
				print "torrent/%i: %i" % (tid, req.status_code)
		except Exception, e:
			print "torrent/%i:Exception %s" % (tid, repr(e))
		return True

if __name__ == "__main__":
	if len(sys.argv) == 3 and os.path.isfile(sys.argv[2]):
		s = NyaaScraper(sys.argv[1], sys.argv[2])
		s.crawl()
	else:
		print "usage: ", sys.argv[0], "torrentdb viewdb"

## tpool.py
#http://code.activestate.com/recipes/577187-python-thread-pool/
from Queue import Queue
from threading import Thread

class Worker(Thread):
	"""Thread executing tasks from a given tasks queue"""
	def __init__(self, tasks):
		Thread.__init__(self)
		self.tasks = tasks
		self.daemon = True
		self.start()

	def run(self):
		while True:
			try:
				func, args, kargs = self.tasks.get()
			except TypeError:
				break
			#try: func(*args, **kargs)
			#except Exception, e: print e
			func(*args, **kargs)
			self.tasks.task_done()

class ThreadPool:
	"""Pool of threads consuming tasks from a queue"""
	def __init__(self, num_threads):
		self.tasks = Queue(num_threads)
		for _ in range(num_threads): Worker(self.tasks)

	def add_task(self, func, *args, **kargs):
		"""Add a task to the queue"""
		self.tasks.put((func, args, kargs))

	def wait_completion(self):
		"""Wait for completion of all the tasks in the queue"""
		self.tasks.join()

if __name__ == '__main__':
	from random import randrange
	delays = [randrange(1, 10) for i in range(100)]

	from time import sleep
	def wait_delay(d):
		print 'sleeping for (%d)sec' % d
		sleep(d)

	# 1) Init a Thread pool with the desired number of threads
	pool = ThreadPool(20)

	for i, d in enumerate(delays):
		# print the percentage of tasks placed in the queue
		print '%.2f%c' % ((float(i)/float(len(delays)))*100.0,'%')

		# 2) Add the task to the queue
		pool.add_task(wait_delay, d)

	# 3) Wait for completion
	pool.wait_completion()

## viewscraper.py
import sys, json, re, zlib, time, calendar
import requests
from BeautifulSoup import BeautifulSoup
from sqlitedict import SqliteMultithread
from tpool import ThreadPool

NYAA_URL = "http://www.nyaa.se"
SKIP_EXISTING = True
NUM_THREADS = 6
NUM_CHUNK = 60

def find_tag_url(parent, propertyname):
	for label in parent.findAll('td', {'class': 'tname'}):
		if label.text == propertyname:
			prop = label.nextSibling
			return (prop.text, prop.find('a')['href'])

def find_tag(parent, propertyname, classname=''):
	for label in parent.findAll('td', {'class': 'tname'}):
		if label.text == propertyname:
			prop = label.nextSibling
			if classname:
				assert prop['class'] == classname
			return prop.text

def time_nyaa2unix(time_string):
	time_tuple = time.strptime(time_string, '%Y-%m-%d, %H:%M UTC')
	return calendar.timegm(time_tuple)

def time_comment2unix(time_string):
	time_tuple = time.strptime(time_string, '%Y-%m-%d at %H:%M UTC')
	return calendar.timegm(time_tuple)

def parse_comments(soupcomment):
	comments = []
	for post in soupcomment.findAll('div', {'class':'comment'}):
		avatar = post.find('div', {'class':'avatar'})
		if avatar:
			avatar = avatar.find('img')['src']
		else:
			avatar = post.find('div', {'class':'avatarstaff'}).find('img')['src']
		if avatar == '//files.nyaa.se/Miku.png':
			avatar = ''
		#
		chead = post.find('div', {'class':'chead'})
		user = chead.find('a')
		username = user.text
		userid = int(user['href'].split('=')[-1])
		userstatus = user.findNextSibling('span').text
		date = user.findNextSibling('span').nextSibling.nextSibling
		date = time_comment2unix(date)
		#
		cid = post.find('div', {'class':'cnumber'})['id']
		cmain = unicode(post.find('div', {'class':'cmain'}))
		#
		comments.append({'av': avatar,
						'un': username,
						'ui': userid,
						'us': userstatus,
						't': date,
						'id': cid,
						'c': cmain})
	if comments:
		j = json.dumps(comments, separators=(',',':'), encoding="utf-8")
		return zlib.compress(j, 9)
	return ''

def page_parse(data):
	soup = BeautifulSoup(data)
	b = soup.find('div', {'class': 'container'})
	if not b:
		raise NotFoundError
	c = b.parent
	assert c['class'].startswith('content')
	quality = c['class'].replace('content', '').strip()
	mascotte = ''
	if 'aplus' in quality:
		mascotte = c.find('table', {'class': 'viewtable'}).nextSibling.find('img')['src']
	category = c.find('td', {'class': 'viewcategory'}).findAll('a')[-1]['href'].split('=')[-1]
	name = find_tag(c, 'Name:', 'viewtorrentname')
	date = find_tag(c, 'Date:', 'vtop')
	uploader, uploaderid = find_tag_url(c, 'Submitter:')
	seeders = find_tag(c, 'Seeders:', 'vtop')
	tracker = find_tag(c, 'Tracker')
	leechers = find_tag(c, 'Leechers:', 'vtop')
	try:
		info, infourl = find_tag_url(c, 'Information:')
	except TypeError:
		info = ''
		infourl = ''
	downloads = find_tag(c, 'Downloads:', 'vtop')
	stardom = find_tag(c, 'Stardom:')
	filesize = find_tag(c, 'File size:', 'vtop')
	description = c.find('div', {'class': 'viewdescription'})
	#
	uploaderid = int(uploaderid.split('=')[-1])
	if tracker == 'http://open.nyaatorrents.info:6544/announce':
		tracker = ''
	if 'unknown' in seeders or 'unknown' in leechers:
		seeders = -1
		leechers = -1
	else:
		seeders = int(seeders)
		leechers = int(leechers)
	downloads = int(downloads)
	stardom = int(re.findall('\d+', stardom)[0].strip())
	date = time_nyaa2unix(date)
	if description.text == "None":
		description = ''
	else:
		description = zlib.compress(unicode(c.find('div', {'class': 'viewdescription'})).encode('utf-8'), 9)
	#
	comments = parse_comments(c)

	return (name, quality, mascotte, category, date, uploader, uploaderid, seeders, leechers, tracker, info, infourl, downloads, stardom, filesize, description, comments)

class NotFoundError(Exception):
	pass

class NyaaScraper:
	def __init__(self, dbpath, startid, endid):
		self.database = SqliteMultithread(dbpath, autocommit=False, journal_mode="DELETE")
		self.database.execute('CREATE TABLE IF NOT EXISTS nyaa (tid INTEGER PRIMARY KEY, name TEXT, quality TEXT, mascotte TEXT, category TEXT, date INTEGER, uploader TEXT, uploaderid INTEGER, seeders INTEGER, leechers INTEGER, tracker TEXT, info TEXT, infourl TEXT, downloads INTEGER, stardom INTEGER, filesize TEXT, description BLOB, comments BLOB)')
		self.database.commit()
		self.startid = startid
		self.endid = endid
		self.session = requests.Session()
		self.session.headers.update({'User-Agent': 'view/0.1'})

	def exists(self, tid):
		return self.database.select_one('SELECT 1 FROM nyaa WHERE tid = ?', (tid,)) is not None

	def crawl(self):
		i = self.startid
		pool = ThreadPool(NUM_THREADS)
		while i <= self.endid:
			print "queue: %i ==> %i" % (i, i + NUM_CHUNK)
			for j in xrange(i, i + NUM_CHUNK):
				if j > self.endid:
					break
				pool.add_task(self.downloader, j)
				i += 1
			pool.wait_completion()
			self.database.commit()
		self.database.commit()
		self.database.close()
		time.sleep(2)

	def downloader(self, tid):
		if SKIP_EXISTING and self.exists(tid):
			return
		try:
			req = self.session.get("%s/?page=view&tid=%i" % (NYAA_URL, tid), timeout=120, headers={"accept-language": "en"})
			if req.status_code != 200:
				print "torrent/%i: %i %s" % (tid, req.status_code, req.error)
				return
			try:
				parsed = page_parse(req.text)
			except NotFoundError:
				print "torrent/%i:NotFound" % tid
			else:
				print "found:", tid
				self.database.execute('INSERT OR REPLACE INTO nyaa (tid, name, quality, mascotte, category, date, uploader, uploaderid, seeders, leechers, tracker, info, infourl, downloads, stardom, filesize, description, comments) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (tid,) + parsed)

		except Exception, e:
			print "torrent/%i:Exception %s" % (tid, repr(e))

if __name__ == "__main__":
	if len(sys.argv) == 4:
		s = NyaaScraper(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
		s.crawl()
	else:
		print "usage: ", sys.argv[0], "database startid endid"
	#!/usr/bin/env python
	# -- coding: utf-8 --
	#
	# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>

	# Hacked together from:
	# * http://code.activestate.com/recipes/576638-draft-for-an-sqlite3-based-dbm/
	# * http://code.activestate.com/recipes/526618/
	#
	# Use the code in any way you like (at your own risk), it's public domain.

	"""
	A lightweight wrapper around Python's sqlite3 database, with a dict-like interface
	and multi-thread access support::

	>>> mydict = SqliteDict('some.db', autocommit=True) # the mapping will be persisted to file `some.db`
	>>> mydict['some_key'] = any_jsonable_object
	>>> print mydict['some_key']
	>>> print len(mydict) # etc... all dict functions work

	JSON (using the json module) is used internally to serialize the values. Keys are strings.

	If you don't use autocommit (default is no autocommit for performance), then
	don't forget to call `mydict.commit()` when done with a transaction.

	"""


	import sqlite3
	import os
	import tempfile
	import random
	import logging
	from json import dumps, loads
	from UserDict import DictMixin
	from Queue import Queue
	from threading import Thread


	logger = logging.getLogger('sqlitedict')



	def open(args, *kwargs):
	"""See documentation of the SqlDict class."""
	return SqliteDict(args, *kwargs)


	def encode(obj):
	"""Serialize an object using JSON to a text format accepted by SQLite."""
	return dumps(obj)


	def decode(obj):
	"""Deserialize objects retrieved from SQLite."""
	return loads(obj)


	class SqliteDict(object, DictMixin):
	def __init__(self, filename=None, tablename='unnamed', flag='c',
	autocommit=False, journal_mode="DELETE"):
	"""
	Initialize a thread-safe sqlite-backed dictionary. The dictionary will
	be a table `tablename` in database file `filename`. A single file (=database)
	may contain multiple tables.

	If no `filename` is given, a random file in temp will be used (and deleted
	from temp once the dict is closed/deleted).

	If you enable `autocommit`, changes will be committed after each operation
	(more inefficient but safer). Otherwise, changes are committed on `self.commit()`,
	`self.clear()` and `self.close()`.

	Set `journal_mode` to 'OFF' if you're experiencing sqlite I/O problems
	or if you need performance and don't care about crash-consistency.

	The `flag` parameter:
	'c': default mode, open for read/write, creating the db/table if necessary.
	'w': open for r/w, but drop `tablename` contents first (start with empty table)
	'n': create a new database (erasing any existing tables, not just `tablename`!).

	"""
	self.in_temp = filename is None
	if self.in_temp:
	randpart = hex(random.randint(0, 0xffffff))[2:]
	filename = os.path.join(tempfile.gettempdir(), 'sqldict' + randpart)
	if flag == 'n':
	if os.path.exists(filename):
	os.remove(filename)
	self.filename = filename
	self.tablename = tablename

	logger.info("opening Sqlite table %r in %s" % (tablename, filename))
	MAKE_TABLE = 'CREATE TABLE IF NOT EXISTS %s (key TEXT PRIMARY KEY, value TEXT)' % self.tablename
	self.conn = SqliteMultithread(filename, autocommit=autocommit, journal_mode=journal_mode)
	self.conn.execute(MAKE_TABLE)
	self.conn.commit()
	if flag == 'w':
	self.clear()

	def __enter__(self):
	return self

	def __exit__(self, *exc_info):
	self.close()

	def __str__(self):
	# return "SqliteDict(%i items in %s)" % (len(self), self.conn.filename)
	return "SqliteDict(%s)" % (self.conn.filename)

	def __len__(self):
	# `select count (*)` is super slow in sqlite (does a linear scan!!)
	# As a result, len() is very slow too once the table size grows beyond trivial.
	# We could keep the total count of rows ourselves, by means of triggers,
	# but that seems too complicated and would slow down normal operation
	# (insert/delete etc).
	GET_LEN = 'SELECT COUNT(*) FROM %s' % self.tablename
	rows = self.conn.select_one(GET_LEN)[0]
	return rows if rows is not None else 0

	def __bool__(self):
	GET_LEN = 'SELECT MAX(ROWID) FROM %s' % self.tablename
	return self.conn.select_one(GET_LEN) is not None

	def iterkeys(self):
	GET_KEYS = 'SELECT key FROM %s ORDER BY rowid' % self.tablename
	for key in self.conn.select(GET_KEYS):
	yield key[0]

	def itervalues(self):
	GET_VALUES = 'SELECT value FROM %s ORDER BY rowid' % self.tablename
	for value in self.conn.select(GET_VALUES):
	yield decode(value[0])

	def iteritems(self):
	GET_ITEMS = 'SELECT key, value FROM %s ORDER BY rowid' % self.tablename
	for key, value in self.conn.select(GET_ITEMS):
	yield key, decode(value)

	def __contains__(self, key):
	HAS_ITEM = 'SELECT 1 FROM %s WHERE key = ?' % self.tablename
	return self.conn.select_one(HAS_ITEM, (key,)) is not None

	def __getitem__(self, key):
	GET_ITEM = 'SELECT value FROM %s WHERE key = ?' % self.tablename
	item = self.conn.select_one(GET_ITEM, (key,))
	if item is None:
	raise KeyError(key)

	return decode(item[0])

	def __setitem__(self, key, value):
	ADD_ITEM = 'REPLACE INTO %s (key, value) VALUES (?,?)' % self.tablename
	self.conn.execute(ADD_ITEM, (key, encode(value)))

	def __delitem__(self, key):
	if key not in self:
	raise KeyError(key)
	DEL_ITEM = 'DELETE FROM %s WHERE key = ?' % self.tablename
	self.conn.execute(DEL_ITEM, (key,))

	def update(self, items=(), **kwds):
	try:
	items = [(k, encode(v)) for k, v in items.iteritems()]
	except AttributeError:
	pass

	UPDATE_ITEMS = 'REPLACE INTO %s (key, value) VALUES (?, ?)' % self.tablename
	self.conn.executemany(UPDATE_ITEMS, items)
	if kwds:
	self.update(kwds)

	def keys(self):
	return list(self.iterkeys())

	def values(self):
	return list(self.itervalues())

	def items(self):
	return list(self.iteritems())

	def __iter__(self):
	return self.iterkeys()

	def clear(self):
	CLEAR_ALL = 'DELETE FROM %s;' % self.tablename # avoid VACUUM, as it gives "OperationalError: database schema has changed"
	self.conn.commit()
	self.conn.execute(CLEAR_ALL)
	self.conn.commit()

	def commit(self):
	if self.conn is not None:
	self.conn.commit()
	sync = commit

	def close(self):
	logger.debug("closing %s" % self)
	if self.conn is not None:
	if self.conn.autocommit:
	self.conn.commit()
	self.conn.close()
	if self.in_temp:
	try:
	os.remove(self.filename)
	except:
	pass

	def terminate(self):
	"""Delete the underlying database file. Use with care."""
	self.close()
	logger.info("deleting %s" % self.filename)
	try:
	os.remove(self.filename)
	except IOError, e:
	logger.warning("failed to delete %s: %s" % (self.filename, e))

	def __del__(self):
	# like close(), but assume globals are gone by now (such as the logger)
	try:
	if self.conn is not None:
	if self.conn.autocommit:
	self.conn.commit()
	self.conn.close()
	self.conn = None
	if self.in_temp:
	os.remove(self.filename)
	except:
	pass
	#endclass SqliteDict



	class SqliteMultithread(Thread):
	"""
	Wrap sqlite connection in a way that allows concurrent requests from multiple threads.

	This is done by internally queueing the requests and processing them sequentially
	in a separate thread (in the same order they arrived).

	"""
	def __init__(self, filename, autocommit, journal_mode):
	super(SqliteMultithread, self).__init__()
	self.filename = filename
	self.autocommit = autocommit
	self.journal_mode = journal_mode
	self.reqs = Queue() # use request queue of unlimited size
	self.setDaemon(True) # python2.5-compatible
	self.start()

	def run(self):
	if self.autocommit:
	conn = sqlite3.connect(self.filename, isolation_level=None, check_same_thread=False)
	else:
	conn = sqlite3.connect(self.filename, check_same_thread=False)
	conn.execute('PRAGMA journal_mode = %s' % self.journal_mode)
	conn.text_factory = str
	cursor = conn.cursor()
	cursor.execute('PRAGMA synchronous=NORMAL')
	while True:
	req, arg, res = self.reqs.get()
	if req == '--close--':
	break
	elif req == '--commit--':
	conn.commit()
	else:
	cursor.execute(req, arg)
	if res:
	for rec in cursor:
	res.put(rec)
	res.put('--no more--')
	if self.autocommit:
	conn.commit()
	conn.close()

	def execute(self, req, arg=None, res=None):
	"""
	`execute` calls are non-blocking: just queue up the request and return immediately.

	"""
	self.reqs.put((req, arg or tuple(), res))

	def executemany(self, req, items):
	for item in items:
	self.execute(req, item)

	def select(self, req, arg=None):
	"""
	Unlike sqlite's native select, this select doesn't handle iteration efficiently.

	The result of `select` starts filling up with values as soon as the
	request is dequeued, and although you can iterate over the result normally
	(`for res in self.select(): ...`), the entire result will be in memory.

	"""
	res = Queue() # results of the select will appear as items in this queue
	self.execute(req, arg, res)
	while True:
	rec = res.get()
	if rec == '--no more--':
	break
	yield rec

	def select_one(self, req, arg=None):
	"""Return only the first row of the SELECT, or None if there are no matching rows."""
	try:
	return iter(self.select(req, arg)).next()
	except StopIteration:
	return None

	def commit(self):
	self.execute('--commit--')

	def close(self):
	self.execute('--close--')
	#endclass SqliteMultithread


	# running sqlitedict.py as script will perform a simple unit test
	if __name__ in '__main___':
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s:%(lineno)d : %(funcName)s(%(threadName)s) : %(message)s')
	logging.root.setLevel(level=logging.INFO)
	for d in SqliteDict(), SqliteDict('example', flag='n'):
	assert list(d) == []
	assert len(d) == 0
	assert not d
	d['abc'] = 'rsvp' * 100
	assert d['abc'] == 'rsvp' * 100
	assert len(d) == 1
	d['abc'] = 'lmno'
	assert d['abc'] == 'lmno'
	assert len(d) == 1
	del d['abc']
	assert not d
	assert len(d) == 0
	d['abc'] = 'lmno'
	d['xyz'] = 'pdq'
	assert len(d) == 2
	assert list(d.iteritems()) == [('abc', 'lmno'), ('xyz', 'pdq')]
	assert d.items() == [('abc', 'lmno'), ('xyz', 'pdq')]
	assert d.values() == ['lmno', 'pdq']
	assert d.keys() == ['abc', 'xyz']
	assert list(d) == ['abc', 'xyz']
	d.update(p='x', q='y', r='z')
	assert len(d) == 5
	assert d.items() == [('abc', 'lmno'), ('xyz', 'pdq'), ('q', 'y'), ('p', 'x'), ('r', 'z')]
	del d['abc']
	try:
	error = d['abc']
	except KeyError:
	pass
	else:
	assert False
	try:
	del d['abc']
	except KeyError:
	pass
	else:
	assert False
	assert list(d) == ['xyz', 'q', 'p', 'r']
	assert d
	d.clear()
	assert not d
	assert list(d) == []
	d.update(p='x', q='y', r='z')
	assert list(d) == ['q', 'p', 'r']
	d.clear()
	assert not d
	d.close()
	print 'all tests passed :-)'
	import sys, json, re, os, calendar, time, sqlite3, zlib
	import requests

	NYAA_URL = "https://www.nyaa.se"
	COMMIT_CHUNK = 200
	FILENAME_REGEX=re.compile('^inline; filename="(.+)"$')

	#CREATE TABLE IF NOT EXISTS torrents (tid INTEGER PRIMARY KEY, mtime INTEGER, filename TEXT, alt INTEGER, data BLOB)

	def time_http2unix(http_time_string):
	time_tuple = time.strptime(http_time_string, '%a, %d %b %Y %H:%M:%S GMT')
	return calendar.timegm(time_tuple)

	class TorrentStore:
	def __init__(self, destdb):
	self.destdb = sqlite3.connect(destdb)
	self.destdb.execute('PRAGMA journal_mode=TRUNCATE')

	def add(self, tid, filename, mtime, data):
	zdata = zlib.compress(data, 9)
	self.destdb.execute('INSERT OR IGNORE INTO torrents (tid, mtime, filename, data) VALUES (?,?,?,?)', (tid, time_http2unix(mtime), filename.decode('utf-8'), sqlite3.Binary(zdata)))

	def alt(self, tid, alt):
	self.destdb.execute('INSERT OR IGNORE INTO torrents (tid, alt) VALUES (?,?)', (tid, alt))

	def commit(self):
	self.destdb.commit()

	def exists(self, tid):
	cur = self.destdb.execute('SELECT 1 FROM torrents WHERE tid = ?', (tid,))
	return cur.fetchone() is not None

	def close(self):
	self.destdb.close()

	class NyaaScraper:
	def __init__(self, destdb, viewdb):
	self.store = TorrentStore(destdb)
	self.viewdb = sqlite3.connect(viewdb)
	self.session = requests.Session()

	def crawl(self):
	cur = self.viewdb.execute('SELECT tid FROM nyaa')
	i = 0
	for row in cur:
	if self.downloader(row[0]):
	i += 1
	if i >= COMMIT_CHUNK:
	print "---COMMIT---"
	self.store.commit()
	i = 0
	self.store.commit()
	self.store.close()

	def downloader(self, tid):
	if self.store.exists(tid):
	return False
	try:
	req = self.session.get("%s/?page=download&tid=%i" % (NYAA_URL, tid), timeout=120, allow_redirects=False)
	if req.status_code == 200:
	try:
	filename = re.findall(FILENAME_REGEX, req.headers['content-disposition'])[0]
	except KeyError:
	alt = int(req.headers['refresh'].split('=')[-1])
	self.store.alt(tid, alt)
	print "found-alt:", tid
	else:
	self.store.add(tid, filename, req.headers['Last-Modified'], req.content)
	print "found:", tid
	elif req.status_code == 404 or req.status_code in [302, 303]:
	print "torrent/%i:NotFound" % tid
	else:
	print "torrent/%i: %i" % (tid, req.status_code)
	except Exception, e:
	print "torrent/%i:Exception %s" % (tid, repr(e))
	return True

	if __name__ == "__main__":
	if len(sys.argv) == 3 and os.path.isfile(sys.argv[2]):
	s = NyaaScraper(sys.argv[1], sys.argv[2])
	s.crawl()
	else:
	print "usage: ", sys.argv[0], "torrentdb viewdb"
	#http://code.activestate.com/recipes/577187-python-thread-pool/
	from Queue import Queue
	from threading import Thread

	class Worker(Thread):
	"""Thread executing tasks from a given tasks queue"""
	def __init__(self, tasks):
	Thread.__init__(self)
	self.tasks = tasks
	self.daemon = True
	self.start()

	def run(self):
	while True:
	try:
	func, args, kargs = self.tasks.get()
	except TypeError:
	break
	#try: func(args, *kargs)
	#except Exception, e: print e
	func(args, *kargs)
	self.tasks.task_done()

	class ThreadPool:
	"""Pool of threads consuming tasks from a queue"""
	def __init__(self, num_threads):
	self.tasks = Queue(num_threads)
	for _ in range(num_threads): Worker(self.tasks)

	def add_task(self, func, args, *kargs):
	"""Add a task to the queue"""
	self.tasks.put((func, args, kargs))

	def wait_completion(self):
	"""Wait for completion of all the tasks in the queue"""
	self.tasks.join()

	if __name__ == '__main__':
	from random import randrange
	delays = [randrange(1, 10) for i in range(100)]

	from time import sleep
	def wait_delay(d):
	print 'sleeping for (%d)sec' % d
	sleep(d)

	# 1) Init a Thread pool with the desired number of threads
	pool = ThreadPool(20)

	for i, d in enumerate(delays):
	# print the percentage of tasks placed in the queue
	print '%.2f%c' % ((float(i)/float(len(delays)))*100.0,'%')

	# 2) Add the task to the queue
	pool.add_task(wait_delay, d)

	# 3) Wait for completion
	pool.wait_completion()
	import sys, json, re, zlib, time, calendar
	import requests
	from BeautifulSoup import BeautifulSoup
	from sqlitedict import SqliteMultithread
	from tpool import ThreadPool

	NYAA_URL = "http://www.nyaa.se"
	SKIP_EXISTING = True
	NUM_THREADS = 6
	NUM_CHUNK = 60

	def find_tag_url(parent, propertyname):
	for label in parent.findAll('td', {'class': 'tname'}):
	if label.text == propertyname:
	prop = label.nextSibling
	return (prop.text, prop.find('a')['href'])

	def find_tag(parent, propertyname, classname=''):
	for label in parent.findAll('td', {'class': 'tname'}):
	if label.text == propertyname:
	prop = label.nextSibling
	if classname:
	assert prop['class'] == classname
	return prop.text

	def time_nyaa2unix(time_string):
	time_tuple = time.strptime(time_string, '%Y-%m-%d, %H:%M UTC')
	return calendar.timegm(time_tuple)

	def time_comment2unix(time_string):
	time_tuple = time.strptime(time_string, '%Y-%m-%d at %H:%M UTC')
	return calendar.timegm(time_tuple)

	def parse_comments(soupcomment):
	comments = []
	for post in soupcomment.findAll('div', {'class':'comment'}):
	avatar = post.find('div', {'class':'avatar'})
	if avatar:
	avatar = avatar.find('img')['src']
	else:
	avatar = post.find('div', {'class':'avatarstaff'}).find('img')['src']
	if avatar == '//files.nyaa.se/Miku.png':
	avatar = ''
	#
	chead = post.find('div', {'class':'chead'})
	user = chead.find('a')
	username = user.text
	userid = int(user['href'].split('=')[-1])
	userstatus = user.findNextSibling('span').text
	date = user.findNextSibling('span').nextSibling.nextSibling
	date = time_comment2unix(date)
	#
	cid = post.find('div', {'class':'cnumber'})['id']
	cmain = unicode(post.find('div', {'class':'cmain'}))
	#
	comments.append({'av': avatar,
	'un': username,
	'ui': userid,
	'us': userstatus,
	't': date,
	'id': cid,
	'c': cmain})
	if comments:
	j = json.dumps(comments, separators=(',',':'), encoding="utf-8")
	return zlib.compress(j, 9)
	return ''

	def page_parse(data):
	soup = BeautifulSoup(data)
	b = soup.find('div', {'class': 'container'})
	if not b:
	raise NotFoundError
	c = b.parent
	assert c['class'].startswith('content')
	quality = c['class'].replace('content', '').strip()
	mascotte = ''
	if 'aplus' in quality:
	mascotte = c.find('table', {'class': 'viewtable'}).nextSibling.find('img')['src']
	category = c.find('td', {'class': 'viewcategory'}).findAll('a')[-1]['href'].split('=')[-1]
	name = find_tag(c, 'Name:', 'viewtorrentname')
	date = find_tag(c, 'Date:', 'vtop')
	uploader, uploaderid = find_tag_url(c, 'Submitter:')
	seeders = find_tag(c, 'Seeders:', 'vtop')
	tracker = find_tag(c, 'Tracker')
	leechers = find_tag(c, 'Leechers:', 'vtop')
	try:
	info, infourl = find_tag_url(c, 'Information:')
	except TypeError:
	info = ''
	infourl = ''
	downloads = find_tag(c, 'Downloads:', 'vtop')
	stardom = find_tag(c, 'Stardom:')
	filesize = find_tag(c, 'File size:', 'vtop')
	description = c.find('div', {'class': 'viewdescription'})
	#
	uploaderid = int(uploaderid.split('=')[-1])
	if tracker == 'http://open.nyaatorrents.info:6544/announce':
	tracker = ''
	if 'unknown' in seeders or 'unknown' in leechers:
	seeders = -1
	leechers = -1
	else:
	seeders = int(seeders)
	leechers = int(leechers)
	downloads = int(downloads)
	stardom = int(re.findall('\d+', stardom)[0].strip())
	date = time_nyaa2unix(date)
	if description.text == "None":
	description = ''
	else:
	description = zlib.compress(unicode(c.find('div', {'class': 'viewdescription'})).encode('utf-8'), 9)
	#
	comments = parse_comments(c)

	return (name, quality, mascotte, category, date, uploader, uploaderid, seeders, leechers, tracker, info, infourl, downloads, stardom, filesize, description, comments)

	class NotFoundError(Exception):
	pass

	class NyaaScraper:
	def __init__(self, dbpath, startid, endid):
	self.database = SqliteMultithread(dbpath, autocommit=False, journal_mode="DELETE")
	self.database.execute('CREATE TABLE IF NOT EXISTS nyaa (tid INTEGER PRIMARY KEY, name TEXT, quality TEXT, mascotte TEXT, category TEXT, date INTEGER, uploader TEXT, uploaderid INTEGER, seeders INTEGER, leechers INTEGER, tracker TEXT, info TEXT, infourl TEXT, downloads INTEGER, stardom INTEGER, filesize TEXT, description BLOB, comments BLOB)')
	self.database.commit()
	self.startid = startid
	self.endid = endid
	self.session = requests.Session()
	self.session.headers.update({'User-Agent': 'view/0.1'})

	def exists(self, tid):
	return self.database.select_one('SELECT 1 FROM nyaa WHERE tid = ?', (tid,)) is not None

	def crawl(self):
	i = self.startid
	pool = ThreadPool(NUM_THREADS)
	while i <= self.endid:
	print "queue: %i ==> %i" % (i, i + NUM_CHUNK)
	for j in xrange(i, i + NUM_CHUNK):
	if j > self.endid:
	break
	pool.add_task(self.downloader, j)
	i += 1
	pool.wait_completion()
	self.database.commit()
	self.database.commit()
	self.database.close()
	time.sleep(2)

	def downloader(self, tid):
	if SKIP_EXISTING and self.exists(tid):
	return
	try:
	req = self.session.get("%s/?page=view&tid=%i" % (NYAA_URL, tid), timeout=120, headers={"accept-language": "en"})
	if req.status_code != 200:
	print "torrent/%i: %i %s" % (tid, req.status_code, req.error)
	return
	try:
	parsed = page_parse(req.text)
	except NotFoundError:
	print "torrent/%i:NotFound" % tid
	else:
	print "found:", tid
	self.database.execute('INSERT OR REPLACE INTO nyaa (tid, name, quality, mascotte, category, date, uploader, uploaderid, seeders, leechers, tracker, info, infourl, downloads, stardom, filesize, description, comments) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (tid,) + parsed)

	except Exception, e:
	print "torrent/%i:Exception %s" % (tid, repr(e))

	if __name__ == "__main__":
	if len(sys.argv) == 4:
	s = NyaaScraper(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
	s.crawl()
	else:
	print "usage: ", sys.argv[0], "database startid endid"