-
-
Save loadletter/584c82974eab98e52bb84a84b2e226c6 to your computer and use it in GitHub Desktop.
nyaascraper2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.sqlite3 | |
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# | |
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz> | |
# Hacked together from: | |
# * http://code.activestate.com/recipes/576638-draft-for-an-sqlite3-based-dbm/ | |
# * http://code.activestate.com/recipes/526618/ | |
# | |
# Use the code in any way you like (at your own risk), it's public domain. | |
""" | |
A lightweight wrapper around Python's sqlite3 database, with a dict-like interface | |
and multi-thread access support:: | |
>>> mydict = SqliteDict('some.db', autocommit=True) # the mapping will be persisted to file `some.db` | |
>>> mydict['some_key'] = any_jsonable_object | |
>>> print mydict['some_key'] | |
>>> print len(mydict) # etc... all dict functions work | |
JSON (using the json module) is used internally to serialize the values. Keys are strings. | |
If you don't use autocommit (default is no autocommit for performance), then | |
don't forget to call `mydict.commit()` when done with a transaction. | |
""" | |
import sqlite3 | |
import os | |
import tempfile | |
import random | |
import logging | |
from json import dumps, loads | |
from UserDict import DictMixin | |
from Queue import Queue | |
from threading import Thread | |
logger = logging.getLogger('sqlitedict') | |
def open(*args, **kwargs):
    """Module-level factory mirroring the builtin `open`; see SqliteDict."""
    return SqliteDict(*args, **kwargs)
def encode(obj):
    """Serialize `obj` to a JSON string, the text format stored in SQLite."""
    serialized = dumps(obj)
    return serialized
def decode(obj):
    """Deserialize a JSON string previously stored in SQLite back to an object."""
    deserialized = loads(obj)
    return deserialized
class SqliteDict(object, DictMixin):
    """Dict-like interface over a single table of an sqlite3 database.

    Keys are strings; values are stored JSON-serialized. All statements are
    routed through a single SqliteMultithread worker, which is what provides
    the multi-thread access support.
    NOTE(review): inherits from both `object` and the classic-class DictMixin
    (Python 2 idiom); DictMixin fills in the remaining mapping methods from
    the primitives defined below.
    """
    def __init__(self, filename=None, tablename='unnamed', flag='c',
                 autocommit=False, journal_mode="DELETE"):
        """
        Initialize a thread-safe sqlite-backed dictionary. The dictionary will
        be a table `tablename` in database file `filename`. A single file (=database)
        may contain multiple tables.

        If no `filename` is given, a random file in temp will be used (and deleted
        from temp once the dict is closed/deleted).

        If you enable `autocommit`, changes will be committed after each operation
        (more inefficient but safer). Otherwise, changes are committed on `self.commit()`,
        `self.clear()` and `self.close()`.

        Set `journal_mode` to 'OFF' if you're experiencing sqlite I/O problems
        or if you need performance and don't care about crash-consistency.

        The `flag` parameter:
          'c': default mode, open for read/write, creating the db/table if necessary.
          'w': open for r/w, but drop `tablename` contents first (start with empty table)
          'n': create a new database (erasing any existing tables, not just `tablename`!).
        """
        self.in_temp = filename is None
        if self.in_temp:
            # Random hex suffix avoids collisions between concurrent temp dicts.
            randpart = hex(random.randint(0, 0xffffff))[2:]
            filename = os.path.join(tempfile.gettempdir(), 'sqldict' + randpart)
        if flag == 'n':
            if os.path.exists(filename):
                os.remove(filename)
        self.filename = filename
        self.tablename = tablename
        logger.info("opening Sqlite table %r in %s" % (tablename, filename))
        # NOTE(review): the table name is string-interpolated into every SQL
        # statement in this class -- safe only for trusted `tablename` values.
        MAKE_TABLE = 'CREATE TABLE IF NOT EXISTS %s (key TEXT PRIMARY KEY, value TEXT)' % self.tablename
        self.conn = SqliteMultithread(filename, autocommit=autocommit, journal_mode=journal_mode)
        self.conn.execute(MAKE_TABLE)
        self.conn.commit()
        if flag == 'w':
            self.clear()

    def __enter__(self):
        # Context-manager support: `with SqliteDict(...) as d:` closes on exit.
        return self

    def __exit__(self, *exc_info):
        self.close()

    def __str__(self):
        # return "SqliteDict(%i items in %s)" % (len(self), self.conn.filename)
        return "SqliteDict(%s)" % (self.conn.filename)

    def __len__(self):
        # `select count (*)` is super slow in sqlite (does a linear scan!!)
        # As a result, len() is very slow too once the table size grows beyond trivial.
        # We could keep the total count of rows ourselves, by means of triggers,
        # but that seems too complicated and would slow down normal operation
        # (insert/delete etc).
        GET_LEN = 'SELECT COUNT(*) FROM %s' % self.tablename
        rows = self.conn.select_one(GET_LEN)[0]
        return rows if rows is not None else 0

    def __bool__(self):
        # MAX(ROWID) avoids the linear scan of COUNT(*).
        # NOTE(review): on an empty table this SELECT still yields one row,
        # `(None,)`, so select_one() is never None and this would always return
        # True. Under Python 2 truth-testing uses __nonzero__/__len__ instead,
        # so this method is effectively dead code here -- verify before porting.
        GET_LEN = 'SELECT MAX(ROWID) FROM %s' % self.tablename
        return self.conn.select_one(GET_LEN) is not None

    def iterkeys(self):
        """Yield all keys, in table rowid (insertion) order."""
        GET_KEYS = 'SELECT key FROM %s ORDER BY rowid' % self.tablename
        for key in self.conn.select(GET_KEYS):
            yield key[0]

    def itervalues(self):
        """Yield all values (JSON-decoded), in rowid order."""
        GET_VALUES = 'SELECT value FROM %s ORDER BY rowid' % self.tablename
        for value in self.conn.select(GET_VALUES):
            yield decode(value[0])

    def iteritems(self):
        """Yield (key, value) pairs, values JSON-decoded, in rowid order."""
        GET_ITEMS = 'SELECT key, value FROM %s ORDER BY rowid' % self.tablename
        for key, value in self.conn.select(GET_ITEMS):
            yield key, decode(value)

    def __contains__(self, key):
        HAS_ITEM = 'SELECT 1 FROM %s WHERE key = ?' % self.tablename
        return self.conn.select_one(HAS_ITEM, (key,)) is not None

    def __getitem__(self, key):
        GET_ITEM = 'SELECT value FROM %s WHERE key = ?' % self.tablename
        item = self.conn.select_one(GET_ITEM, (key,))
        if item is None:
            raise KeyError(key)
        return decode(item[0])

    def __setitem__(self, key, value):
        # REPLACE = insert-or-overwrite, so assignment is a single statement.
        ADD_ITEM = 'REPLACE INTO %s (key, value) VALUES (?,?)' % self.tablename
        self.conn.execute(ADD_ITEM, (key, encode(value)))

    def __delitem__(self, key):
        if key not in self:
            raise KeyError(key)
        DEL_ITEM = 'DELETE FROM %s WHERE key = ?' % self.tablename
        self.conn.execute(DEL_ITEM, (key,))

    def update(self, items=(), **kwds):
        """Bulk-insert from a dict (or pre-encoded pairs) and/or keyword args."""
        try:
            items = [(k, encode(v)) for k, v in items.iteritems()]
        except AttributeError:
            # NOTE(review): non-dict iterables of (k, v) pairs fall through here
            # *without* encode() being applied to the values -- callers passing a
            # plain list must pre-encode; verify this is intended.
            pass
        UPDATE_ITEMS = 'REPLACE INTO %s (key, value) VALUES (?, ?)' % self.tablename
        self.conn.executemany(UPDATE_ITEMS, items)
        if kwds:
            # Keyword arguments go through the dict branch above on recursion.
            self.update(kwds)

    def keys(self):
        return list(self.iterkeys())

    def values(self):
        return list(self.itervalues())

    def items(self):
        return list(self.iteritems())

    def __iter__(self):
        return self.iterkeys()

    def clear(self):
        CLEAR_ALL = 'DELETE FROM %s;' % self.tablename  # avoid VACUUM, as it gives "OperationalError: database schema has changed"
        self.conn.commit()
        self.conn.execute(CLEAR_ALL)
        self.conn.commit()

    def commit(self):
        """Persist all pending changes to disk (no-op if already closed)."""
        if self.conn is not None:
            self.conn.commit()
    sync = commit  # dbm-style alias

    def close(self):
        """Commit (if autocommit), shut down the worker thread, remove temp file."""
        logger.debug("closing %s" % self)
        if self.conn is not None:
            if self.conn.autocommit:
                self.conn.commit()
            self.conn.close()
        if self.in_temp:
            try:
                os.remove(self.filename)
            except:
                pass

    def terminate(self):
        """Delete the underlying database file. Use with care."""
        self.close()
        logger.info("deleting %s" % self.filename)
        try:
            os.remove(self.filename)
        except IOError, e:
            logger.warning("failed to delete %s: %s" % (self.filename, e))

    def __del__(self):
        # like close(), but assume globals are gone by now (such as the logger)
        try:
            if self.conn is not None:
                if self.conn.autocommit:
                    self.conn.commit()
                self.conn.close()
                self.conn = None
            if self.in_temp:
                os.remove(self.filename)
        except:
            pass
#endclass SqliteDict
class SqliteMultithread(Thread):
    """
    Wrap sqlite connection in a way that allows concurrent requests from multiple threads.

    This is done by internally queueing the requests and processing them sequentially
    in a separate thread (in the same order they arrived).
    """
    def __init__(self, filename, autocommit, journal_mode):
        super(SqliteMultithread, self).__init__()
        self.filename = filename
        self.autocommit = autocommit
        self.journal_mode = journal_mode
        self.reqs = Queue()  # use request queue of unlimited size
        self.setDaemon(True)  # python2.5-compatible
        self.start()

    def run(self):
        """Worker loop: owns the sqlite connection; serves requests from self.reqs.

        The connection object never leaves this thread; other threads interact
        with it only by queueing (sql, args, result_queue) tuples. The strings
        '--close--' and '--commit--' are in-band control messages.
        """
        if self.autocommit:
            # isolation_level=None puts sqlite3 into autocommit mode.
            conn = sqlite3.connect(self.filename, isolation_level=None, check_same_thread=False)
        else:
            conn = sqlite3.connect(self.filename, check_same_thread=False)
        conn.execute('PRAGMA journal_mode = %s' % self.journal_mode)
        conn.text_factory = str
        cursor = conn.cursor()
        cursor.execute('PRAGMA synchronous=NORMAL')
        while True:
            req, arg, res = self.reqs.get()
            if req == '--close--':
                break
            elif req == '--commit--':
                conn.commit()
            else:
                cursor.execute(req, arg)
                if res:
                    # Stream result rows back to the caller's queue, then a sentinel.
                    for rec in cursor:
                        res.put(rec)
                    res.put('--no more--')
                if self.autocommit:
                    conn.commit()
        conn.close()

    def execute(self, req, arg=None, res=None):
        """
        `execute` calls are non-blocking: just queue up the request and return immediately.
        """
        self.reqs.put((req, arg or tuple(), res))

    def executemany(self, req, items):
        # Each parameter tuple becomes its own queued request.
        for item in items:
            self.execute(req, item)

    def select(self, req, arg=None):
        """
        Unlike sqlite's native select, this select doesn't handle iteration efficiently.

        The result of `select` starts filling up with values as soon as the
        request is dequeued, and although you can iterate over the result normally
        (`for res in self.select(): ...`), the entire result will be in memory.
        """
        res = Queue()  # results of the select will appear as items in this queue
        self.execute(req, arg, res)
        while True:
            rec = res.get()
            if rec == '--no more--':
                break
            yield rec

    def select_one(self, req, arg=None):
        """Return only the first row of the SELECT, or None if there are no matching rows."""
        try:
            return iter(self.select(req, arg)).next()
        except StopIteration:
            return None

    def commit(self):
        self.execute('--commit--')

    def close(self):
        self.execute('--close--')
#endclass SqliteMultithread
# running sqlitedict.py as script will perform a simple unit test | |
if __name__ in '__main___': | |
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(module)s:%(lineno)d : %(funcName)s(%(threadName)s) : %(message)s') | |
logging.root.setLevel(level=logging.INFO) | |
for d in SqliteDict(), SqliteDict('example', flag='n'): | |
assert list(d) == [] | |
assert len(d) == 0 | |
assert not d | |
d['abc'] = 'rsvp' * 100 | |
assert d['abc'] == 'rsvp' * 100 | |
assert len(d) == 1 | |
d['abc'] = 'lmno' | |
assert d['abc'] == 'lmno' | |
assert len(d) == 1 | |
del d['abc'] | |
assert not d | |
assert len(d) == 0 | |
d['abc'] = 'lmno' | |
d['xyz'] = 'pdq' | |
assert len(d) == 2 | |
assert list(d.iteritems()) == [('abc', 'lmno'), ('xyz', 'pdq')] | |
assert d.items() == [('abc', 'lmno'), ('xyz', 'pdq')] | |
assert d.values() == ['lmno', 'pdq'] | |
assert d.keys() == ['abc', 'xyz'] | |
assert list(d) == ['abc', 'xyz'] | |
d.update(p='x', q='y', r='z') | |
assert len(d) == 5 | |
assert d.items() == [('abc', 'lmno'), ('xyz', 'pdq'), ('q', 'y'), ('p', 'x'), ('r', 'z')] | |
del d['abc'] | |
try: | |
error = d['abc'] | |
except KeyError: | |
pass | |
else: | |
assert False | |
try: | |
del d['abc'] | |
except KeyError: | |
pass | |
else: | |
assert False | |
assert list(d) == ['xyz', 'q', 'p', 'r'] | |
assert d | |
d.clear() | |
assert not d | |
assert list(d) == [] | |
d.update(p='x', q='y', r='z') | |
assert list(d) == ['q', 'p', 'r'] | |
d.clear() | |
assert not d | |
d.close() | |
print 'all tests passed :-)' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, json, re, os, calendar, time, sqlite3, zlib | |
import requests | |
NYAA_URL = "https://www.nyaa.se" | |
COMMIT_CHUNK = 200 | |
FILENAME_REGEX=re.compile('^inline; filename="(.+)"$') | |
#CREATE TABLE IF NOT EXISTS torrents (tid INTEGER PRIMARY KEY, mtime INTEGER, filename TEXT, alt INTEGER, data BLOB) | |
def time_http2unix(http_time_string):
    """Convert an RFC-1123 HTTP date string (always GMT) to a unix timestamp."""
    parsed = time.strptime(http_time_string, '%a, %d %b %Y %H:%M:%S GMT')
    return calendar.timegm(parsed)
class TorrentStore:
    """Persists downloaded .torrent payloads, zlib-compressed, in sqlite."""

    def __init__(self, destdb):
        self.destdb = sqlite3.connect(destdb)
        self.destdb.execute('PRAGMA journal_mode=TRUNCATE')

    def add(self, tid, filename, mtime, data):
        """Store one torrent; an existing row with the same tid is kept as-is."""
        blob = sqlite3.Binary(zlib.compress(data, 9))
        row = (tid, time_http2unix(mtime), filename.decode('utf-8'), blob)
        self.destdb.execute(
            'INSERT OR IGNORE INTO torrents (tid, mtime, filename, data) VALUES (?,?,?,?)',
            row)

    def alt(self, tid, alt):
        """Record that `tid` points at an alternative torrent id instead of data."""
        self.destdb.execute('INSERT OR IGNORE INTO torrents (tid, alt) VALUES (?,?)',
                            (tid, alt))

    def commit(self):
        self.destdb.commit()

    def exists(self, tid):
        """Return True when a row for `tid` is already present."""
        found = self.destdb.execute('SELECT 1 FROM torrents WHERE tid = ?',
                                    (tid,)).fetchone()
        return found is not None

    def close(self):
        self.destdb.close()
class NyaaScraper:
    """Downloads the .torrent file for every tid listed in the view database."""
    def __init__(self, destdb, viewdb):
        self.store = TorrentStore(destdb)       # destination for .torrent blobs
        self.viewdb = sqlite3.connect(viewdb)   # source of known torrent ids
        self.session = requests.Session()

    def crawl(self):
        """Walk all tids from the view db, committing every COMMIT_CHUNK downloads."""
        cur = self.viewdb.execute('SELECT tid FROM nyaa')
        i = 0  # downloads since last commit
        for row in cur:
            if self.downloader(row[0]):
                i += 1
            if i >= COMMIT_CHUNK:
                print "---COMMIT---"
                self.store.commit()
                i = 0
        self.store.commit()
        self.store.close()

    def downloader(self, tid):
        """Fetch one .torrent.

        Returns False when the tid is already stored (counts as no work),
        True otherwise -- whether or not the download actually succeeded.
        """
        if self.store.exists(tid):
            return False
        try:
            req = self.session.get("%s/?page=download&tid=%i" % (NYAA_URL, tid), timeout=120, allow_redirects=False)
            if req.status_code == 200:
                try:
                    filename = re.findall(FILENAME_REGEX, req.headers['content-disposition'])[0]
                except KeyError:
                    # No content-disposition header: presumably the site served an
                    # HTML page whose 'refresh' header points at an alternative id.
                    alt = int(req.headers['refresh'].split('=')[-1])
                    self.store.alt(tid, alt)
                    print "found-alt:", tid
                else:
                    self.store.add(tid, filename, req.headers['Last-Modified'], req.content)
                    print "found:", tid
            elif req.status_code == 404 or req.status_code in [302, 303]:
                # 302/303 treated like 404 -- presumably redirects for unknown ids.
                print "torrent/%i:NotFound" % tid
            else:
                print "torrent/%i: %i" % (tid, req.status_code)
        except Exception, e:
            # Best-effort: one failing torrent must not stop the crawl.
            print "torrent/%i:Exception %s" % (tid, repr(e))
        return True
if __name__ == "__main__":
    # usage: script torrentdb viewdb -- the view db must already exist;
    # the torrent db is created/opened by TorrentStore.
    if len(sys.argv) == 3 and os.path.isfile(sys.argv[2]):
        s = NyaaScraper(sys.argv[1], sys.argv[2])
        s.crawl()
    else:
        print "usage: ", sys.argv[0], "torrentdb viewdb"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#http://code.activestate.com/recipes/577187-python-thread-pool/ | |
from Queue import Queue | |
from threading import Thread | |
class Worker(Thread):
    """Thread executing tasks from a given tasks queue.

    Each queued item is a (func, args, kwargs) tuple. The worker is a daemon
    thread started on construction and runs until it dequeues a non-tuple
    item (e.g. None), which acts as a poison pill.
    """
    def __init__(self, tasks):
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True  # don't block interpreter shutdown
        self.start()

    def run(self):
        while True:
            try:
                func, args, kargs = self.tasks.get()
            except TypeError:
                # Non-tuple item: poison pill, stop this worker.
                break
            # FIX(review): the original called func unguarded, so a task that
            # raised killed the worker thread *and* skipped task_done(), which
            # deadlocks ThreadPool.wait_completion() (Queue.join).
            try:
                func(*args, **kargs)
            except Exception as e:
                # A failing task must not take the worker down with it.
                print(e)
            finally:
                # Always account for the dequeued task so Queue.join() can return.
                self.tasks.task_done()
class ThreadPool:
    """Pool of threads consuming tasks from a queue."""

    def __init__(self, num_threads):
        # Bounded queue: once num_threads tasks are pending, add_task blocks,
        # throttling the producer.
        self.tasks = Queue(num_threads)
        for _ in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Queue one (func, args, kwargs) task for the workers."""
        task = (func, args, kargs)
        self.tasks.put(task)

    def wait_completion(self):
        """Block until every queued task has been processed."""
        self.tasks.join()
if __name__ == '__main__':
    # Demo/smoke test: run 100 random sleeps through a pool of 20 workers.
    from random import randrange
    delays = [randrange(1, 10) for i in range(100)]
    from time import sleep
    def wait_delay(d):
        # Sample task: just sleep for d seconds.
        print 'sleeping for (%d)sec' % d
        sleep(d)
    # 1) Init a Thread pool with the desired number of threads
    pool = ThreadPool(20)
    for i, d in enumerate(delays):
        # print the percentage of tasks placed in the queue
        print '%.2f%c' % ((float(i)/float(len(delays)))*100.0,'%')
        # 2) Add the task to the queue
        pool.add_task(wait_delay, d)
    # 3) Wait for completion
    pool.wait_completion()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, json, re, zlib, time, calendar | |
import requests | |
from BeautifulSoup import BeautifulSoup | |
from sqlitedict import SqliteMultithread | |
from tpool import ThreadPool | |
NYAA_URL = "http://www.nyaa.se" | |
SKIP_EXISTING = True | |
NUM_THREADS = 6 | |
NUM_CHUNK = 60 | |
def find_tag_url(parent, propertyname):
    """Find the table row labelled `propertyname` and return (text, link href)
    of the cell that follows the label; returns None when no row matches."""
    for label in parent.findAll('td', {'class': 'tname'}):
        if label.text != propertyname:
            continue
        cell = label.nextSibling
        return (cell.text, cell.find('a')['href'])
def find_tag(parent, propertyname, classname=''):
    """Find the table row labelled `propertyname` and return the text of the
    cell that follows the label; returns None when no row matches.

    When `classname` is given, assert the value cell carries that class."""
    for label in parent.findAll('td', {'class': 'tname'}):
        if label.text != propertyname:
            continue
        cell = label.nextSibling
        if classname:
            assert cell['class'] == classname
        return cell.text
def time_nyaa2unix(time_string):
    """Convert a nyaa page timestamp ('YYYY-MM-DD, HH:MM UTC') to unix time."""
    parsed = time.strptime(time_string, '%Y-%m-%d, %H:%M UTC')
    return calendar.timegm(parsed)
def time_comment2unix(time_string):
    """Convert a comment timestamp ('YYYY-MM-DD at HH:MM UTC') to unix time."""
    parsed = time.strptime(time_string, '%Y-%m-%d at %H:%M UTC')
    return calendar.timegm(parsed)
def parse_comments(soupcomment):
    """Extract all comments from a torrent view page (BeautifulSoup node).

    Returns the comment list serialized as zlib-compressed compact JSON,
    or '' when the page has no comments.
    """
    comments = []
    for post in soupcomment.findAll('div', {'class':'comment'}):
        # Staff posts use a different avatar container class.
        avatar = post.find('div', {'class':'avatar'})
        if avatar:
            avatar = avatar.find('img')['src']
        else:
            avatar = post.find('div', {'class':'avatarstaff'}).find('img')['src']
        if avatar == '//files.nyaa.se/Miku.png':
            # Site-wide default avatar -- store empty instead of the shared URL.
            avatar = ''
        #
        chead = post.find('div', {'class':'chead'})
        user = chead.find('a')
        username = user.text
        userid = int(user['href'].split('=')[-1])
        userstatus = user.findNextSibling('span').text
        # The post date is the second text node after the status span.
        date = user.findNextSibling('span').nextSibling.nextSibling
        date = time_comment2unix(date)
        #
        cid = post.find('div', {'class':'cnumber'})['id']
        cmain = unicode(post.find('div', {'class':'cmain'}))  # raw comment HTML (Python 2 `unicode`)
        #
        # Short keys keep the stored JSON compact.
        comments.append({'av': avatar,
                         'un': username,
                         'ui': userid,
                         'us': userstatus,
                         't': date,
                         'id': cid,
                         'c': cmain})
    if comments:
        # Python 2 only: json.dumps(encoding=...) was removed in Python 3.
        j = json.dumps(comments, separators=(',',':'), encoding="utf-8")
        return zlib.compress(j, 9)
    return ''
def page_parse(data):
    """Parse a nyaa torrent view page (HTML string) into a flat field tuple.

    Returns (name, quality, mascotte, category, date, uploader, uploaderid,
    seeders, leechers, tracker, info, infourl, downloads, stardom, filesize,
    description, comments) -- matching the column order of the INSERT in
    NyaaScraper.downloader.

    Raises NotFoundError when the page has no 'container' div (missing id).
    """
    soup = BeautifulSoup(data)
    b = soup.find('div', {'class': 'container'})
    if not b:
        raise NotFoundError
    c = b.parent
    assert c['class'].startswith('content')
    # Whatever follows 'content' in the class attribute is the quality flag
    # (e.g. 'aplus').
    quality = c['class'].replace('content', '').strip()
    mascotte = ''
    if 'aplus' in quality:
        mascotte = c.find('table', {'class': 'viewtable'}).nextSibling.find('img')['src']
    category = c.find('td', {'class': 'viewcategory'}).findAll('a')[-1]['href'].split('=')[-1]
    name = find_tag(c, 'Name:', 'viewtorrentname')
    date = find_tag(c, 'Date:', 'vtop')
    uploader, uploaderid = find_tag_url(c, 'Submitter:')
    seeders = find_tag(c, 'Seeders:', 'vtop')
    tracker = find_tag(c, 'Tracker')
    leechers = find_tag(c, 'Leechers:', 'vtop')
    try:
        info, infourl = find_tag_url(c, 'Information:')
    except TypeError:
        # find_tag_url returned None: no 'Information:' row on this page.
        info = ''
        infourl = ''
    downloads = find_tag(c, 'Downloads:', 'vtop')
    stardom = find_tag(c, 'Stardom:')
    filesize = find_tag(c, 'File size:', 'vtop')
    description = c.find('div', {'class': 'viewdescription'})
    #
    uploaderid = int(uploaderid.split('=')[-1])
    if tracker == 'http://open.nyaatorrents.info:6544/announce':
        # The site default tracker -- store empty instead of repeating it.
        tracker = ''
    if 'unknown' in seeders or 'unknown' in leechers:
        # -1 marks peer counts the site reports as unknown.
        seeders = -1
        leechers = -1
    else:
        seeders = int(seeders)
        leechers = int(leechers)
    downloads = int(downloads)
    stardom = int(re.findall('\d+', stardom)[0].strip())
    date = time_nyaa2unix(date)
    if description.text == "None":
        description = ''
    else:
        # Store the raw description HTML, zlib-compressed (Python 2 `unicode`).
        description = zlib.compress(unicode(c.find('div', {'class': 'viewdescription'})).encode('utf-8'), 9)
    #
    comments = parse_comments(c)
    return (name, quality, mascotte, category, date, uploader, uploaderid, seeders, leechers, tracker, info, infourl, downloads, stardom, filesize, description, comments)
class NotFoundError(Exception):
    """Raised by page_parse when the page lacks its content container
    (i.e. the requested torrent id does not exist)."""
class NyaaScraper:
    """Scrape nyaa view pages for ids [startid, endid] into a sqlite database,
    fetching NUM_CHUNK pages at a time on a pool of NUM_THREADS threads."""
    def __init__(self, dbpath, startid, endid):
        # SqliteMultithread serializes db access from the pool's worker threads.
        self.database = SqliteMultithread(dbpath, autocommit=False, journal_mode="DELETE")
        self.database.execute('CREATE TABLE IF NOT EXISTS nyaa (tid INTEGER PRIMARY KEY, name TEXT, quality TEXT, mascotte TEXT, category TEXT, date INTEGER, uploader TEXT, uploaderid INTEGER, seeders INTEGER, leechers INTEGER, tracker TEXT, info TEXT, infourl TEXT, downloads INTEGER, stardom INTEGER, filesize TEXT, description BLOB, comments BLOB)')
        self.database.commit()
        self.startid = startid
        self.endid = endid
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'view/0.1'})

    def exists(self, tid):
        # True when this torrent id has already been scraped.
        return self.database.select_one('SELECT 1 FROM nyaa WHERE tid = ?', (tid,)) is not None

    def crawl(self):
        """Queue ids chunk by chunk, draining the pool and committing between
        chunks so an interruption loses at most one chunk of work."""
        i = self.startid
        pool = ThreadPool(NUM_THREADS)
        while i <= self.endid:
            print "queue: %i ==> %i" % (i, i + NUM_CHUNK)
            for j in xrange(i, i + NUM_CHUNK):
                if j > self.endid:
                    break
                pool.add_task(self.downloader, j)
                i += 1
            pool.wait_completion()
            self.database.commit()
        self.database.commit()
        self.database.close()
        # Give the daemon db thread time to process the queued commit/close
        # requests before the process exits.
        time.sleep(2)

    def downloader(self, tid):
        """Fetch and parse one view page; runs on a ThreadPool worker thread."""
        if SKIP_EXISTING and self.exists(tid):
            return
        try:
            req = self.session.get("%s/?page=view&tid=%i" % (NYAA_URL, tid), timeout=120, headers={"accept-language": "en"})
            if req.status_code != 200:
                # NOTE(review): requests.Response has no `error` attribute; this
                # line looks like it would raise AttributeError on any non-200
                # response (then be swallowed by the catch-all below) -- verify.
                print "torrent/%i: %i %s" % (tid, req.status_code, req.error)
                return
            try:
                parsed = page_parse(req.text)
            except NotFoundError:
                print "torrent/%i:NotFound" % tid
            else:
                print "found:", tid
                self.database.execute('INSERT OR REPLACE INTO nyaa (tid, name, quality, mascotte, category, date, uploader, uploaderid, seeders, leechers, tracker, info, infourl, downloads, stardom, filesize, description, comments) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', (tid,) + parsed)
        except Exception, e:
            # Best-effort: one bad page must not kill the crawl.
            print "torrent/%i:Exception %s" % (tid, repr(e))
if __name__ == "__main__":
    # usage: script database startid endid -- scrape the inclusive id range.
    if len(sys.argv) == 4:
        s = NyaaScraper(sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))
        s.crawl()
    else:
        print "usage: ", sys.argv[0], "database startid endid"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment