Skip to content

Instantly share code, notes, and snippets.

@hannes
Created October 12, 2019 12:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hannes/2cf7f96f4150fc88d38d0b6022f215d6 to your computer and use it in GitHub Desktop.
Save hannes/2cf7f96f4150fc88d38d0b6022f215d6 to your computer and use it in GitHub Desktop.
Create and query a local search engine for PirateBay torrents
#!/usr/bin/env python3
import sqlite3 # yay SQLite!
import base64
import os
import sys
if sys.version_info<(3,6,0):
print("You need python 3.6 or later to run this script.")
exit(-1)
if (len(sys.argv) < 2):
print("Usage: torrent.py your search term")
print("Boolean operators (AND OR NOT) should work.")
exit(-1)
q = ' '.join(sys.argv[1:]).replace('\'', ' ')
fname = "torrent_dump_full.csv.gz"
if not os.path.isfile(fname):
print("Missing file 'torrent_dump_full.csv.gz'.")
print("Download from https://thepiratebay.org/static/dump/csv/torrent_dump_full.csv.gz")
exit(-1)
mtime = int(os.path.getmtime(fname))
con = sqlite3.connect('torrents.db')
cur = con.cursor()
# check if the db has fts5
available_pragmas = cur.execute('PRAGMA compile_options').fetchall()
if not ('ENABLE_FTS5',) in available_pragmas:
raise "no fts5 extension"
# check a bunch of conditions whether we need to recompute the index
table_cnt = cur.execute("SELECT COUNT(*) FROM sqlite_master \
WHERE type='table' AND name IN('torrent_index', 'last_updated')").fetchone()[0]
recompute = False
if (table_cnt == 2):
old_mtime = cur.execute("SELECT mtime FROM last_updated").fetchone()[0]
if (old_mtime < mtime):
recompute = True
else:
recompute = True
if recompute:
print("Re-creating index, please stand by...")
import pandas
df = pandas.read_csv(fname, sep=";", header=0, quotechar='"', escapechar="\\",
names=['added', 'hash', 'name', 'size'], usecols=[1, 2, 3])
cur.execute("BEGIN")
cur.execute("DROP TABLE IF EXISTS torrent_index")
cur.execute("CREATE VIRTUAL TABLE torrent_index USING FTS5(hash, name, size)")
# hack alert, use to_sql to directly add to fulltext index
df.to_sql("torrent_index", con, index=False, if_exists='append')
cur.execute("DROP TABLE IF EXISTS last_updated")
cur.execute("CREATE TABLE last_updated (mtime INTEGER)")
cur.execute("INSERT INTO last_updated VALUES (%d)" % mtime)
cur.execute("COMMIT") # woo ho
def sizeof_fmt(num, suffix='B'):
for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
if abs(num) < 1024.0:
return "%3.1f%s%s" % (num, unit, suffix)
num /= 1024.0
return "%.1f%s%s" % (num, 'Yi', suffix)
# this is the magic, a query that matches the cmdline params against the fulltext index
# yes this should be a prepared statement, but thats not supported
cur.execute("SELECT hash, name, size FROM torrent_index \
WHERE torrent_index MATCH '\"name\" : %s' ORDER BY rank LIMIT 100" % q)
for row in cur:
print("%-64s\t%.10s\tmagnet:?xt=urn:btih:%.40s" %
(row[1], sizeof_fmt(row[2]).rjust(10), base64.b64decode(row[0]).hex()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment