Skip to content

Instantly share code, notes, and snippets.

Created February 10, 2012 11:03
Show Gist options
  • Save anonymous/1788788 to your computer and use it in GitHub Desktop.
Save anonymous/1788788 to your computer and use it in GitHub Desktop.
Pirate Bay database dump (complete.txt) fast searcher
#!/usr/bin/env python3
# https://gist.github.com/1788788
# This program can be used to VERY fast searching and sorting of Pirate
# Bay website database/archive dump. This database contains: ids,
# titles, magnet hashes, sizes and seed/leach numbers only. Database can
# be download at https://thepiratebay.se/torrent/7016365 or just using
# magnet link: magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c
# Last change 10 Feb 2011, 12:40
# More to come.
# To use this script you need to have Python 3 install
# Debian/Ubuntu: apt-get install python3
# Other Unix machines: check your distribution repositories, or http://python.org/download/ page
# Windows users: Visit http://python.org/download/#download-python and download 3.x series installer
# You will also need sqlite3 libraries installed.
# I do not know how they are distributed in Windows version.
# Do not forget to setup 'executable' bit for this script: chmod +x tpb.py
# Run script without parameter (preferably in text terminal/console), like this: ./tpb.py
# in the same directory a complete.txt file is located.
# Program will import data, and give short usage examples.
# Program assumes ANSI compatible color-enabled terminal with Unicode support!
import re
import sqlite3
import os
import sys
import time
limit = 12 # Change to bigger number, if you want to display more than 12 best results by default
verbose = False
def rows_generator():
start_time = time.time() - 0.1
ids = {}
prog = re.compile(r"^([0-9]+)\|(.+)\|([0-9]+)\|([0-9]+)\|([0-9]+)\|([0-9a-fA-F]{38,40})$") # most have 40, some 38
line_number = 0
f = open("complete.txt")
if not f:
print("Please download complete.gz and unpack it as complete.txt in current directory", file=sys.stderr)
print("You can download it using following links:", file=sys.stderr)
print(" https://thepiratebay.se/torrent/7016365 or", file=sys.stderr)
#print(" magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c&dn=The+whole+Pirate+Bay+magnet+archive&tr=udp://tracker.openbittorrent.com:80&tr=udp://tracker.publicbt.com:80&tr=udp://tracker.ccc.de:80", file=sys.stderr)
print(" magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c", file=sys.stderr)
sys.exit(1)
f.seek(0, 2)
file_size = f.tell()
f.seek(0, 0)
#file_size = os.fstat(fd).st_size
current_position = 0
progress = 0
for row0 in f:
current_position += len(row0)
#current_position = f.tell()
new_progress = int(100*current_position / file_size)
if new_progress > progress+4:
progress = new_progress
elappsed_time = time.time() - start_time
total_time = (elappsed_time/progress)*100
# this additionally needs to be multiplied by proper slowdown factor due log(n) function usage when doing INSERT
total_time
remaining_time = total_time - elappsed_time
print("Progress: ", progress, "% (elappsed ",int(elappsed_time),"s, ETA ",int(remaining_time),"s, total ",int(total_time),"s)", sep='', file=sys.stderr)
line_number += 1
row = row0.rstrip()
r = prog.match(row)
if not r:
if verbose:
print("Incorrect row format (line ",line_number,"): ", row, sep='', file=sys.stderr)
continue
g = lambda i: r.group(i)
ID = int(g(1))
if ID in ids:
if verbose:
print("ID duplicate (line ", ids[ID], " and ", line_number, "): ", row, sep='', file=sys.stderr)
continue
ids[ID] = line_number
res = (ID, g(2), g(3), g(4), g(5), g(6))
yield res
f.close()
f = None
ids = None
def initalize_db():
theIter = rows_generator() # moving it early, so in case of missing input file, we have only one error message
print("Creating database in file complete.sqlite3 (if something goes wrong remove it and -journal files)", file=sys.stderr)
conn = sqlite3.connect('complete.sqlite3', isolation_level="DEFERRED")
print("Changing tunning options", file=sys.stderr)
conn.execute("PRAGMA page_size = 8192") # 8 KiB
conn.execute("PRAGMA cache_size = -65536") # 64 MiB
conn.execute("PRAGMA automatic_index = false")
conn.execute("PRAGMA synchronous = OFF")
c = conn.cursor()
print("Main data table create", file=sys.stderr)
c.execute('CREATE TABLE IF NOT EXISTS torrents (id integer, title text, size integer, seeds integer, leachers integer, hash text)')
print("DONE", file=sys.stderr)
conn.commit()
try:
c.execute('CREATE VIRTUAL TABLE torrents_f USING fts4(title)')
conn.commit()
print("FTS virtual table created", file=sys.stderr)
except:
pass
print("DATABASE IMPORT STARTED (can take few minutes and use about 600MB of disk space)", file=sys.stderr)
c = conn.cursor()
c.executemany("INSERT INTO torrents(id, title, size, leachers, seeds, hash) VALUES (?, ?, ?, ?, ?, ?)", theIter)
#res2 = (ID, g(2))
#c.execute("INSERT INTO torrents_f(docid, title) VALUES (?, ?)", res2)
print("Create indexes AFTER bulk loading (MUCH FASTER than creating indexes first)", file=sys.stderr)
c.execute('CREATE UNIQUE INDEX IF NOT EXISTS torrents_0 ON torrents (id)')
c.execute('CREATE INDEX IF NOT EXISTS torrents_1 ON torrents (size)')
c.execute('CREATE INDEX IF NOT EXISTS torrents_1 ON torrents (seeds DESC, leachers DESC)')
c.execute('CREATE INDEX IF NOT EXISTS torrents_2 ON torrents (title)')
conn.commit()
print("Populate FTS index using data from main table (can take about a minute)", file=sys.stderr)
c.execute("INSERT INTO torrents_f(docid, title) SELECT id, title FROM torrents")
conn.commit()
print("Optimize FTS", file=sys.stderr)
c.execute("INSERT INTO torrents_f(torrents_f) VALUES('optimize');")
conn.commit()
#print("Optimize space", file=sys.stderr)
#conn.execute("VACUUM")
#conn.commit()
conn.close()
conn = None
print("DATABASE IMPORT COMPLETED", file=sys.stderr)
def s(x):
x = int(x)
if x < 900:
return "{0}B".format(x)
x /= 1024.0
if x < 900:
return "{0:.1f}kB".format(x)
x /= 1024.0
if x < 900:
return "{0:.1f}MB".format(x)
x /= 1024.0
if x < 900:
return "{0:.1f}GB".format(x)
x /= 1024.0
return "{0:.1f}TB".format(x)
if not os.path.exists("complete.sqlite3"):
initalize_db()
conn = sqlite3.connect('complete.sqlite3')
print("USING EXISTING DATABSE (if You want to recreate database just delete complete.sqlite3 file and restart)", file=sys.stderr)
conn.row_factory = sqlite3.Row
c = conn.cursor()
def db_stats():
c.execute('SELECT MAX(id), MAX(size), COUNT(id), SUM(size) FROM torrents')
for row in c:
print("Maximum Pirate Bay torrent ID", row[0])
print("Largest torrent have", s(row[1]))
print("Number of all torrents", row[2])
print("Total size of all torrents", s(row[3]))
print()
if len(sys.argv) <= 1:
db_stats()
print()
print("Newest version always available at https://gist.github.com/1788788")
print()
print("TODO: uploader nick, and status (VIP), torrent upload date, descriptions, poster links, language, categories, filtering using torrent size, more options (displaying more results, sorting using differnt columns)")
print()
print("Example usage of search in title:")
h = lambda x: print(" {0} {1}".format(sys.argv[0], x))
h('Far Cry Anything with words Far and Cry (order and case irrelevant)')
h('"Far Cry" Actually the same, because of the way shell works')
h('"Far Cry" -2 Do not show Far Cry 2')
h('\'"Far Cry"\' Phrase "Far Cry" exactly in this order (two consequtive words, but case irrelevant)')
h('\'"Far Cry"\' PC Search for "Far Cry" phrase AND PC word')
h('"SQL" Anything with whole word SQL')
h('"Dex*" "Lab*" Title must contain word Dexter and some word starting with Lab...')
h('"Dex* Lab*" Same as above')
h('"\'Dex* Lab*\'" Word Dexter and Lab.... must be one after another.')
h('"Windows AND (2000 OR 2008)" Boolean operators (Note: AND, OR, NOT, NEAR must be uppercase)')
h('"Far Cry" AND NOT 2 Same as "Far Cry" -2')
h('Anna NEAR/0 "K*" Proximity query')
print("For more informations look at http://www.sqlite.org/fts3.html#section_3_2")
sys.exit(1)
symbol=" ".join(sys.argv[1:])
print("Searching for:", symbol)
print()
def magnet(row):
return "magnet:?xt=urn:btih:"+row["hash"]
limit0 = limit*50
if verbose:
start_time = time.time()
t = (symbol, limit0)
i = 0
c.execute('SELECT docid, size, seeds, leachers, hash, tf.title, snippet(torrents_f, "\033[1m", "\033[0m", "…") FROM torrents_f AS tf JOIN torrents AS t ON t.id = tf.docid WHERE tf.title MATCH ? ORDER BY seeds DESC, leachers DESC, id DESC LIMIT ?', t)
print(" # TPB-ID SIZE S L MAGNET-LINK")
for row in c:
i += 1
print("{0:3} {1:8} {2:>10} {3:4} {4:4} {5}".format(i, row[0], s(row[1]), row[2], row[3], magnet(row)))
print(" ", row[6])
if i >= limit:
break
j = i
for row in c:
j += 1
print("Shown {0} results from {1}".format(i, (str(j)+" or more", j)[j < limit0]))
if verbose:
print(time.time()-start_time, "seconds")
c.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment