/tpb.py

## tpb.py
#!/usr/bin/env python3

# https://gist.github.com/1788788

# This program can be used to VERY fast searching and sorting of Pirate
# Bay website database/archive dump. This database contains: ids,
# titles, magnet hashes, sizes and seed/leach numbers only. Database can
# be download at https://thepiratebay.se/torrent/7016365 or just using
# magnet link: magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c


# Last change 10 Feb 2011, 12:40
# More to come.

# To use this script you need to have Python 3 install

# Debian/Ubuntu: apt-get install python3
# Other Unix machines: check your distribution repositories, or http://python.org/download/ page
# Windows users: Visit http://python.org/download/#download-python and download 3.x series installer

# You will also need sqlite3 libraries installed.
# I do not know how they are distributed in Windows version.

# Do not forget to setup 'executable' bit for this script: chmod +x tpb.py

# Run script without parameter (preferably in text terminal/console), like this: ./tpb.py
# in the same directory a complete.txt file is located.
# Program will import data, and give short usage examples.

# Program assumes ANSI compatible color-enabled terminal with Unicode support!

import re
import sqlite3
import os
import sys
import time

limit = 12  # Change to bigger number, if you want to display more than 12 best results by default
verbose = False

def rows_generator():
	start_time = time.time() - 0.1
	ids = {}
	prog = re.compile(r"^([0-9]+)\|(.+)\|([0-9]+)\|([0-9]+)\|([0-9]+)\|([0-9a-fA-F]{38,40})$") # most have 40, some 38
	line_number = 0
	f = open("complete.txt")
	if not f:
		print("Please download complete.gz and unpack it as complete.txt in current directory", file=sys.stderr)
		print("You can download it using following links:", file=sys.stderr)
		print("  https://thepiratebay.se/torrent/7016365 or", file=sys.stderr)
		#print("  magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c&dn=The+whole+Pirate+Bay+magnet+archive&tr=udp://tracker.openbittorrent.com:80&tr=udp://tracker.publicbt.com:80&tr=udp://tracker.ccc.de:80", file=sys.stderr)
		print("  magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c", file=sys.stderr)
		sys.exit(1)
	f.seek(0, 2)
	file_size = f.tell()
	f.seek(0, 0)
	#file_size = os.fstat(fd).st_size
	current_position = 0
	progress = 0
	for row0 in f:
		current_position += len(row0)
		#current_position = f.tell()
		new_progress = int(100*current_position / file_size)
		if new_progress > progress+4:
			progress = new_progress
			elappsed_time = time.time() - start_time
			total_time = (elappsed_time/progress)*100
			# this additionally needs to be multiplied by proper slowdown factor due log(n) function usage when doing INSERT
			total_time
			remaining_time = total_time - elappsed_time
			print("Progress: ", progress, "% (elappsed ",int(elappsed_time),"s, ETA ",int(remaining_time),"s, total ",int(total_time),"s)", sep='', file=sys.stderr)
		line_number += 1
		row = row0.rstrip()
		r = prog.match(row)
		if not r:
			if verbose:
				print("Incorrect row format (line ",line_number,"): ", row, sep='', file=sys.stderr)
			continue
		g = lambda i: r.group(i)
		ID = int(g(1))
		if ID in ids:
			if verbose:
				print("ID duplicate (line ", ids[ID], " and ", line_number, "): ", row, sep='', file=sys.stderr)
			continue
		ids[ID] = line_number
		res = (ID, g(2), g(3), g(4), g(5), g(6))
		yield res
	f.close()
	f = None
	ids = None

def initalize_db():
	theIter = rows_generator() # moving it early, so in case of missing input file, we have only one error message

	print("Creating database in file complete.sqlite3 (if something goes wrong remove it and  -journal files)", file=sys.stderr)
	conn = sqlite3.connect('complete.sqlite3', isolation_level="DEFERRED")
	print("Changing tunning options", file=sys.stderr)
	conn.execute("PRAGMA page_size = 8192") # 8 KiB
	conn.execute("PRAGMA cache_size = -65536") # 64 MiB
	conn.execute("PRAGMA automatic_index = false")
	conn.execute("PRAGMA synchronous = OFF")
	c = conn.cursor()
	print("Main data table create", file=sys.stderr)
	c.execute('CREATE TABLE IF NOT EXISTS torrents (id integer, title text, size integer, seeds integer, leachers integer, hash text)')
	print("DONE", file=sys.stderr)
	conn.commit()

	try:
		c.execute('CREATE VIRTUAL TABLE torrents_f USING fts4(title)')
		conn.commit()
		print("FTS virtual table created", file=sys.stderr)
	except:
		pass

	print("DATABASE IMPORT STARTED (can take few minutes and use about 600MB of disk space)", file=sys.stderr)
	c = conn.cursor()

	c.executemany("INSERT INTO torrents(id, title, size, leachers, seeds, hash) VALUES (?, ?, ?, ?, ?, ?)", theIter)
	#res2 = (ID, g(2))
	#c.execute("INSERT INTO torrents_f(docid, title) VALUES (?, ?)", res2)

	print("Create indexes AFTER bulk loading (MUCH FASTER than creating indexes first)", file=sys.stderr)
	c.execute('CREATE UNIQUE INDEX IF NOT EXISTS torrents_0 ON torrents (id)')
	c.execute('CREATE INDEX IF NOT EXISTS torrents_1 ON torrents (size)')
	c.execute('CREATE INDEX IF NOT EXISTS torrents_1 ON torrents (seeds DESC, leachers DESC)')
	c.execute('CREATE INDEX IF NOT EXISTS torrents_2 ON torrents (title)')

	conn.commit()

	print("Populate FTS index using data from main table (can take about a minute)", file=sys.stderr)
	c.execute("INSERT INTO torrents_f(docid, title) SELECT id, title FROM torrents")
	conn.commit()

	print("Optimize FTS", file=sys.stderr)
	c.execute("INSERT INTO torrents_f(torrents_f) VALUES('optimize');")
	conn.commit()

	#print("Optimize space", file=sys.stderr)
	#conn.execute("VACUUM")
	#conn.commit()

	conn.close()
	conn = None
	print("DATABASE IMPORT COMPLETED", file=sys.stderr)

def s(x):
	x = int(x)
	if x < 900:
		return "{0}B".format(x)
	x /= 1024.0
	if x < 900:
		return "{0:.1f}kB".format(x)
	x /= 1024.0
	if x < 900:
		return "{0:.1f}MB".format(x)
	x /= 1024.0
	if x < 900:
		return "{0:.1f}GB".format(x)
	x /= 1024.0
	return "{0:.1f}TB".format(x)


if not os.path.exists("complete.sqlite3"):
	initalize_db()

conn = sqlite3.connect('complete.sqlite3')

print("USING EXISTING DATABSE (if You want to recreate database just delete complete.sqlite3 file and restart)", file=sys.stderr)

conn.row_factory = sqlite3.Row
c = conn.cursor()
def db_stats():
	c.execute('SELECT MAX(id), MAX(size), COUNT(id), SUM(size) FROM torrents')
	for row in c:
		print("Maximum Pirate Bay torrent ID", row[0])
		print("Largest torrent have", s(row[1]))
		print("Number of all torrents", row[2])
		print("Total size of all torrents", s(row[3]))

print()

if len(sys.argv) <= 1:
	db_stats()
	print()
	print("Newest version always available at https://gist.github.com/1788788")
	print()
	print("TODO: uploader nick, and status (VIP), torrent upload date, descriptions, poster links, language, categories, filtering using torrent size, more options (displaying more results, sorting using differnt columns)")
	print()
	print("Example usage of search in title:")
	h = lambda x: print("  {0} {1}".format(sys.argv[0], x))
	h('Far Cry           Anything with words Far and Cry (order and case irrelevant)')
	h('"Far Cry"         Actually the same, because of the way shell works')
	h('"Far Cry" -2      Do not show Far Cry 2')
	h('\'"Far Cry"\'     Phrase "Far Cry" exactly in this order (two consequtive words, but case irrelevant)')
	h('\'"Far Cry"\' PC  Search for "Far Cry" phrase AND PC word')
	h('"SQL"             Anything with whole word SQL')
	h('"Dex*" "Lab*"     Title must contain word Dexter and some word starting with Lab...')
	h('"Dex* Lab*"       Same as above')
	h('"\'Dex* Lab*\'"   Word Dexter and Lab.... must be one after another.')
	h('"Windows AND (2000 OR 2008)"   Boolean operators (Note: AND, OR, NOT, NEAR must be uppercase)')
	h('"Far Cry" AND NOT 2   Same as "Far Cry" -2')
	h('Anna NEAR/0 "K*"  Proximity query')
	print("For more informations look at http://www.sqlite.org/fts3.html#section_3_2")
	sys.exit(1)

symbol=" ".join(sys.argv[1:])
print("Searching for:", symbol)
print()

def magnet(row):
	return "magnet:?xt=urn:btih:"+row["hash"]

limit0 = limit*50

if verbose:
	start_time = time.time()
t = (symbol, limit0)
i = 0
c.execute('SELECT docid, size, seeds, leachers, hash, tf.title, snippet(torrents_f, "\033[1m", "\033[0m", "…") FROM torrents_f AS tf JOIN torrents AS t ON t.id = tf.docid WHERE tf.title MATCH ? ORDER BY seeds DESC, leachers DESC, id DESC LIMIT ?', t)
print("  #   TPB-ID       SIZE    S    L   MAGNET-LINK")
for row in c:
	i += 1
	print("{0:3} {1:8} {2:>10} {3:4} {4:4}   {5}".format(i, row[0], s(row[1]), row[2], row[3], magnet(row)))
	print("                                   ", row[6])
	if i >= limit:
		break
j = i
for row in c:
	j += 1
print("Shown {0} results from {1}".format(i, (str(j)+" or more", j)[j < limit0]))
if verbose:
	print(time.time()-start_time, "seconds")

c.close()
	#!/usr/bin/env python3

	# https://gist.github.com/1788788

	# This program can be used to VERY fast searching and sorting of Pirate
	# Bay website database/archive dump. This database contains: ids,
	# titles, magnet hashes, sizes and seed/leach numbers only. Database can
	# be download at https://thepiratebay.se/torrent/7016365 or just using
	# magnet link: magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c


	# Last change 10 Feb 2011, 12:40
	# More to come.

	# To use this script you need to have Python 3 install

	# Debian/Ubuntu: apt-get install python3
	# Other Unix machines: check your distribution repositories, or http://python.org/download/ page
	# Windows users: Visit http://python.org/download/#download-python and download 3.x series installer

	# You will also need sqlite3 libraries installed.
	# I do not know how they are distributed in Windows version.

	# Do not forget to setup 'executable' bit for this script: chmod +x tpb.py

	# Run script without parameter (preferably in text terminal/console), like this: ./tpb.py
	# in the same directory a complete.txt file is located.
	# Program will import data, and give short usage examples.

	# Program assumes ANSI compatible color-enabled terminal with Unicode support!

	import re
	import sqlite3
	import os
	import sys
	import time

	limit = 12 # Change to bigger number, if you want to display more than 12 best results by default
	verbose = False

	def rows_generator():
	start_time = time.time() - 0.1
	ids = {}
	prog = re.compile(r"^([0-9]+)\\|(.+)\\|([0-9]+)\\|([0-9]+)\\|([0-9]+)\\|([0-9a-fA-F]{38,40})$") # most have 40, some 38
	line_number = 0
	f = open("complete.txt")
	if not f:
	print("Please download complete.gz and unpack it as complete.txt in current directory", file=sys.stderr)
	print("You can download it using following links:", file=sys.stderr)
	print(" https://thepiratebay.se/torrent/7016365 or", file=sys.stderr)
	#print(" magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c&dn=The+whole+Pirate+Bay+magnet+archive&tr=udp://tracker.openbittorrent.com:80&tr=udp://tracker.publicbt.com:80&tr=udp://tracker.ccc.de:80", file=sys.stderr)
	print(" magnet:?xt=urn:btih:938802790a385c49307f34cca4c30f80b03df59c", file=sys.stderr)
	sys.exit(1)
	f.seek(0, 2)
	file_size = f.tell()
	f.seek(0, 0)
	#file_size = os.fstat(fd).st_size
	current_position = 0
	progress = 0
	for row0 in f:
	current_position += len(row0)
	#current_position = f.tell()
	new_progress = int(100*current_position / file_size)
	if new_progress > progress+4:
	progress = new_progress
	elappsed_time = time.time() - start_time
	total_time = (elappsed_time/progress)*100
	# this additionally needs to be multiplied by proper slowdown factor due log(n) function usage when doing INSERT
	total_time
	remaining_time = total_time - elappsed_time
	print("Progress: ", progress, "% (elappsed ",int(elappsed_time),"s, ETA ",int(remaining_time),"s, total ",int(total_time),"s)", sep='', file=sys.stderr)
	line_number += 1
	row = row0.rstrip()
	r = prog.match(row)
	if not r:
	if verbose:
	print("Incorrect row format (line ",line_number,"): ", row, sep='', file=sys.stderr)
	continue
	g = lambda i: r.group(i)
	ID = int(g(1))
	if ID in ids:
	if verbose:
	print("ID duplicate (line ", ids[ID], " and ", line_number, "): ", row, sep='', file=sys.stderr)
	continue
	ids[ID] = line_number
	res = (ID, g(2), g(3), g(4), g(5), g(6))
	yield res
	f.close()
	f = None
	ids = None

	def initalize_db():
	theIter = rows_generator() # moving it early, so in case of missing input file, we have only one error message

	print("Creating database in file complete.sqlite3 (if something goes wrong remove it and -journal files)", file=sys.stderr)
	conn = sqlite3.connect('complete.sqlite3', isolation_level="DEFERRED")
	print("Changing tunning options", file=sys.stderr)
	conn.execute("PRAGMA page_size = 8192") # 8 KiB
	conn.execute("PRAGMA cache_size = -65536") # 64 MiB
	conn.execute("PRAGMA automatic_index = false")
	conn.execute("PRAGMA synchronous = OFF")
	c = conn.cursor()
	print("Main data table create", file=sys.stderr)
	c.execute('CREATE TABLE IF NOT EXISTS torrents (id integer, title text, size integer, seeds integer, leachers integer, hash text)')
	print("DONE", file=sys.stderr)
	conn.commit()

	try:
	c.execute('CREATE VIRTUAL TABLE torrents_f USING fts4(title)')
	conn.commit()
	print("FTS virtual table created", file=sys.stderr)
	except:
	pass

	print("DATABASE IMPORT STARTED (can take few minutes and use about 600MB of disk space)", file=sys.stderr)
	c = conn.cursor()

	c.executemany("INSERT INTO torrents(id, title, size, leachers, seeds, hash) VALUES (?, ?, ?, ?, ?, ?)", theIter)
	#res2 = (ID, g(2))
	#c.execute("INSERT INTO torrents_f(docid, title) VALUES (?, ?)", res2)

	print("Create indexes AFTER bulk loading (MUCH FASTER than creating indexes first)", file=sys.stderr)
	c.execute('CREATE UNIQUE INDEX IF NOT EXISTS torrents_0 ON torrents (id)')
	c.execute('CREATE INDEX IF NOT EXISTS torrents_1 ON torrents (size)')
	c.execute('CREATE INDEX IF NOT EXISTS torrents_1 ON torrents (seeds DESC, leachers DESC)')
	c.execute('CREATE INDEX IF NOT EXISTS torrents_2 ON torrents (title)')

	conn.commit()

	print("Populate FTS index using data from main table (can take about a minute)", file=sys.stderr)
	c.execute("INSERT INTO torrents_f(docid, title) SELECT id, title FROM torrents")
	conn.commit()

	print("Optimize FTS", file=sys.stderr)
	c.execute("INSERT INTO torrents_f(torrents_f) VALUES('optimize');")
	conn.commit()

	#print("Optimize space", file=sys.stderr)
	#conn.execute("VACUUM")
	#conn.commit()

	conn.close()
	conn = None
	print("DATABASE IMPORT COMPLETED", file=sys.stderr)

	def s(x):
	x = int(x)
	if x < 900:
	return "{0}B".format(x)
	x /= 1024.0
	if x < 900:
	return "{0:.1f}kB".format(x)
	x /= 1024.0
	if x < 900:
	return "{0:.1f}MB".format(x)
	x /= 1024.0
	if x < 900:
	return "{0:.1f}GB".format(x)
	x /= 1024.0
	return "{0:.1f}TB".format(x)


	if not os.path.exists("complete.sqlite3"):
	initalize_db()

	conn = sqlite3.connect('complete.sqlite3')

	print("USING EXISTING DATABSE (if You want to recreate database just delete complete.sqlite3 file and restart)", file=sys.stderr)

	conn.row_factory = sqlite3.Row
	c = conn.cursor()
	def db_stats():
	c.execute('SELECT MAX(id), MAX(size), COUNT(id), SUM(size) FROM torrents')
	for row in c:
	print("Maximum Pirate Bay torrent ID", row[0])
	print("Largest torrent have", s(row[1]))
	print("Number of all torrents", row[2])
	print("Total size of all torrents", s(row[3]))

	print()

	if len(sys.argv) <= 1:
	db_stats()
	print()
	print("Newest version always available at https://gist.github.com/1788788")
	print()
	print("TODO: uploader nick, and status (VIP), torrent upload date, descriptions, poster links, language, categories, filtering using torrent size, more options (displaying more results, sorting using differnt columns)")
	print()
	print("Example usage of search in title:")
	h = lambda x: print(" {0} {1}".format(sys.argv[0], x))
	h('Far Cry Anything with words Far and Cry (order and case irrelevant)')
	h('"Far Cry" Actually the same, because of the way shell works')
	h('"Far Cry" -2 Do not show Far Cry 2')
	h('\'"Far Cry"\' Phrase "Far Cry" exactly in this order (two consequtive words, but case irrelevant)')
	h('\'"Far Cry"\' PC Search for "Far Cry" phrase AND PC word')
	h('"SQL" Anything with whole word SQL')
	h('"Dex" "Lab" Title must contain word Dexter and some word starting with Lab...')
	h('"Dex* Lab*" Same as above')
	h('"\'Dex* Lab*\'" Word Dexter and Lab.... must be one after another.')
	h('"Windows AND (2000 OR 2008)" Boolean operators (Note: AND, OR, NOT, NEAR must be uppercase)')
	h('"Far Cry" AND NOT 2 Same as "Far Cry" -2')
	h('Anna NEAR/0 "K*" Proximity query')
	print("For more informations look at http://www.sqlite.org/fts3.html#section_3_2")
	sys.exit(1)

	symbol=" ".join(sys.argv[1:])
	print("Searching for:", symbol)
	print()

	def magnet(row):
	return "magnet:?xt=urn:btih:"+row["hash"]

	limit0 = limit*50

	if verbose:
	start_time = time.time()
	t = (symbol, limit0)
	i = 0
	c.execute('SELECT docid, size, seeds, leachers, hash, tf.title, snippet(torrents_f, "\033[1m", "\033[0m", "…") FROM torrents_f AS tf JOIN torrents AS t ON t.id = tf.docid WHERE tf.title MATCH ? ORDER BY seeds DESC, leachers DESC, id DESC LIMIT ?', t)
	print(" # TPB-ID SIZE S L MAGNET-LINK")
	for row in c:
	i += 1
	print("{0:3} {1:8} {2:>10} {3:4} {4:4} {5}".format(i, row[0], s(row[1]), row[2], row[3], magnet(row)))
	print(" ", row[6])
	if i >= limit:
	break
	j = i
	for row in c:
	j += 1
	print("Shown {0} results from {1}".format(i, (str(j)+" or more", j)[j < limit0]))
	if verbose:
	print(time.time()-start_time, "seconds")

	c.close()