stbuehler/Makefile

## Makefile

# Libraries belong in LDLIBS: make's implicit link rule places LDLIBS after the
# source/object files, which linkers that resolve symbols left-to-right need.
# SHA1() is provided by libcrypto, so link it explicitly.
override LDLIBS += -lssl -lcrypto -lpcrecpp
override CPPFLAGS += -O2 -Wall

all: torrent-sanitize

# -f: do not fail when the binary has not been built yet
clean:
	rm -f torrent-sanitize

.PHONY: all clean

## torrent-sanitize.cpp

/*
   goal: fix torrents (if possible): cleanup announce urls, set custom comments, filter meta fields ...

   compile:
     LDFLAGS='-lssl -lpcrecpp' CPPFLAGS='-O2 -Wall' make torrent-sanitize
       needs openssl-dev for sha1 and libpcre++-dev

 */

/* using MMAP is dangerous if files are not updated atomically;
 * atomic file update works like this:
 *   write to "<filename>.tmp$$"
 *   mv "<filename>.tmp$$" -> "<filename>"
 *   (the tmp file should be on the same filesystem)
 * MMAP is only used to read files - it delays reading the actual content until
 *   the code accesses the memory
 */
#define USE_MMAP

#include <string>
#include <vector>
#include <map>
#include <algorithm>

#include <iostream>
#include <sstream>
#include <fstream>

#include <limits>

#include <pcrecpp.h>

extern "C" {
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>

#ifdef USE_MMAP
# include <sys/mman.h>
#endif

}

/* Structural UTF-8 check of the len bytes starting at _s.
 *
 * Every byte must either be plain ASCII or a lead byte followed by the
 * matching number of continuation bytes (10xxxxxx). Lead bytes announcing up
 * to 5 continuation bytes are accepted; overlong encodings and surrogates are
 * not detected.
 *
 * Returns true when all bytes form complete sequences (also for len == 0).
 */
bool validUTF8(const char *_s, size_t len) {
	const unsigned char *s = (const unsigned char*) _s;
	for (size_t i = 0; i < len; i++) {
		size_t seqlen; /* number of continuation bytes expected after s[i] */
		if (0 == (s[i] & 0x80)) { seqlen = 0; }
		else if (0xC0 == (s[i] & 0xE0)) { seqlen = 1; }
		else if (0xE0 == (s[i] & 0xF0)) { seqlen = 2; }
		else if (0xF0 == (s[i] & 0xF8)) { seqlen = 3; }
		else if (0xF8 == (s[i] & 0xFC)) { seqlen = 4; }
		else if (0xFC == (s[i] & 0xFE)) { seqlen = 5; }
		else return false; /* stray continuation byte or 0xFE/0xFF */

		/* i < len holds here, so len - i - 1 is the number of bytes left after the lead byte */
		if (seqlen > len - i - 1) return false;

		for ( ; seqlen > 0; seqlen--) {
			i++;
			/* bitwise mask: the two top bits of a continuation byte must be 10 */
			if (0x80 != (s[i] & 0xC0)) return false;
		}
	}
	return true;
}

bool validUTF8(std::string s) {
	return validUTF8(s.c_str(), s.length());
}

/* True when s starts with the character array prefix; the last array element
 * (the terminating NUL of a string literal) is not part of the comparison. */
template<size_t N> bool stringHasPrefix(const std::string &s, const char (&prefix)[N]) {
	const size_t prefixLen = N - 1;
	if (s.length() < prefixLen) return false;
	return 0 == s.compare(0, prefixLen, prefix, prefixLen);
}


/* Read-only content of one regular file together with a read cursor.
 *
 * With USE_MMAP the file is mapped into memory, otherwise it is read into a
 * heap block. The object owns that memory and releases it in clear() / the
 * destructor, which is why copying is disabled.
 */
class Buffer {
private:
	/* declared but not defined: copying would release the memory twice */
	Buffer(const Buffer &b);
	Buffer& operator=(const Buffer &b);

public:
	Buffer() :m_data(0), m_len(0), m_pos(0) {
	}

	/* Replaces the current content with the content of filename.
	 * Returns false (after printing the reason to std::cerr) when the file
	 * cannot be opened, is not a regular file, is too big or cannot be
	 * mapped/read; the buffer is empty in that case. */
	bool load(const std::string &filename) {
		clear();
		m_filename = filename;

		int fd;

		if (-1 == (fd = ::open(m_filename.c_str(), O_RDONLY))) {
			int e = errno;
			std::cerr << "Cannot open file '" << m_filename << "': " << ::strerror(e) << std::endl;
			return false;
		}
		if (-1 == ::fstat(fd, &filestat)) {
			int e = errno;
			std::cerr << "Cannot stat file '" << m_filename << "': " << ::strerror(e) << std::endl;
			close(fd);
			return false;
		}
		if (!S_ISREG(filestat.st_mode)) {
			std::cerr << "Not a regular file: '" << m_filename << "'" << std::endl;
			close(fd);
			return false;
		}
		if (filestat.st_size > std::numeric_limits<ssize_t>::max()) {
			std::cerr << "File too big: '" << m_filename << "': " << filestat.st_size << " > " << std::numeric_limits<ssize_t>::max() << std::endl;
			close(fd);
			return false;
		}
		const size_t len = (size_t) filestat.st_size;

#ifdef USE_MMAP

#ifndef MAP_POPULATE
# define MAP_POPULATE 0
#endif

		/* mmap() rejects zero-length mappings; an empty file simply stays an empty buffer */
		if (len > 0) {
			void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
			/* failure is signalled by MAP_FAILED, not by a null pointer */
			if (MAP_FAILED == map) {
				int e = errno;
				std::cerr << "Cannot mmap file '" << m_filename << "': " << strerror(e) << std::endl;
				close(fd);
				return false;
			}
			m_data = (char*) map;
		}
		m_len = len;
		/* the mapping stays valid after the descriptor is closed */
		close(fd);
#else
		m_data = new char[len];
		size_t total = 0;
		/* read() may return less than requested, so loop until everything arrived */
		while (total < len) {
			ssize_t r = ::read(fd, m_data + total, len - total);
			if (-1 == r) {
				if (EINTR == errno) continue;
				int e = errno;
				std::cerr << "Cannot read file '" << m_filename << "': " << strerror(e) << std::endl;
				close(fd);
				delete[] m_data;
				m_data = 0;
				return false;
			}
			if (0 == r) break; /* file got shorter since fstat() */
			total += (size_t) r;
		}
		close(fd);
		m_len = total;
#endif

		return true;
	}

	/* If the bytes at the cursor equal cstr (without its terminating NUL),
	 * moves the cursor behind them and returns true; otherwise leaves the
	 * cursor untouched and returns false. */
	template< std::size_t n > bool tryNext( const char (&cstr)[n] ) {
		size_t len = sizeof(cstr)/sizeof(char);
		if (0 == len) return false;
		len--;
		if (len > m_len - m_pos) return false;
		/* nothing to compare; also keeps a null m_data away from memcmp */
		if (0 == len) return true;
		if (0 != memcmp(cstr, m_data + m_pos, len)) return false;
		m_pos += len;
		return true;
	}

	/* true when the cursor is not at the end and points at c */
	bool isNext(char c) {
		return !eof() && (c == m_data[m_pos]);
	}

	bool eof() const {
		return m_pos >= m_len;
	}

	/* releases the file content and resets length and cursor to 0 */
	void clear() {
#ifdef USE_MMAP
		if (0 != m_data) munmap(m_data, m_len);
#else
		delete[] m_data; /* allocated with new[] */
#endif
		m_data = 0; m_len = m_pos = 0;
	}

	size_t pos() const { return m_pos; }
	size_t len() const { return m_len; }
	/* start of the content; not NUL terminated */
	const char* data() const { return m_data; }
	const char* c_str() const { return m_data; }

	/* moves the cursor one byte forward, never past the end */
	void next() { if (m_pos < m_len) m_pos++; }
	/* byte at the cursor, '\0' at the end of the content */
	char current() const { return !eof() ? m_data[m_pos] : '\0'; }

	~Buffer() {
		clear();
	}

	/* result of fstat() from the last load() that got that far */
	struct stat filestat;

private:
	friend class Torrent;
	friend class TorrentBase;

	std::string m_filename;
	char *m_data;
	size_t m_len, m_pos;
};

/* Non-owning (pointer, length) view of a byte string.
 *
 * The referenced bytes are neither copied nor freed, so they have to outlive
 * the view. All read-only operations are const, which allows calling them on
 * the constant keys too.
 */
class BufferString {
public:
	/* empty view with a null data pointer */
	BufferString()
	: m_data(0), m_len(0) { }

	/* view of a character array without its last element (the NUL of a string literal) */
	template< std::size_t n > explicit BufferString(const char (&data)[n])
	: m_data(data), m_len(n > 0 ? n - 1 : 0) {
	}
	BufferString(const char *data, size_t len)
	: m_data(data), m_len(len) {
	}
	/* view of the content of s; becomes invalid when s is modified or destroyed */
	explicit BufferString(const std::string &s)
	: m_data(s.c_str()), m_len(s.length()) { }

	/* owning copy of the viewed bytes */
	std::string toString() const {
		if (0 == m_data || 0 == m_len) return std::string();
		return std::string(m_data, m_len);
	}

	pcrecpp::StringPiece toStringPiece() const {
		return pcrecpp::StringPiece(m_data, m_len);
	}

	/* SHA1 digest of the viewed bytes as 40 upper-case hex digits */
	std::string sha1() const {
		static const char hexchar[] = "0123456789ABCDEF";
		unsigned char raw[20];
		::SHA1((const unsigned char*) m_data, m_len, raw);
		std::string hex(40, '.');
		for (int i = 0; i < 20; i++) {
			hex[2*i] = hexchar[raw[i] >> 4];
			hex[2*i+1] = hexchar[raw[i] & 0xf];
		}
		return hex;
	}

	bool validUTF8() const {
		return ::validUTF8(m_data, m_len);
	}

	size_t length() const {
		return m_len;
	}
	/* start of the viewed bytes; not NUL terminated in general */
	const char* data() const { return m_data; }
	const char* c_str() const { return m_data; }

	/* byte at ndx, '\0' for indices outside the view */
	char operator[](size_t ndx) const {
		if (ndx >= m_len) return '\0';
		return m_data[ndx];
	}

private:
	friend class TorrentBase;
	friend class Torrent;
	friend class TorrentSanitize;

	friend bool operator<(const BufferString &a, const BufferString &b);
	friend bool operator<=(const BufferString &a, const BufferString &b);
	friend bool operator==(const BufferString &a, const BufferString &b);

	const char *m_data;
	size_t m_len;
};

bool operator<(const BufferString &a, const BufferString &b) {
	int i = memcmp(a.m_data, b.m_data, std::min(a.m_len, b.m_len));
	if (i < 0) return true;
	if (i == 0 && a.m_len < b.m_len) return true;
	return false;
}
bool operator>(const BufferString &a, const BufferString &b) {
	return b < a;
}
bool operator<=(const BufferString &a, const BufferString &b) {
	int i = memcmp(a.m_data, b.m_data, std::min(a.m_len, b.m_len));
	if (i < 0) return true;
	if (i == 0 && a.m_len <= b.m_len) return true;
	return false;
}
bool operator>=(const BufferString &a, const BufferString &b) {
	return b <= a;
}
bool operator==(const BufferString &a, const BufferString &b) {
	if (a.m_len != b.m_len) return false;
	return (0 == memcmp(a.m_data, b.m_data, a.m_len));
}
bool operator!=(const BufferString &a, const BufferString &b) {
	return !(a == b);
}

/* Streams the raw bytes of b (no length prefix, embedded NUL bytes included). */
std::ostream& operator<<(std::ostream &os, const BufferString &b) {
	return os.write(b.data(), b.length());
}

/* Keys of the dict entries the parser and writer look for by name.
 * Each one is a view of its string literal (without the terminating NUL). */
static const BufferString bs_empty("");
static const BufferString bs_announce("announce");
static const BufferString bs_announce_list("announce-list");
static const BufferString bs_info("info");
static const BufferString bs_encoding("encoding");
static const BufferString bs_files("files");
static const BufferString bs_length("length");
static const BufferString bs_name("name");
static const BufferString bs_path("path");
static const BufferString bs_piece_length("piece length");
static const BufferString bs_pieces("pieces");
static const BufferString bs_private("private");

/* Thin wrapper around a std::ostream whose operator<< writes values in
 * bencoded form: strings as "<length>:<bytes>", numbers as "i<number>e" and
 * vectors as "l<entries>e". The wrapped stream is only referenced, not owned. */
class TorrentOStream {
public:
	explicit TorrentOStream(std::ostream &os) : os(os) { }

	std::ostream &os;

	TorrentOStream& operator<<(const std::string &s) {
		os << s.length() << ":" << s; return *this;
	}
	TorrentOStream& operator<<(const BufferString &s) {
		os << s.length() << ":"; os.write(s.data(), s.length()); return *this;
	}
	/* character array (string literal): written without its last element, the NUL */
	template< std::size_t n > TorrentOStream& operator<<(const char (&data)[n]) {
		if (n > 0) { os << (n-1) << ":"; os.write(data, n-1); } else { os << "0:"; }
		return *this;
	}
	TorrentOStream& operator<<(int64_t num) {
		os << "i" << num << "e"; return *this;
	}
	/* list; entries are written with whichever overload matches T, so nested
	 * vectors work as well. Taken by reference to avoid copying the list. */
	template<typename T> TorrentOStream& operator<<(const std::vector< T > &list) {
		os << "l";
		for (typename std::vector< T >::const_iterator it = list.begin(); it != list.end(); ++it) *this << *it;
		os << "e";
		return *this;
	}
};

/* dict key -> raw text written for that entry (the encoded key followed by the encoded value) */
typedef std::map<std::string, std::string> TorrentRawParts;
/* one file entry of a torrent: (path, length) */
typedef std::pair<std::string, int64_t> File;

/* Optional compiled regular expression.
 *
 * Without a loaded pattern nothing matches. The object owns the compiled
 * pcrecpp::RE through a raw pointer and defines no copy operations, so a copy
 * would delete the pattern twice - pass it by reference only.
 */
class PCRE {
public:
	PCRE() : m_re(0) { }
	~PCRE() { clear(); }

	/* drops the loaded pattern (if any) */
	void clear() {
		delete m_re;
		m_re = 0;
	}

	/* Compiles pattern and replaces a previously loaded one.
	 * A pattern that does not compile is reported on std::cerr and terminates
	 * the program with exit code 5. */
	void load(const char *pattern) {
		clear(); /* release the old pattern first, otherwise a second load() leaks it */
		m_re = new pcrecpp::RE(pattern);
		if (!m_re->error().empty()) {
			std::cerr << "Couldn't parse pcre pattern: " << m_re->error() << std::endl;
			clear();
			exit(5);
		}
	}

	/* true when a pattern is loaded and it matches the complete string */
	bool matches(BufferString str) const {
		if (0 == m_re) return false;
		return m_re->FullMatch(str.toStringPiece());
	}

	bool matches(const std::string &str) const {
		if (0 == m_re) return false;
		return m_re->FullMatch(str);
	}

private:
	pcrecpp::RE *m_re;
};

/* Settings that control how a torrent is checked and rewritten: which meta
 * keys are kept, which announce urls are accepted and which entries are added. */
class TorrentSanitize {
public:
	TorrentSanitize() : debug(false), show_paths(false), check_info_utf8(true) {
	}

	/* A meta key is usable when it is not empty, contains printable ASCII
	 * only and is not one of the keys that get replaced by new_meta_entries. */
	bool validMetaKey(BufferString key) const {
		if (0 == key.length()) return false;
		for (size_t i = 0; i < key.length(); i++) {
			/* compare as unsigned byte: bytes >= 0x80 are negative as plain char
			 * on many platforms and must not be passed to the <ctype> functions */
			const unsigned char c = (unsigned char) key[i];
			if (c < 0x20 || c >= 0x7f) return false; /* control character or not ASCII */
		}
		if (new_meta_entries.end() != new_meta_entries.find(key.toString())) return false;
		return true;
	}

	bool validMetaTextKey(BufferString key) const {
		return filter_meta_text.matches(key);
	}
	bool validMetaNumKey(BufferString key) const {
		return filter_meta_num.matches(key);
	}
	bool validMetaOtherKey(BufferString key) const {
		return filter_meta_other.matches(key);
	}

	/* Decides whether url is a usable announce url and extracts a grouping
	 * key into domain.
	 *
	 * - "dht://" urls and strings without "://" are rejected (returns false,
	 *   domain untouched).
	 * - For udp/http/https urls domain is set to the last two labels of the
	 *   host name ("tracker.example.com:80" -> "example.com").
	 * - For any other scheme domain is set to the complete url.
	 */
	bool validURL(const std::string &url, std::string &domain) const {
		size_t domainstart;
		if (stringHasPrefix(url, "dht://")) return false; /* drop dht urls - fallback if no other url is found */
		if (stringHasPrefix(url, "udp://")) {
			domainstart = 6;
		} else if (stringHasPrefix(url, "http://")) {
			domainstart = 7;
		} else if (stringHasPrefix(url, "https://")) {
			domainstart = 8;
		} else {
			if (std::string::npos == url.find("://")) return false; /* not an url */
			/* unknown scheme */
			domain = url;
			return true;
		}
		size_t pos, dot;
		/* the host name ends at the first ':' (port) or '/' (path), whichever
		 * comes first - a ':' inside the path or query must not be mistaken
		 * for the port separator */
		pos = url.find_first_of(":/", domainstart);
		if (std::string::npos != pos) { pos--; } else { pos = url.length() - 1; }
		if (url[pos] == '.') pos--; /* ignore trailing dot */

		/* extract "domain.tld" (without subdomains); pos points to last character of domain name */
		dot = url.find_last_of('.', pos);
		if (std::string::npos != dot) dot = url.find_last_of('.', dot-1);
		if (std::string::npos == dot) { dot = domainstart; } else { dot++; }

		domain = url.substr(dot, pos - dot + 1);
		return true;
	}

	bool whitelistURL(const std::string &url) const {
		return filter_whitelist.matches(url);
	}
	bool blacklistURL(const std::string &url) const {
		return filter_blacklist.matches(url);
	}

	/* Registers an entry that is added to every torrent; stored as the
	 * encoded key followed by the encoded value. An already registered key
	 * keeps its first value. */
	template<typename Value> void add_new_meta_entry(const std::string &key, const Value &value) {
		std::ostringstream raw;
		TorrentOStream(raw) << key << value;
		new_meta_entries.insert(std::make_pair(key, raw.str()));
	}

	TorrentRawParts new_meta_entries;

	bool debug;
	bool show_paths; /* build paths for file entries, joined with '/', show them later */
	bool check_info_utf8; /* as we can't modify the info part, optionally disable struct utf-8 checks */

	PCRE filter_meta_text, filter_meta_num, filter_meta_other;
	PCRE filter_whitelist, filter_blacklist;

	std::vector< std::string > additional_announce_urls;

private:
	std::vector< std::string > m_alloced_strings;
};

/* Common base of the torrent readers.
 *
 * Owns the file buffer and provides the low-level parser: every read_ / skip_
 * / parse_ function starts at the buffer cursor, returns true on success (the
 * cursor is then behind the consumed data) and false on failure, in which case
 * lasterror() describes the problem.
 */
class TorrentBase {
public:
	TorrentBase()
	: m_check_info_utf8(true) { }

	std::string lasterror() { return m_lasterror; }
	std::string filename() { return m_buffer.m_filename; }

	/* hex SHA1 of the raw info dict; computed on first use and cached. Empty while no info dict is known. */
	std::string infohash() { if (m_info_hash.empty() && m_raw_info.length() > 0) m_info_hash = m_raw_info.sha1(); return m_info_hash; }

	std::string t_announce;
	std::vector< std::vector< std::string > > t_announce_list;

	void sanitize_announce_urls(const TorrentSanitize &san, const TorrentBase *mergefromother = 0);

protected:
	bool m_check_info_utf8;

	Buffer m_buffer;
	bool loadfile(const std::string &filename) {
		if (!m_buffer.load(filename)) return seterror("couldn't load file");
		return true;
	}

	/* raw bytes of the info dict inside m_buffer */
	BufferString m_raw_info;
	std::string m_info_hash;

	std::string m_lasterror;

	/* Parses the value of "announce-list" into t_announce_list.
	 * Besides the regular list of lists two broken forms are accepted (with
	 * a warning on std::cerr): a single string, and strings directly inside
	 * the outer list. */
	bool parse_announce_list() {
		std::string s;
		if (m_buffer.eof()) return seterror("expected announce-list, found eof");
		if (!m_buffer.isNext('l')) {
			if (!read_utf8(s)) return errorcontext("expected announce-list, neither list nor string found");
			std::cerr << "broken Announce-list, found single string" << std::endl;
			t_announce_list.push_back(std::vector<std::string>());
			t_announce_list.back().push_back(s);
		} else {
			bool warned1 = false;
			m_buffer.next();
			while (!m_buffer.eof() && !m_buffer.isNext('e')) {
				if (!m_buffer.isNext('l')) {
					if (!read_utf8(s)) return errorcontext("expected announce-list list, neither list nor string found");
					if (!warned1) {
						std::cerr << "broken announce-list, found string in list, should be in separate list" << std::endl;
						warned1 = true;
					}
					/* a bare string joins the most recent group */
					if (t_announce_list.empty()) t_announce_list.push_back(std::vector<std::string>());
					t_announce_list.back().push_back(s);
				} else {
					t_announce_list.push_back(std::vector<std::string>());
					m_buffer.next();
					while (!m_buffer.eof() && !m_buffer.isNext('e')) {
						if (!read_utf8(s)) return errorcontext("expected announce-list list entry");
						t_announce_list.back().push_back(s);
					}
					if (!m_buffer.isNext('e')) return seterror("expected announce-list list, found eof");
					m_buffer.next();
				}
			}
			if (!m_buffer.isNext('e')) return seterror("expected announce-list lists, found eof");
			m_buffer.next();
		}
		return true;
	}


	/* Prepends msg to the current error so the message shows where the failing
	 * parser was called from. Always returns false. */
	bool errorcontext(const char msg[]) {
		std::ostringstream oss;
		oss << "Error: " << msg << "\n    in " << m_lasterror;
		m_lasterror = oss.str();
		return false;
	}

	/* Replaces the current error with msg, prefixed with the cursor position
	 * and up to 16 bytes found there. Always returns false. */
	bool seterror(const char msg[]) {
		std::ostringstream oss;
		std::string context = std::string(m_buffer.m_data + m_buffer.pos(), std::min<size_t>(16, m_buffer.m_len - m_buffer.pos()));
		oss << "Error @[" << m_buffer.pos() << "/" << m_buffer.m_len << " '" << context << "'...]: " << msg;
		m_lasterror = oss.str();
		return false;
	}

	/* Reads a "<length>:<bytes>" string; str becomes a view into the buffer.
	 * The cursor only moves on success. */
	bool read_string(BufferString &str) {
		char c;
		int64_t pos = m_buffer.pos(), len = m_buffer.m_len;
		int64_t slen = 0;
		str.m_data = 0; str.m_len = 0;

		if (pos >= len) return seterror("expected string length, found eof");

		c = m_buffer.m_data[pos++];
		if (c < '0' || c > '9') return seterror("expected digit for string length");
		if (pos >= len) return seterror("expected string length, found eof");

		if (c == '0' && m_buffer.m_data[pos] != ':') return seterror("expected string length, found leading zero of non zero length (no following ':')");

		slen = (c - '0');
		for (;;) {
			c = m_buffer.m_data[pos++];
			if (c == ':') break;
			if (c < '0' || c > '9') return seterror("expected digit or colon for string length");
			if (pos >= len) return seterror("expected string length, found eof");
			/* checked before multiplying, so slen cannot overflow int64_t */
			if (slen > std::numeric_limits<int32_t>::max()) return seterror("string length overflow");
			slen = 10*slen + (c - '0');
		}

		if (slen > len || slen > len - pos) return seterror("file not large enough for string length"); /* overflow */

		str.m_data = m_buffer.m_data + pos;
		str.m_len = slen;
		m_buffer.m_pos = pos + slen;

		return true;
	}
	bool read_string(std::string &str) {
		BufferString tmp;
		if (!read_string(tmp)) return false;
		str = tmp.toString();
		return true;
	}

	bool skip_string() {
		BufferString tmp;
		return read_string(tmp);
	}

	/* like read_string, but fails (cursor restored) when the bytes are not valid utf-8 */
	bool read_utf8(BufferString &str) {
		size_t pos = m_buffer.pos();
		if (!read_string(str)) return false;
		if (!str.validUTF8()) { m_buffer.m_pos = pos; return seterror("string not valid utf-8"); }
		return true;
	}
	bool read_utf8(std::string &str) {
		size_t pos = m_buffer.pos();
		BufferString tmp;
		if (!read_string(tmp)) return false;
		if (!tmp.validUTF8()) { m_buffer.m_pos = pos; return seterror("string not valid utf-8"); }
		str = tmp.toString();
		return true;
	}
	bool skip_utf8() {
		BufferString tmp;
		return read_utf8(tmp);
	}

	/* strings inside the info dict: utf-8 is only enforced when m_check_info_utf8 is set */
	bool read_info_utf8(BufferString &str) {
		return m_check_info_utf8 ? read_utf8(str) : read_string(str);
	}
	bool read_info_utf8(std::string &str) {
		return m_check_info_utf8 ? read_utf8(str) : read_string(str);
	}
	bool skip_info_utf8() {
		return m_check_info_utf8 ? skip_utf8() : skip_string();
	}

	/* Reads "i<number>e". Leading zeros, "-0" and values outside int64_t are
	 * rejected. The cursor only moves on success. */
	bool read_number(int64_t &number) {
		char c;
		int64_t pos = m_buffer.pos(), len = m_buffer.m_len;
		number = 0;

		if (pos >= len) return seterror("expected number, found eof");

		c = m_buffer.m_data[pos++];
		if (c != 'i') return seterror("expected 'i' for number");
		if (pos >= len) return seterror("expected digit for number, found eof");

		c = m_buffer.m_data[pos++];
		if (pos >= len) return seterror("expected digit, '-' or 'e' for number, found eof");

		if (c == '-') {
			c = m_buffer.m_data[pos++];
			if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
			if (c == '0') return seterror("found leading zero in negative number");
			if (c < '1' || c > '9') return seterror("expected leading digit for negative number");

			/* accumulate as negative value so the minimum of int64_t can be represented */
			number = -(c - '0');
			for (;;) {
				c = m_buffer.m_data[pos++];
				if (c == 'e') break;
				if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
				if (c < '0' || c > '9') return seterror("expected digit or 'e' for number");
				if ((std::numeric_limits<int64_t>::min() + (c - '0')) / 10 > number) return seterror("number too small for int64_t"); /* underflow */
				number = 10*number - (c - '0');
			}
		} else {
			if (c == '0' && m_buffer.m_data[pos] != 'e') return seterror("found leading zero for non zero number");
			if (c < '0' || c > '9') return seterror("expected leading digit for number");

			number = (c - '0');
			for (;;) {
				c = m_buffer.m_data[pos++];
				if (c == 'e') break;
				if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
				if (c < '0' || c > '9') return seterror("expected digit or 'e' for number");
				if ((std::numeric_limits<int64_t>::max() - (c - '0')) / 10 < number) return seterror("number too large for int64_t"); /* overflow */
				number = 10*number + (c - '0');
			}
		}

		m_buffer.m_pos = pos;
		return true;
	}

	/* same syntax checks as read_number, but without computing the value (no range check) */
	bool skip_number() {
		char c;
		int64_t pos = m_buffer.pos(), len = m_buffer.m_len;

		if (pos >= len) return seterror("expected number, found eof");

		c = m_buffer.m_data[pos++];
		if (c != 'i') return seterror("expected 'i' for number");
		if (pos >= len) return seterror("expected digit for number, found eof");

		c = m_buffer.m_data[pos++];
		if (pos >= len) return seterror("expected digit, '-' or 'e' for number, found eof");

		if (c == '-') {
			c = m_buffer.m_data[pos++];
			if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
			if (c == '0') return seterror("found leading zero in negative number");
			if (c < '1' || c > '9') return seterror("expected leading digit for negative number");
		} else {
			if (c == '0' && m_buffer.m_data[pos] != 'e') return seterror("found leading zero for non zero number");
			if (c < '0' || c > '9') return seterror("expected leading digit for number");
		}

		for (;;) {
			c = m_buffer.m_data[pos++];
			if (c == 'e') break;
			if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
			if (c < '0' || c > '9') return seterror("expected digit or 'e' for number");
		}

		m_buffer.m_pos = pos;
		return true;
	}

	/* skips a complete list "l<values>e" */
	bool skip_list() {
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected list, found eof");
		if (m_buffer.m_data[m_buffer.pos()] != 'l') return seterror("expected 'l' for list");

		m_buffer.next();
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected list entry or 'e', found eof");

		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			if (!skip_value()) return errorcontext("parsing list entry failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected list entry or 'e', found eof");
		}
		m_buffer.next();

		return true;
	}

	/* skips a complete dict "d<key><value>...e"; keys must be valid utf-8 and strictly ascending */
	bool skip_dict() {
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict, found eof");
		if (m_buffer.m_data[m_buffer.pos()] != 'd') return seterror("expected 'd' for dict");

		m_buffer.next();
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");

		BufferString last, cur;

		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			if (!read_utf8(cur)) return errorcontext("parsing dict key failed");
			if (cur <= last) return seterror("(previous) dict entries in wrong order");
			last = cur;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict value, found eof");
			if (!skip_value()) return errorcontext("parsing dict value failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		}
		m_buffer.next();

		return true;
	}

	/* Skips dict entries until the key search is found or cannot follow anymore.
	 *
	 * The cursor has to be at the start of a dict key (or at the closing 'e');
	 * prev is the key of the entry before it and is used for the order check.
	 *
	 * Returns true when search was found: the cursor is at its value.
	 * Returns false with error == false when search is not in the dict: the
	 * cursor is at the first key greater than search or at the closing 'e'.
	 * Returns false with error == true when parsing failed.
	 *
	 * When skipped is given and no error occurred, it receives the raw bytes of
	 * the complete entries that were skipped; the key search itself is never
	 * part of it.
	 */
	bool try_next_dict_entry(BufferString search, BufferString prev, bool &error, BufferString *skipped = 0) {
		size_t start = m_buffer.pos();
		error = true;
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict, found eof");

		BufferString cur;

		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			size_t curpos = m_buffer.pos();
			if (!read_utf8(cur)) return errorcontext("parsing dict key failed");
			if (cur <= prev) return seterror("(previous) dict entries in wrong order");
			if (cur == search) {
				error = false;
				/* end the skipped range in front of the matched key: callers write
				 * that key themselves, including it here would duplicate it */
				if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, curpos - start);
				return true;
			}
			if (cur > search) {
				/* keys are sorted, search cannot follow: put the key back */
				m_buffer.m_pos = curpos;
				error = false;
				if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);
				return false;
			}
			prev = cur;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict value, found eof");
			if (!skip_value()) return errorcontext("parsing dict value failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		}

		/* dict end found, don't skip the 'e' */
		error = false;
		if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);
		return false;
	}

	/* Skips all remaining entries of a dict and its closing 'e'.
	 * prev is the key of the entry before the cursor (for the order check).
	 * When skipped is given it receives the raw bytes of the skipped entries
	 * including the closing 'e'. */
	bool goto_dict_end(BufferString prev, BufferString *skipped = 0) {
		size_t start = m_buffer.pos();
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");

		BufferString cur;

		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			if (!read_utf8(cur)) return errorcontext("parsing dict key failed");
			if (cur <= prev) return seterror("(previous) dict entries in wrong order");
			prev = cur;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict value, found eof");
			if (!skip_value()) return errorcontext("parsing dict value failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		}
		m_buffer.next();

		if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);

		return true;
	}

	/* skips one value of any kind; the first byte decides which one */
	bool skip_value() {
		char c;
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected value, found eof");
		c = m_buffer.m_data[m_buffer.pos()];
		if (c >= '0' && c <= '9') return skip_string();
		if (c == 'i') return skip_number();
		if (c == 'l') return skip_list();
		if (c == 'd') return skip_dict();
		return seterror("expected value");
	}
};

/* Collects announce urls and groups them by the domain reported by
 * TorrentSanitize::validURL: every domain gets its own entry in list, a url
 * is stored at most once per group. */
class AnnounceList {
public:
	AnnounceList(const TorrentSanitize &san) : m_san(san) { }

	/* adds url unless it is invalid, or blacklisted without being whitelisted */
	void merge(const std::string &url) {
		std::string domain;
		if (!m_san.validURL(url, domain)) return;

		if (!m_san.whitelistURL(url) && m_san.blacklistURL(url)) return;

		add(domain, url);
	}

	/* merges every entry; entries may be urls or (nested) vectors of urls */
	template<typename T> void merge(const std::vector< T > &urllist) {
		for (size_t i = 0; i < urllist.size(); i++) merge(urllist[i]);
	}

	/* adds url when it is valid; white- and blacklist are not consulted */
	void force_merge(const std::string &url) {
		std::string domain;
		if (!m_san.validURL(url, domain)) return;

		add(domain, url);
	}

	/* force-merges every entry; must forward to force_merge (not merge),
	 * otherwise the blacklist would apply to the entries after all */
	template<typename T> void force_merge(const std::vector< T > &urllist) {
		for (size_t i = 0; i < urllist.size(); i++) force_merge(urllist[i]);
	}

	/* merges the announce url and the announce-list of t */
	void merge(const TorrentBase &t) {
		merge(t.t_announce);
		merge(t.t_announce_list);
	}

	/* resulting groups, in the order their domain was first seen */
	std::vector< std::vector< std::string > > list;

private:
	const TorrentSanitize &m_san;

	/* domain -> index of its group in list */
	typedef std::map< std::string, size_t > GroupIndex;
	GroupIndex m_index;

	void add(const std::string &domain, const std::string &url) {
		GroupIndex::iterator it;
		it = m_index.find(domain);
		if (m_index.end() == it || it->second >= list.size()) {
			/* unknown domain (or list was modified from outside): start a new group */
			m_index.insert(std::make_pair( domain, list.size() ));
			std::vector<std::string> l;
			l.push_back(url);
			list.push_back(l);
		} else {
			std::vector<std::string> &l = list[it->second];
			if (l.end() == std::find(l.begin(), l.end(), url)) l.push_back(url);
		}
	}
};

/* Fully parsed torrent: validates the whole file, keeps the meta entries
 * selected by the TorrentSanitize filters and can write a cleaned-up torrent.
 * The info dict is never modified, it is written back byte for byte. */
class Torrent : public TorrentBase {
public:
	Torrent(const TorrentSanitize &san) : m_san(san), t_info_piece_length(0), t_info_complete_length(0) {
		m_check_info_utf8 = m_san.check_info_utf8;
	}

	/* Loads and validates filename. The file has to start with the announce
	 * entry and must contain an info dict. Returns false on any error, see
	 * lasterror(). */
	bool load(const std::string &filename) {
		if (!loadfile(filename)) return false;

		if (!m_buffer.tryNext("d8:announce")) return seterror("doesn't look like a valid torrent, expected 'd8:announce'");
		if (!read_utf8(t_announce)) return errorcontext("parsing torrent announce failed");

		/* the key "announce" starts at offset 3 of "d8:announce" */
		BufferString prevkey(m_buffer.m_data+3, 8), curkey;
		while (!m_buffer.eof() && !m_buffer.isNext('e')) {
			size_t curpos = m_buffer.pos(); /* start of the key, raw entries are copied from here */
			if (!read_utf8(curkey)) return errorcontext("parsing dict key in torrent failed");
			if (curkey <= prevkey) return seterror("wrong key order in torrent dict");
			prevkey = curkey;

			if (curkey == BufferString("announce-list")) {
				if (!parse_announce_list()) return errorcontext("parsing torrent announce-list failed");
			} else if (curkey == BufferString("info")) {
				curpos = m_buffer.pos();
				if (!parse_info()) return errorcontext("parsing torrent info failed");
				m_raw_info = BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos);
			} else if (curkey == BufferString("encoding")) {
				if (!read_utf8(t_encoding)) return errorcontext("parsing torrent encoding failed");
			} else {
				std::string content;
				int64_t number;

				/* A failed read_utf8/read_number leaves the cursor at the value, so
				 * the following branches can still try (or skip) the same value. */
				if (!m_san.validMetaKey(curkey)) {
					if (!skip_value()) return errorcontext("parsing torrent meta entry failed");
					if (m_san.debug) std::cerr << "Skipped entry '" << curkey.toString() << "'\n";
				} else if (m_san.validMetaTextKey(curkey) && read_utf8(content)) {
					if (m_san.debug) std::cerr << "Additional text entry '" << curkey.toString() << "': '" << content << "'\n";
					m_raw_parts.insert(std::make_pair(curkey.toString(), BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos).toString()));
				} else if (m_san.validMetaNumKey(curkey) && read_number(number)) {
					if (m_san.debug) std::cerr << "Additional numeric entry '" << curkey.toString() << "': " << number << "\n";
					m_raw_parts.insert(std::make_pair(curkey.toString(), BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos).toString()));
				} else if (m_san.validMetaOtherKey(curkey)) {
					if (!skip_value()) return errorcontext("parsing torrent meta entry failed");
					if (m_san.debug) std::cerr << "Additional raw entry '" << curkey.toString() << "\n";
					m_raw_parts.insert(std::make_pair(curkey.toString(), BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos).toString()));
				} else if (skip_value()) {
					if (m_san.debug) std::cerr << "Skipped entry '" << curkey.toString() << "'\n";
				} else {
					return errorcontext("parsing torrent meta entry failed");
				}
			}
		}

		m_raw_parts.insert(m_san.new_meta_entries.begin(), m_san.new_meta_entries.end());

		/* the closing 'e' of the torrent dict has to be the last byte of the file */
		if (m_buffer.m_len-1 != m_buffer.pos()) {
			if (m_buffer.eof()) return seterror("unexpected end of file while parsing torrent");
			return seterror("file contains garbage after torrent");
		}

		/* without info dict write() would produce a broken file */
		if (0 == m_raw_info.length()) return seterror("no info key in torrent");

		return true;
	}

	/* Writes the torrent: announce, announce-list, encoding and info at their
	 * sorted positions, the kept/new meta entries in between. */
	void write(std::ostream &os) const {
		TorrentOStream tos(os);

		os << "d8:announce";
		tos << t_announce;

		writerawkeys(os, bs_announce, bs_announce_list);

		if (!t_announce_list.empty()) {
			os << "13:announce-list";
			tos << t_announce_list;
		} else {
			writerawkey(os, bs_announce_list);
		}

		if (!t_encoding.empty()) {
			writerawkeys(os, bs_announce_list, bs_encoding);
			os << "8:encoding";
			tos << t_encoding; /* the value read from the file, not the key name */
			writerawkeys(os, bs_encoding, bs_info);
		} else {
			writerawkeys(os, bs_announce_list, bs_info);
		}

		os << "4:info" << m_raw_info;

		writerawkeys(os, bs_info, BufferString());

		os << "e";
	}
	/* prints a human readable summary to std::cout */
	void print_details() {
		std::cout << "Announce-url: " << t_announce << std::endl;
		std::cout << "Announce-list:\n";
		for (size_t i = 0; i < t_announce_list.size(); i++) {
			std::cout << " - [ ";
			for (size_t j = 0; j < t_announce_list[i].size(); j++) {
				if (j > 0) std::cout << ", ";
				std::cout << t_announce_list[i][j];
			}
			std::cout << " ]\n";
		}
		if (!t_encoding.empty()) std::cout << "Encoding: " << t_encoding << std::endl;

		std::cout << "Info name: " << t_info_name << std::endl;
		std::cout << "Info files: " << t_info_files.size() << std::endl;
		if (m_san.show_paths) {
			for (size_t i = 0; i < t_info_files.size(); i++) {
				std::cout << " - " << t_info_files[i].second << " bytes: '" << t_info_files[i].first << "'" << std::endl;
			}
		}
		std::cout << "Info complete length: " << t_info_complete_length << " bytes" << std::endl;
		std::cout << "Info pieces length: " << t_info_piece_length << " bytes" << std::endl;
		std::cout << "Info-Hash: " << infohash() << std::endl;
	}

private:
	/* Validates the info dict at the cursor. The known keys are looked up in
	 * their sorted order, all other entries are skipped. */
	bool parse_info() {
		bool err;
		BufferString prev;
		std::string tmps;
		/* ('length':number | 'files':...) ['name': string] 'piece length':number 'pieces':raw[%20] */
		if (m_buffer.eof()) return seterror("expected torrent info, found eof");
		if (!m_buffer.isNext('d')) return seterror("expected 'd' for dict");
		m_buffer.next();

		if (try_next_dict_entry(bs_files, bs_empty, err)) {
			t_info_complete_length = 0;
			if (!parse_info_files()) return errorcontext("couldn't parse files in torrent info");
			prev = bs_files;
		} else if (err) {
			return errorcontext("couldn't find torrent info key");
		} else if (try_next_dict_entry(bs_length, bs_files, err)) {
			if (!read_number(t_info_complete_length)) return errorcontext("couldn't parse length in torrent info");
			prev = bs_length;
		} else if (err) {
			return errorcontext("couldn't find torrent info key");
		} else return seterror("expected files or length in torrent info");

		if (try_next_dict_entry(bs_name, prev, err)) {
			if (!read_info_utf8(t_info_name)) return errorcontext("couldn't parse name in torrent info");
		} else if (err) {
			return errorcontext("couldn't find torrent info key");
		} else {
			std::cerr << "torrent info has no name entry\n";
		}

		if (try_next_dict_entry(bs_piece_length, bs_name, err)) {
			if (!read_number(t_info_piece_length)) return errorcontext("couldn't parse piece length in torrent info");
		} else if (err) {
			return errorcontext("couldn't find torrent info key");
		} else {
			return seterror("expected piece length in torrent info");
		}

		if (try_next_dict_entry(bs_pieces, bs_piece_length, err)) {
			BufferString pieces;
			if (!read_string(pieces)) return errorcontext("couldn't parse pieces in torrent info");
			/* one 20 byte SHA1 digest per piece */
			if (0 != pieces.m_len % 20) return seterror("pieces in torrent info has wrong length (not a multiple of 20)");
		} else if (err) {
			return errorcontext("couldn't find torrent info key");
		} else {
			return seterror("expected pieces in torrent info");
		}

		if (try_next_dict_entry(bs_private, bs_pieces, err)) {
			int64_t private_flag;
			if (!read_number(private_flag)) return errorcontext("couldn't parse torrent info private flag");
			if (0 != private_flag && 1 != private_flag) return seterror("torrent info private flag is neither 0 nor 1");
		} else if (err) {
			return errorcontext("couldn't find torrent info key");
		}

		if (!goto_dict_end(bs_private)) return errorcontext("couldn't find torrent info key");

		return true;
	}

	/* parses the non-empty list of file dicts of a multi-file torrent */
	bool parse_info_files() {
		if (!m_buffer.isNext('l')) return seterror("expected 'l' for list");
		m_buffer.next();

		while (!m_buffer.eof() && !m_buffer.isNext('e')) {
			if (!parse_info_file()) return errorcontext("couldn't parse files entry");
		}
		if (!m_buffer.isNext('e')) return seterror("expected info files entry, found eof");
		m_buffer.next();

		if (0 == t_info_files.size()) return seterror("empty files list");

		return true;
	}

	/* parses one file dict: requires a non-negative 'length' and a 'path' list */
	bool parse_info_file() {
		if (!m_buffer.isNext('d')) return seterror("expected 'd' for dict");
		m_buffer.next();

		bool err;

		int64_t length;
		std::string path;

		if (try_next_dict_entry(bs_length, bs_empty, err)) {
			if (!read_number(length)) return seterror("couldn't parse length");
			if (length < 0) return seterror("negative file length");
			t_info_complete_length += length;
		} else if (err) {
			return errorcontext("couldn't find info file entry key");
		} else {
			return seterror("expected length in file entry");
		}

		if (try_next_dict_entry(bs_path, bs_length, err)) {
			if (!parse_info_file_path(path)) return errorcontext("couldn't parse path in file entry");
		} else if (err) {
			return errorcontext("couldn't find info file entry key");
		} else {
			return seterror("expected path in file entry");
		}

		t_info_files.push_back(File(path, length));

		/* 'path' is the last key read here; using a later key for the order check
		 * would reject valid entries such as "path.utf-8" */
		if (!goto_dict_end(bs_path)) return errorcontext("expected end of info files entry");

		return true;
	}

	/* Parses the list of path components of a file entry. The components are
	 * only joined (with '/') into path when show_paths is set, otherwise path
	 * stays empty. */
	bool parse_info_file_path(std::string &path) {
		int components = 0;
		std::ostringstream buildpath;
		std::string part;
		if (!m_buffer.isNext('l')) return seterror("expected 'l' for list");
		m_buffer.next();

		while (!m_buffer.eof() && !m_buffer.isNext('e')) {
			if (m_san.show_paths) {
				if (!read_info_utf8(part)) return errorcontext("couldn't parse path component");
				if (components > 0) buildpath << '/';
				buildpath << part;
			} else {
				if (!skip_info_utf8()) return errorcontext("couldn't parse path component");
			}
			components++;
		}
		if (!m_buffer.isNext('e')) return seterror("expected path component, found eof");
		m_buffer.next();

		path = buildpath.str();

		return true;
	}

	/* writes the raw entries with prev < key < next; an empty next means "up to the end" */
	void writerawkeys(std::ostream &os, BufferString prev, BufferString next) const {
		std::string ns = next.toString();
		TorrentRawParts::const_iterator it = m_raw_parts.upper_bound(prev.toString());
		while (it != m_raw_parts.end() && (0 == next.length() || it->first < ns)) {
			os << it->second;
			++it;
		}
	}
	/* writes the raw entry stored for key, if there is one */
	void writerawkey(std::ostream &os, BufferString key) const {
		TorrentRawParts::const_iterator it = m_raw_parts.find(key.toString());
		if (it != m_raw_parts.end()) os << it->second;
	}

	const TorrentSanitize &m_san;

	std::string t_encoding;

	std::string t_info_name;
	int64_t t_info_piece_length;
	int64_t t_info_complete_length; /* 'length' of a single-file torrent or the sum of all file lengths */
	std::vector<File> t_info_files;

	/* kept and newly added meta entries, see writerawkeys */
	TorrentRawParts m_raw_parts;
};

/* Stream insertion for Torrent: forwards to Torrent::write and hands the stream back for chaining. */
std::ostream& operator<<(std::ostream &os, const Torrent &t) {
	return (t.write(os), os);
}

/*
 * only load announce urls and info hash
 *
 * load() reads the leading "announce" entry and an optional "announce-list",
 * remembers the raw bytes of the "info" value in m_raw_info and then expects
 * goto_dict_end() to reach the end of the top-level dict.
 * write() emits the torrent again from the current announce data and the
 * remembered raw parts.
 */
class TorrentAnnounceInfo : public TorrentBase {
public:
	TorrentAnnounceInfo() {
	}

	/*
	 * Load `filename` and parse the announce data and the info value.
	 * Returns false on failure; the reason is recorded through
	 * seterror()/errorcontext().
	 */
	bool load(const std::string &filename) {
		/* set by try_next_dict_entry(); initialised so it never holds an indeterminate value */
		bool err = false;

		if (!loadfile(filename)) return false;

		if (!m_buffer.tryNext("d8:announce")) return seterror("doesn't look like a valid torrent, expected 'd8:announce'");
		if (!read_utf8(t_announce)) return errorcontext("parsing torrent announce failed");

		/* the announce-list is optional: only a key parse error is fatal here */
		if (try_next_dict_entry(bs_announce_list, bs_announce, err, &m_post_announce)) {
			if (!parse_announce_list()) return errorcontext("parsing torrent announce-list failed");
		} else if (err) {
			return errorcontext("parsing dict key in torrent failed");
		}

		if (try_next_dict_entry(bs_info, bs_announce_list, err, &m_post_announce_list)) {
			/* keep the exact bytes of the info value instead of decoding it */
			size_t curpos = m_buffer.pos();
			if (!skip_value()) return errorcontext("parsing torrent info failed");
			m_raw_info = BufferString(m_buffer.data() + curpos, m_buffer.pos() - curpos);
		} else if (err) {
			return errorcontext("parsing dict key in torrent failed");
		} else {
			return seterror("no info key in torrent");
		}

		/* this check concerns the top-level dict, so the message must not talk about file entries */
		if (!goto_dict_end(bs_info, &m_post_info)) return seterror("expected end of torrent dict after info entry");

		return true;
	}

	/*
	 * Write the torrent to `os`: the announce url, the remembered part after
	 * it, the announce-list (only if it is not empty), then the remembered
	 * part after the list, the raw info value and the remembered rest.
	 */
	void write(std::ostream &os) const {
		TorrentOStream tos(os);

		os << "d8:announce";
		tos << t_announce;

		os << m_post_announce;

		if (!t_announce_list.empty()) {
			os << "13:announce-list";
			tos << t_announce_list;
		}

		os << m_post_announce_list << "4:info" << m_raw_info << m_post_info;
	}

	/* Print announce url, announce-list (groups separated by " -- ") and info hash to stdout. */
	void print_details() {
		std::cout << "Announce-url: " << t_announce << std::endl;
		std::cout << "Announce-list:\n";
		for (size_t i = 0; i < t_announce_list.size(); i++) {
			if (i > 0) std::cout << " -- \n";
			for (size_t j = 0; j < t_announce_list[i].size(); j++) {
				std::cout << " - " << t_announce_list[i][j] << "\n";
			}
		}

		std::cout << "Info-Hash: " << infohash() << std::endl;
	}

private:
	/*
	 * Filled in by try_next_dict_entry()/goto_dict_end() during load() -
	 * presumably the raw entries skipped after "announce", after
	 * "announce-list" and after "info"; write() copies them out verbatim
	 * at those positions.
	 */
	BufferString m_post_announce, m_post_announce_list, m_post_info;
};

/* Stream insertion for TorrentAnnounceInfo: forwards to its write() and returns the stream for chaining. */
std::ostream& operator<<(std::ostream &os, const TorrentAnnounceInfo &t) {
	t.write(os);
	return os;
}

/*
 * only read announce urls, don't verify any data after announce-list
 *
 * very fast as the load method doesn't read the whole torrent from disk:
 * everything following the announce data is only remembered as a byte range
 * and copied out again by write().
 */
class TorrentAnnounce : public TorrentBase {
public:
	TorrentAnnounce() {
	}

	/*
	 * Load `filename` and parse "announce" plus an optional "announce-list".
	 * Returns false on failure; the reason is recorded through
	 * seterror()/errorcontext().
	 */
	bool load(const std::string &filename) {
		if (!loadfile(filename)) return false;

		if (!m_buffer.tryNext("d8:announce")) {
			return seterror("doesn't look like a valid torrent, expected 'd8:announce'");
		}
		if (!read_utf8(t_announce)) {
			return errorcontext("parsing torrent announce failed");
		}

		bool keyerror;
		if (!try_next_dict_entry(bs_announce_list, bs_announce, keyerror, &m_post_announce)) {
			/* a missing announce-list is fine, a broken dict key is not */
			if (keyerror) return errorcontext("parsing dict key in torrent failed");
		} else if (!parse_announce_list()) {
			return errorcontext("parsing torrent announce-list failed");
		}

		/* just remember which part we skipped - this doesn't actually read anything */
		const size_t offset = m_buffer.pos();
		m_post_announce_list = BufferString(m_buffer.data() + offset, m_buffer.len() - offset);

		return true;
	}

	/*
	 * Write the torrent to `os`: announce url, the remembered part after it,
	 * the announce-list (only if it is not empty) and the remembered rest.
	 */
	void write(std::ostream &os) const {
		TorrentOStream tos(os);

		os << "d8:announce";
		tos << t_announce;
		os << m_post_announce;

		const bool have_list = !t_announce_list.empty();
		if (have_list) {
			os << "13:announce-list";
			tos << t_announce_list;
		}

		os << m_post_announce_list;
	}

	/* Print announce url and announce-list (groups separated by " -- ") to stdout. */
	void print_details() {
		std::cout << "Announce-url: " << t_announce << std::endl;
		std::cout << "Announce-list:\n";

		for (size_t group = 0; group < t_announce_list.size(); ++group) {
			if (0 != group) std::cout << " -- \n";

			for (size_t url = 0; url < t_announce_list[group].size(); ++url) {
				std::cout << " - " << t_announce_list[group][url] << "\n";
			}
		}
	}

private:
	/*
	 * m_post_announce is filled in by try_next_dict_entry() during load();
	 * m_post_announce_list is the byte range from the parse position after
	 * the announce data up to the end of the buffer.
	 */
	BufferString m_post_announce, m_post_announce_list;
};

/* Stream insertion for TorrentAnnounce: forwards to its write() and returns the stream for chaining. */
std::ostream& operator<<(std::ostream &os, const TorrentAnnounce &t) {
	t.write(os);
	return os;
}

void TorrentBase::sanitize_announce_urls(const TorrentSanitize &san, const TorrentBase *mergefromother) {
	AnnounceList list(san);
	list.force_merge(san.additional_announce_urls);
	list.merge(*this);

	if (0 != mergefromother) list.merge(*mergefromother);

	t_announce_list.clear();

	if (list.list.empty()) {
		std::string hash = infohash();
		for (size_t i = 0; i < hash.length(); i++) hash[i] = ::toupper(hash[i]);
		t_announce = std::string("dht://") + hash;
	} else {
		t_announce = list.list[0][0];
		t_announce_list = list.list;
	}
}

/*
 * Write `t` (anything that can be streamed with operator<<) to `filename`
 * without ever exposing a partially written file under that name:
 * the data goes to a fresh "<filename>.XXXXXX" temp file created with
 * mkstemp(), which is renamed over `filename` only after the write succeeded.
 *
 * Returns true on success. On failure a message is printed to stderr, the
 * temp file is removed again and false is returned; `filename` is not
 * replaced in that case.
 */
template<typename T> bool writeAtomicFile(const std::string &filename, const T &t) {
	/* mkstemp() rewrites the XXXXXX part in place, so it needs a writable, NUL-terminated buffer;
	 * the vector releases it on every return path */
	static const char suffix[] = ".XXXXXX";
	std::vector<char> tmpbuf(filename.begin(), filename.end());
	tmpbuf.insert(tmpbuf.end(), suffix, suffix + sizeof(suffix));
	char *tmpfname = &tmpbuf[0];

	int fd = ::mkstemp(tmpfname);
	if (-1 == fd) {
		int e = errno;
		std::cerr << "Cannot create secure tempfile '" << tmpfname << "': " << ::strerror(e) << std::endl;
		return false;
	}

	{
		std::ofstream os(tmpfname, std::ios_base::binary | std::ios_base::trunc | std::ios_base::out);
		if (os.fail()) {
			int e = errno;
			std::cerr << "Cannot open file '" << tmpfname << "': " << ::strerror(e) << std::endl;
			::close(fd);
			::unlink(tmpfname);
			return false;
		}
		os << t;
		os.close();
		/* never rename an incomplete file over the target (e.g. disk full) */
		if (os.fail()) {
			int e = errno;
			std::cerr << "Cannot write file '" << tmpfname << "': " << ::strerror(e) << std::endl;
			::close(fd);
			::unlink(tmpfname);
			return false;
		}
	}

	/* mkstemp() creates the file readable by the owner only; widening the mode is best effort */
	if (-1 == ::fchmod(fd, 0644)) {
		int e = errno;
		std::cerr << "Cannot set permissions on tempfile '" << tmpfname << "': " << ::strerror(e) << std::endl;
	}
	::close(fd);

	if (-1 == ::rename(tmpfname, filename.c_str())) {
		int e = errno;
		std::cerr << "Cannot rename tempfile '" << tmpfname << "' to '" << filename << "': " << ::strerror(e) << std::endl;
		::unlink(tmpfname);
		return false;
	}

	return true;
}

/* Print the command line usage to stderr and terminate the process with exit status 100. */
void syntax() {
	static const char usage[] =
		"Syntax: \n"
		"\tprint info (default command):\n"
		"\t\ttorrent-sanitize -i [-f] [-d] [-v] file.torrent\n"
		"\n"
		"\tsanitize:\n"
		"\t\ttorrent-sanitize -s [-d] [-v] infile.torrent outfile.torrent\n"
		"\n"
		"\tcalculate info hash / show announce urls:\n"
		"\t\ttorrent-sanitize [-h] [-u] file.torrent\n"
		"\n"
		"\t\t -f: show files\n"
		"\t\t -d: debug mode\n"
		"\t\t -v: verify strict: utf-8 checks (more may come)\n";

	std::cerr << usage;
	exit(100);
}

/* Print the announce url of `t` followed by every url of its announce-list, one per line, to stdout. */
template<typename T> static void print_announce_urls(const T &t) {
	std::cout << t.t_announce << "\n";
	for (size_t i = 0; i < t.t_announce_list.size(); i++) {
		for (size_t j = 0; j < t.t_announce_list[i].size(); j++) {
			std::cout << t.t_announce_list[i][j] << "\n";
		}
	}
}

/*
 * Command line entry point.
 *
 * Modes, checked in this order:
 *   -s       sanitize infile.torrent into outfile.torrent
 *   (none)   print torrent details; also the default when neither -h nor -u is given, or with -i
 *   -h       print the info hash, with -u also the announce urls
 *   -u       print the announce urls only
 *
 * Exit status: 0 on success, 1 if the torrent could not be loaded or the
 * sanitized torrent could not be written, 100 (via syntax()) on usage errors.
 */
int main(int argc, char **argv) {
	/* opt_show_info starts as -1 ("not decided"): it stays truthy unless -h/-u turn it off */
	int opt_sanitize = 0, opt_info_hash = 0, opt_show_urls = 0, opt_show_info = -1, opt_show_files = 0, opt_debug = 0, opt_verify = 0;

	int c;
	while (-1 != (c = getopt(argc, argv, "ifdvshu"))) {
		switch (c) {
		case '?':
			syntax();
			break;
		case 'i':
			opt_show_info = 1;
			break;
		case 'f':
			opt_show_files = 1;
			break;
		case 'd':
			opt_debug = 1;
			break;
		case 'v':
			opt_verify = 1;
			break;
		case 's':
			opt_sanitize = 1;
			break;
		case 'h':
			opt_info_hash = 1;
			if (-1 == opt_show_info) opt_show_info = 0;
			break;
		case 'u':
			opt_show_urls = 1;
			if (-1 == opt_show_info) opt_show_info = 0;
			break;
		}
	}

	int filenames = argc - optind;

	if (opt_sanitize) {
		if (2 != filenames) syntax();

		TorrentSanitize san;

		san.show_paths = false;
		san.debug = opt_debug;
		san.filter_meta_text.load("comment|created by");
		san.filter_meta_num.load("creation date");
		san.filter_meta_other.load("");
		san.check_info_utf8 = opt_verify;

		san.add_new_meta_entry(std::string("comment"), std::string("Torrent downloaded from torrent cache at http://torrage.com"));
		san.additional_announce_urls.push_back(std::string("udp://tracker.thepiratebay.org:80/announce"));
		san.additional_announce_urls.push_back(std::string("udp://tracker.openbittorrent.com:80/announce"));

		Torrent t(san);
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		t.sanitize_announce_urls(san);
		/* writeAtomicFile() already printed the reason; make the failure visible in the exit status */
		if (!writeAtomicFile(std::string(argv[optind+1]), t)) return 1;
	} else if (opt_show_info) {
		/* show info */
		if (1 != filenames) syntax();

		TorrentSanitize san;

		san.show_paths = opt_show_files;
		san.debug = opt_debug;
		san.filter_meta_text.load("comment|created by");
		san.filter_meta_num.load("creation date");
		san.filter_meta_other.load("");
		san.check_info_utf8 = opt_verify;

		Torrent t(san);
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		t.print_details();
	} else if (opt_info_hash) {
		/* show info hash, optionally urls */
		if (1 != filenames) syntax();

		TorrentAnnounceInfo t;
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		std::cout << t.infohash() << std::endl;
		if (opt_show_urls) print_announce_urls(t);
	} else {
		/* show urls */
		if (1 != filenames) syntax();

		TorrentAnnounce t;
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		print_announce_urls(t);
	}
	return 0;
}

	override LDFLAGS += -lssl -lpcrecpp
	override CPPFLAGS += -O2 -Wall

	all: torrent-sanitize

	clean:
	rm torrent-sanitize

	.PHONY: all clean