Skip to content

Instantly share code, notes, and snippets.

@stbuehler
Created April 11, 2011 00:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stbuehler/912884 to your computer and use it in GitHub Desktop.
Save stbuehler/912884 to your computer and use it in GitHub Desktop.
analyse/fix torrent files
# Builds torrent-sanitize from torrent-sanitize.cpp through make's implicit rule.
# Libraries go into LDLIBS so they end up after the source file on the link
# line; SHA1() is provided by libcrypto, so it is linked explicitly.
override LDLIBS += -lssl -lcrypto -lpcrecpp
override CXXFLAGS += -O2 -Wall
all: torrent-sanitize
clean:
	rm -f torrent-sanitize
.PHONY: all clean
/*
goal: fix torrents (if possible): cleanup announce urls, set custom comments, filter meta fields ...
compile:
LDLIBS='-lssl -lcrypto -lpcrecpp' CXXFLAGS='-O2 -Wall' make torrent-sanitize
needs the OpenSSL development headers (openssl/sha.h, for SHA1) and the PCRE C++ wrapper headers (pcrecpp.h)
*/
/* using MMAP is dangerous if files are not updated atomically;
* atomic file update works like this:
* write to "<filename>.tmp$$"
* mv "<filename>.tmp$$" -> "<filename>"
* (the tmp file should be on the same filesystem)
* MMAP is only used to read files - it delays reading the actual content until
* the code accesses the memory
*/
#define USE_MMAP
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <iostream>
#include <sstream>
#include <fstream>
#include <limits>
#include <pcrecpp.h>
extern "C" {
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <openssl/sha.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdio.h>
#ifdef USE_MMAP
# include <sys/mman.h>
#endif
}
/**
 * Structural UTF-8 check for the byte range [_s, _s + len).
 *
 * Every lead byte must be followed by exactly the number of continuation
 * bytes (10xxxxxx) announced by its bit pattern. This is a shape check only:
 * the legacy 5- and 6-byte forms are accepted, and overlong encodings or
 * surrogates are not rejected.
 *
 * @param _s  start of the data (does not have to be NUL terminated)
 * @param len number of bytes to inspect
 * @return true if all sequences are complete and well formed
 */
bool validUTF8(const char *_s, size_t len) {
	const unsigned char *s = (const unsigned char*) _s;
	for (size_t i = 0; i < len; i++) {
		size_t seqlen; /* number of continuation bytes expected after s[i] */
		if (0 == (s[i] & 0x80)) { seqlen = 0; }
		else if (0xC0 == (s[i] & 0xE0)) { seqlen = 1; }
		else if (0xE0 == (s[i] & 0xF0)) { seqlen = 2; }
		else if (0xF0 == (s[i] & 0xF8)) { seqlen = 3; }
		else if (0xF8 == (s[i] & 0xFC)) { seqlen = 4; }
		else if (0xFC == (s[i] & 0xFE)) { seqlen = 5; }
		else return false; /* stray continuation byte or 0xFE/0xFF */
		/* the last continuation byte is s[i + seqlen]; it has to be inside the buffer */
		if (seqlen >= len - i) return false;
		for ( ; seqlen > 0; seqlen--, i++) {
			/* bitwise mask: continuation bytes look like 10xxxxxx */
			if (0x80 != (s[i+1] & 0xC0)) return false;
		}
	}
	return true;
}
/* Convenience overload: runs the byte-range check over the whole string. */
bool validUTF8(std::string s) {
	const char *bytes = s.c_str();
	const size_t count = s.length();
	return validUTF8(bytes, count);
}
/**
 * Tests whether s starts with a string literal.
 *
 * N is the size of the literal's array including its terminating NUL, so the
 * prefix that is compared has N-1 bytes.
 */
template<size_t N> bool stringHasPrefix(const std::string &s, const char (&prefix)[N]) {
	const size_t prefixLen = N - 1;
	if (s.length() < prefixLen) return false;
	return 0 == s.compare(0, prefixLen, prefix, prefixLen);
}
/**
 * Whole-file read buffer with a parse cursor.
 *
 * load() makes the content of a regular file available through m_data, either
 * as a private read-only mapping (USE_MMAP defined) or as a heap copy. m_pos
 * is the cursor the parser classes advance; the accessors below never move it
 * past m_len. An empty file loads successfully with data() == 0 and len() == 0.
 */
class Buffer {
private:
	/* not copyable: the object owns the mapping / heap block behind m_data */
	Buffer(const Buffer &b);
	Buffer& operator=(const Buffer &b);
public:
	Buffer() :m_data(0), m_len(0), m_pos(0) {
	}

	/**
	 * Replace the current content with the content of filename.
	 * Errors are reported on std::cerr.
	 * @return false if the file cannot be opened, is not a regular file, is
	 *         too large or cannot be mapped/read; the buffer is empty then
	 */
	bool load(const std::string &filename) {
		clear();
		m_filename = filename;
		int fd;
		if (-1 == (fd = ::open(m_filename.c_str(), O_RDONLY))) {
			int e = errno;
			std::cerr << "Cannot open file '" << m_filename << "': " << ::strerror(e) << std::endl;
			return false;
		}
		if (-1 == ::fstat(fd, &filestat)) {
			int e = errno;
			std::cerr << "Cannot stat file '" << m_filename << "': " << ::strerror(e) << std::endl;
			close(fd);
			return false;
		}
		if (!S_ISREG(filestat.st_mode)) {
			std::cerr << "Not a regular file: '" << m_filename << "'" << std::endl;
			close(fd);
			return false;
		}
		if (filestat.st_size > std::numeric_limits<ssize_t>::max()) {
			std::cerr << "File too big: '" << m_filename << "': " << filestat.st_size << " > " << std::numeric_limits<ssize_t>::max() << std::endl;
			close(fd);
			return false;
		}
		const size_t size = filestat.st_size;
		if (0 == size) {
			/* nothing to map or read; mmap() rejects zero-length mappings */
			close(fd);
			return true;
		}
#ifdef USE_MMAP
#ifndef MAP_POPULATE
# define MAP_POPULATE 0
#endif
		void *mapping = mmap(NULL, size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, fd, 0);
		if (MAP_FAILED == mapping) { /* mmap() signals failure with MAP_FAILED, not with NULL */
			int e = errno;
			std::cerr << "Cannot mmap file '" << m_filename << "': " << strerror(e) << std::endl;
			close(fd);
			return false;
		}
		close(fd); /* the mapping stays valid after the descriptor is closed */
		m_data = (char*) mapping;
		m_len = size;
#else
		char *content = new char[size];
		size_t done = 0;
		while (done < size) { /* read() may deliver less than requested */
			ssize_t r = ::read(fd, content + done, size - done);
			if (r > 0) { done += r; continue; }
			if (-1 == r && EINTR == errno) continue;
			int e = errno;
			if (0 == r) {
				std::cerr << "Cannot read file '" << m_filename << "': unexpected end of file" << std::endl;
			} else {
				std::cerr << "Cannot read file '" << m_filename << "': " << strerror(e) << std::endl;
			}
			delete[] content;
			close(fd);
			return false;
		}
		close(fd);
		m_data = content;
		m_len = size;
#endif
		return true;
	}

	/**
	 * If the bytes at the cursor equal the string literal cstr (without its
	 * terminating NUL) advance the cursor past them and return true; otherwise
	 * leave the cursor alone and return false.
	 */
	template< std::size_t n > bool tryNext( const char (&cstr)[n] ) {
		size_t len = sizeof(cstr)/sizeof(char);
		if (0 == len) return false;
		len--; /* do not compare the terminating NUL */
		if (len > m_len - m_pos) return false;
		if (0 == len) return true; /* empty literal; avoids memcmp() on an unloaded buffer */
		if (0 != memcmp(cstr, m_data + m_pos, len)) return false;
		m_pos += len;
		return true;
	}

	/* true if the cursor is not at the end and points at c */
	bool isNext(char c) {
		return !eof() && (c == m_data[m_pos]);
	}

	bool eof() const {
		return m_pos >= m_len;
	}

	/* release the file content and reset length and cursor */
	void clear() {
#ifdef USE_MMAP
		if (0 != m_data) munmap(m_data, m_len);
#else
		delete[] m_data; /* allocated with new[] */
#endif
		m_data = 0; m_len = m_pos = 0;
	}

	size_t pos() const { return m_pos; }
	size_t len() const { return m_len; }
	/* NOTE: the content is not NUL terminated; always combine with len() */
	const char* data() const { return m_data; }
	const char* c_str() const { return m_data; }

	/* advance the cursor by one byte unless it is already at the end */
	void next() { if (m_pos < m_len) m_pos++; }
	/* byte at the cursor, or '\0' at the end of the buffer */
	char current() const { return !eof() ? m_data[m_pos] : '\0'; }

	~Buffer() {
		clear();
	}

	struct stat filestat; /* result of fstat() from the last load() that got that far */

private:
	friend class Torrent;
	friend class TorrentBase;

	std::string m_filename;
	char *m_data;
	size_t m_len, m_pos;
};
/**
 * Non-owning (pointer, length) view of a byte string.
 *
 * The view never copies or frees the bytes it points to; whoever creates it
 * has to keep the underlying storage (file buffer, string literal,
 * std::string) alive for as long as the view is used. The bytes are not
 * necessarily NUL terminated.
 */
class BufferString {
public:
	/* empty view: null pointer, length 0 */
	BufferString()
	: m_data(0), m_len(0) { }

	/* view of a string literal / char array, without its terminating NUL */
	template< std::size_t n > explicit BufferString(const char (&data)[n])
	: m_data(data), m_len(sizeof(data)/sizeof(char) > 0 ? sizeof(data)/sizeof(char) - 1 : 0) {
	}

	BufferString(const char *data, size_t len)
	: m_data(data), m_len(len) {
	}

	/* view into s; becomes dangling when s is modified or destroyed */
	explicit BufferString(const std::string &s)
	: m_data(s.c_str()), m_len(s.length()) { }

	/* owning copy of the viewed bytes */
	std::string toString() const {
		if (0 == m_data || 0 == m_len) return std::string();
		return std::string(m_data, m_len);
	}

	pcrecpp::StringPiece toStringPiece() const {
		return pcrecpp::StringPiece(m_data, m_len);
	}

	/* SHA-1 of the viewed bytes as 40 upper-case hex digits */
	std::string sha1() const {
		unsigned char raw[20];
		::SHA1((const unsigned char*) m_data, m_len, raw);
		std::string hex(40, '.');
		const char hexchar[] = "0123456789ABCDEF";
		for (int i = 0; i < 20; i++) {
			hex[2*i] = hexchar[raw[i] >> 4];
			hex[2*i+1] = hexchar[raw[i] & 0xf];
		}
		return hex;
	}

	bool validUTF8() const {
		return ::validUTF8(m_data, m_len);
	}

	size_t length() const {
		return m_len;
	}

	const char* data() const { return m_data; }
	const char* c_str() const { return m_data; }

	/* bounds-checked access: indices at or past length() yield '\0' */
	char operator[](size_t ndx) const {
		if (ndx >= m_len) return '\0';
		return m_data[ndx];
	}

private:
	friend class TorrentBase;
	friend class Torrent;
	friend class TorrentSanitize;
	friend bool operator<(const BufferString &a, const BufferString &b);
	friend bool operator<=(const BufferString &a, const BufferString &b);
	friend bool operator==(const BufferString &a, const BufferString &b);

	const char *m_data;
	size_t m_len;
};
/*
 * Byte-wise lexicographic "less than": the common prefix decides, and on a
 * tie the shorter string sorts first.
 */
bool operator<(const BufferString &a, const BufferString &b) {
	const size_t common = std::min(a.m_len, b.m_len);
	/* a default-constructed BufferString has a null data pointer, which must
	 * not be handed to memcmp() even for a length of 0 */
	const int i = (0 == common) ? 0 : memcmp(a.m_data, b.m_data, common);
	if (i < 0) return true;
	if (i == 0 && a.m_len < b.m_len) return true;
	return false;
}
/* a > b holds exactly when b < a; delegates to operator< with swapped operands */
bool operator>(const BufferString &a, const BufferString &b) {
	const bool rhsSortsFirst = (b < a);
	return rhsSortsFirst;
}
/*
 * Byte-wise lexicographic "less than or equal": the common prefix decides,
 * and on a tie a is accepted if it is not longer than b.
 */
bool operator<=(const BufferString &a, const BufferString &b) {
	const size_t common = std::min(a.m_len, b.m_len);
	/* a default-constructed BufferString has a null data pointer, which must
	 * not be handed to memcmp() even for a length of 0 */
	const int i = (0 == common) ? 0 : memcmp(a.m_data, b.m_data, common);
	if (i < 0) return true;
	if (i == 0 && a.m_len <= b.m_len) return true;
	return false;
}
/* a >= b holds exactly when b <= a; delegates to operator<= with swapped operands */
bool operator>=(const BufferString &a, const BufferString &b) {
	const bool rhsNotAfter = (b <= a);
	return rhsNotAfter;
}
/* Equal if both views have the same length and the same bytes. */
bool operator==(const BufferString &a, const BufferString &b) {
	if (a.m_len != b.m_len) return false;
	/* two empty views are equal; their data pointers may be null and must not
	 * be handed to memcmp() */
	if (0 == a.m_len) return true;
	return (0 == memcmp(a.m_data, b.m_data, a.m_len));
}
/* negation of operator== */
bool operator!=(const BufferString &a, const BufferString &b) {
	if (a == b) return false;
	return true;
}
/* Writes the viewed bytes unformatted (no length prefix, no escaping). */
std::ostream& operator<<(std::ostream &os, const BufferString &b) {
	const char *bytes = b.data();
	const size_t count = b.length();
	os.write(bytes, count);
	return os;
}
/* Dictionary keys (and the empty string) as BufferString views of string
 * literals, used as search keys and ordering bounds by the parser and writer. */
static const BufferString bs_empty("");
static const BufferString bs_announce("announce");
static const BufferString bs_announce_list("announce-list");
static const BufferString bs_info("info");
static const BufferString bs_encoding("encoding");
static const BufferString bs_files("files");
static const BufferString bs_length("length");
static const BufferString bs_name("name");
static const BufferString bs_path("path");
static const BufferString bs_piece_length("piece length");
static const BufferString bs_pieces("pieces");
static const BufferString bs_private("private");
/**
 * Thin wrapper around a std::ostream whose operator<< writes values in
 * bencoded form: strings as "<length>:<bytes>", integers as "i<number>e" and
 * vectors as "l<items>e". The wrapped stream is only referenced, not owned.
 */
class TorrentOStream {
public:
	explicit TorrentOStream(std::ostream &os) : os(os) { }

	std::ostream &os;

	TorrentOStream& operator<<(const std::string &s) {
		os << s.length() << ":" << s; return *this;
	}
	TorrentOStream& operator<<(const BufferString &s) {
		os << s.length() << ":"; os.write(s.data(), s.length()); return *this;
	}
	/* string literal / char array: written without the terminating NUL */
	template< std::size_t n > TorrentOStream& operator<<(const char (&data)[n]) {
		if (n > 0) { os << (n-1) << ":"; os.write(data, n-1); } else { os << "0:"; }
		return *this;
	}
	TorrentOStream& operator<<(int64_t num) {
		os << "i" << num << "e"; return *this;
	}
	/* list of anything this class can write, including nested vectors;
	 * taken by reference so that nested lists are not deep-copied per call */
	template<typename T> TorrentOStream& operator<<(const std::vector< T > &list) {
		os << "l";
		for (size_t i = 0; i < list.size(); i++) *this << list[i];
		os << "e";
		return *this;
	}
};
/* dictionary key -> raw bencoded "key + value" bytes, ordered by key */
typedef std::map<std::string, std::string> TorrentRawParts;
/* one entry of the info "files" list: (path, length in bytes) */
typedef std::pair<std::string, int64_t> File;
/**
 * Optional compiled regular expression. Without a loaded pattern matches()
 * is always false; with one, the whole input has to match (FullMatch).
 *
 * NOTE(review): the class owns m_re through a raw pointer but is still
 * copyable; a copy would delete the same pcrecpp::RE twice. No copy is made
 * in the code visible here - confirm before adding one.
 */
class PCRE {
public:
	PCRE() : m_re(0) { }
	~PCRE() { clear(); }

	/* drop the loaded pattern (if any) */
	void clear() {
		delete m_re;
		m_re = 0;
	}

	/**
	 * Compile pattern, replacing a previously loaded one.
	 * An invalid pattern is reported on std::cerr and terminates the process
	 * with exit status 5.
	 */
	void load(const char *pattern) {
		clear(); /* do not leak the pattern of an earlier load() */
		m_re = new pcrecpp::RE(pattern);
		if (!m_re->error().empty()) {
			std::cerr << "Couldn't parse pcre pattern: " << m_re->error() << std::endl;
			clear();
			exit(5);
		}
	}

	bool matches(BufferString str) const {
		if (0 == m_re) return false;
		return m_re->FullMatch(str.toStringPiece());
	}
	bool matches(const std::string &str) const {
		if (0 == m_re) return false;
		return m_re->FullMatch(str);
	}

private:
	pcrecpp::RE *m_re;
};
/**
 * Settings that control how a torrent is cleaned up: which additional meta
 * entries are kept, which announce urls are accepted, which entries are
 * added, and a few output/checking switches.
 */
class TorrentSanitize {
public:
	TorrentSanitize() : debug(false), show_paths(false), check_info_utf8(true) {
	}

	/**
	 * A meta key is acceptable if it is non-empty, consists of printable
	 * ASCII only (0x20..0x7e) and is not one of the keys that
	 * new_meta_entries is going to provide anyway.
	 */
	bool validMetaKey(BufferString key) const {
		if (0 == key.length()) return false;
		for (size_t i = 0; i < key.length(); i++) {
			/* explicit range check on the unsigned value: no control
			 * characters, no DEL, nothing outside ASCII (and no negative
			 * char passed to a <ctype.h> function) */
			const unsigned char c = (unsigned char) key[i];
			if (c < 0x20 || c > 0x7e) return false;
		}
		if (new_meta_entries.end() != new_meta_entries.find(key.toString())) return false;
		return true;
	}

	/* the three filters decide how the value of an accepted key is treated */
	bool validMetaTextKey(BufferString key) const {
		return filter_meta_text.matches(key);
	}
	bool validMetaNumKey(BufferString key) const {
		return filter_meta_num.matches(key);
	}
	bool validMetaOtherKey(BufferString key) const {
		return filter_meta_other.matches(key);
	}

	/**
	 * Decide whether url is usable as an announce url and compute the key
	 * that is used to group urls.
	 *
	 * dht:// urls and strings without "://" are rejected. For udp, http and
	 * https urls domain is set to the last two dot-separated labels of the
	 * host (a trailing dot is ignored; hosts without a dot are used as a
	 * whole). For any other scheme domain is the complete url.
	 *
	 * @param url    url to inspect
	 * @param domain out: grouping key; only meaningful if true is returned
	 */
	bool validURL(const std::string &url, std::string &domain) const {
		size_t domainstart;
		if (stringHasPrefix(url, "dht://")) return false; /* drop dht urls - fallback if no other url is found */
		if (stringHasPrefix(url, "udp://")) {
			domainstart = 6;
		} else if (stringHasPrefix(url, "http://")) {
			domainstart = 7;
		} else if (stringHasPrefix(url, "https://")) {
			domainstart = 8;
		} else {
			if (std::string::npos == url.find("://")) return false; /* not an url */
			/* unknown scheme */
			domain = url;
			return true;
		}

		size_t pos, dot;
		/* the host ends at the first ':' (port) or '/' (path), whichever
		 * comes first; a ':' inside the path or query must not be taken for
		 * the port separator */
		pos = url.find_first_of(":/", domainstart);
		if (std::string::npos != pos) { pos--; } else { pos = url.length() - 1; }
		if (url[pos] == '.') pos--; /* ignore trailing dot */
		/* extract "domain.tld" (without subdomains); pos points to last character of domain name */
		dot = url.find_last_of('.', pos);
		if (std::string::npos != dot) dot = url.find_last_of('.', dot-1);
		if (std::string::npos == dot) { dot = domainstart; } else { dot++; }
		domain = url.substr(dot, pos - dot + 1);
		// std::cerr << "domain for '" << url << "' is '" << domain << "'\n";
		return true;
	}

	bool whitelistURL(const std::string &url) const {
		return filter_whitelist.matches(url);
	}
	bool blacklistURL(const std::string &url) const {
		return filter_blacklist.matches(url);
	}

	/* store the bencoded "key + value" pair under key; an entry that already
	 * exists for key is kept */
	template<typename Value> void add_new_meta_entry(const std::string &key, const Value &value) {
		std::ostringstream raw;
		TorrentOStream(raw) << key << value;
		new_meta_entries.insert(std::make_pair(key, raw.str()));
	}

	TorrentRawParts new_meta_entries;

	bool debug;
	bool show_paths; /* build paths for file entries, joined with '/', show them later */
	bool check_info_utf8; /* as we can't modify the info part, optionally disable struct utf-8 checks */

	PCRE filter_meta_text, filter_meta_num, filter_meta_other;
	PCRE filter_whitelist, filter_blacklist;

	std::vector< std::string > additional_announce_urls;

private:
	std::vector< std::string > m_alloced_strings;
};
/**
 * Bencode reader on top of a Buffer plus the announce data shared by all
 * torrent representations.
 *
 * All read_ / skip_ / parse_ helpers follow the same convention: on success
 * they return true and leave the buffer cursor behind the consumed value; on
 * failure they return false, record a message retrievable through
 * lasterror(), and the cursor is not advanced by the failing primitive
 * (read_string, read_number, skip_number).
 */
class TorrentBase {
public:
	TorrentBase()
	: m_check_info_utf8(true), m_nesting(0) { }

	std::string lasterror() { return m_lasterror; }
	std::string filename() { return m_buffer.m_filename; }
	/* SHA-1 of the raw info dict as hex string, computed on first use; empty
	 * as long as no info dict has been recorded */
	std::string infohash() { if (m_info_hash.empty() && m_raw_info.length() > 0) m_info_hash = m_raw_info.sha1(); return m_info_hash; }

	std::string t_announce;
	std::vector< std::vector< std::string > > t_announce_list;

	void sanitize_announce_urls(const TorrentSanitize &san, const TorrentBase *mergefromother = 0);

protected:
	/* lists/dicts nested deeper than this are rejected by skip_value(); the
	 * skip functions recurse once per level and the input is untrusted */
	enum { max_nesting = 256 };

	bool m_check_info_utf8;
	size_t m_nesting; /* current recursion depth of skip_value() */
	Buffer m_buffer;

	bool loadfile(const std::string &filename) {
		if (!m_buffer.load(filename)) return seterror("couldn't load file");
		return true;
	}

	BufferString m_raw_info; /* raw bytes of the info dict inside m_buffer */
	std::string m_info_hash;
	std::string m_lasterror;

	/**
	 * Parse the value of "announce-list" into t_announce_list.
	 * The regular form is a list of lists of strings. Two broken forms are
	 * accepted with a warning on std::cerr: a single string instead of the
	 * list, and strings placed directly in the outer list (they are appended
	 * to the most recent inner list, or to a new one if there is none yet).
	 */
	bool parse_announce_list() {
		std::string s;
		if (m_buffer.eof()) return seterror("expected announce-list, found eof");
		if (!m_buffer.isNext('l')) {
			if (!read_utf8(s)) return errorcontext("expected announce-list, neither list nor string found");
			std::cerr << "broken Announce-list, found single string" << std::endl;
			t_announce_list.push_back(std::vector<std::string>());
			t_announce_list.back().push_back(s);
		} else {
			bool warned1 = false;
			m_buffer.next();
			while (!m_buffer.eof() && !m_buffer.isNext('e')) {
				if (!m_buffer.isNext('l')) {
					if (!read_utf8(s)) return errorcontext("expected announce-list list, neither list nor string found");
					if (!warned1) {
						std::cerr << "broken announce-list, found string in list, should be in separate list" << std::endl;
						warned1 = true;
					}
					if (t_announce_list.empty()) t_announce_list.push_back(std::vector<std::string>());
					t_announce_list.back().push_back(s);
				} else {
					t_announce_list.push_back(std::vector<std::string>());
					m_buffer.next();
					while (!m_buffer.eof() && !m_buffer.isNext('e')) {
						if (!read_utf8(s)) return errorcontext("expected announce-list list entry");
						t_announce_list.back().push_back(s);
					}
					if (!m_buffer.isNext('e')) return seterror("expected announce-list list, found eof");
					m_buffer.next();
				}
			}
			if (!m_buffer.isNext('e')) return seterror("expected announce-list lists, found eof");
			m_buffer.next();
		}
		return true;
	}

	/* wrap the current error message with an outer description; always returns false */
	bool errorcontext(const char msg[]) {
		std::ostringstream oss;
		oss << "Error: " << msg << "\n in " << m_lasterror;
		m_lasterror = oss.str();
		return false;
	}

	/* record msg together with the cursor position and up to 16 bytes of the
	 * input following it; always returns false */
	bool seterror(const char msg[]) {
		std::ostringstream oss;
		std::string context = std::string(m_buffer.m_data + m_buffer.pos(), std::min<size_t>(16, m_buffer.m_len - m_buffer.pos()));
		oss << "Error @[" << m_buffer.pos() << "/" << m_buffer.m_len << " '" << context << "'...]: " << msg;
		m_lasterror = oss.str();
		return false;
	}

	/**
	 * Read a bencoded string "<decimal length>:<bytes>".
	 * str becomes a view into m_buffer (nothing is copied). Lengths with
	 * leading zeros are rejected, and so are lengths that exceed the rest of
	 * the file.
	 */
	bool read_string(BufferString &str) {
		char c;
		int64_t pos = m_buffer.pos(), len = m_buffer.m_len;
		int64_t slen = 0;
		str.m_data = 0; str.m_len = 0;

		if (pos >= len) return seterror("expected string length, found eof");
		c = m_buffer.m_data[pos++];
		if (c < '0' || c > '9') return seterror("expected digit for string length");
		if (pos >= len) return seterror("expected string length, found eof");
		if (c == '0' && m_buffer.m_data[pos] != ':') return seterror("expected string length, found leading zero of non zero length (no following ':')");
		slen = (c - '0');
		for (;;) {
			/* pos < len holds here: checked before the loop and at the end of every iteration */
			c = m_buffer.m_data[pos++];
			if (c == ':') break;
			if (c < '0' || c > '9') return seterror("expected digit or colon for string length");
			if (pos >= len) return seterror("expected string length, found eof");
			/* checked before multiplying, so slen stays far away from the int64_t limit */
			if (slen > std::numeric_limits<int32_t>::max()) return seterror("string length overflow");
			slen = 10*slen + (c - '0');
		}
		if (slen > len || slen > len - pos) return seterror("file not large enough for string length"); /* overflow */
		str.m_data = m_buffer.m_data + pos;
		str.m_len = slen;
		m_buffer.m_pos = pos + slen;
		return true;
	}
	/* same as above, but copies the bytes into str */
	bool read_string(std::string &str) {
		BufferString tmp;
		if (!read_string(tmp)) return false;
		str = tmp.toString();
		return true;
	}
	bool skip_string() {
		BufferString tmp;
		return read_string(tmp);
	}

	/* read_string() plus UTF-8 validation; the cursor is restored if the
	 * string turns out not to be valid UTF-8 */
	bool read_utf8(BufferString &str) {
		size_t pos = m_buffer.pos();
		if (!read_string(str)) return false;
		if (!str.validUTF8()) { m_buffer.m_pos = pos; return seterror("string not valid utf-8"); }
		return true;
	}
	bool read_utf8(std::string &str) {
		size_t pos = m_buffer.pos();
		BufferString tmp;
		if (!read_string(tmp)) return false;
		if (!tmp.validUTF8()) { m_buffer.m_pos = pos; return seterror("string not valid utf-8"); }
		str = tmp.toString();
		return true;
	}
	bool skip_utf8() {
		BufferString tmp;
		return read_utf8(tmp);
	}

	/* strings inside the info dict: validated as UTF-8 only if m_check_info_utf8 is set */
	bool read_info_utf8(BufferString &str) {
		return m_check_info_utf8 ? read_utf8(str) : read_string(str);
	}
	bool read_info_utf8(std::string &str) {
		return m_check_info_utf8 ? read_utf8(str) : read_string(str);
	}
	bool skip_info_utf8() {
		return m_check_info_utf8 ? skip_utf8() : skip_string();
	}

	/**
	 * Read a bencoded integer "i<number>e" into number.
	 * Rejects leading zeros, "-0" and values that do not fit into int64_t.
	 */
	bool read_number(int64_t &number) {
		char c;
		int64_t pos = m_buffer.pos(), len = m_buffer.m_len;
		number = 0;

		if (pos >= len) return seterror("expected number, found eof");
		c = m_buffer.m_data[pos++];
		if (c != 'i') return seterror("expected 'i' for number");
		if (pos >= len) return seterror("expected digit for number, found eof");
		c = m_buffer.m_data[pos++];
		if (pos >= len) return seterror("expected digit, '-' or 'e' for number, found eof");
		if (c == '-') {
			c = m_buffer.m_data[pos++];
			if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
			if (c == '0') return seterror("found leading zero in negative number");
			if (c < '1' || c > '9') return seterror("expected leading digit for negative number");
			/* accumulate as a negative value so that INT64_MIN can be represented */
			number = -(c - '0');
			for (;;) {
				c = m_buffer.m_data[pos++];
				if (c == 'e') break;
				if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
				if (c < '0' || c > '9') return seterror("expected digit or 'e' for number");
				if ((std::numeric_limits<int64_t>::min() + (c - '0')) / 10 > number) return seterror("number too small for int64_t"); /* underflow */
				number = 10*number - (c - '0');
			}
		} else {
			if (c == '0' && m_buffer.m_data[pos] != 'e') return seterror("found leading zero for non zero number");
			if (c < '0' || c > '9') return seterror("expected leading digit for number");
			number = (c - '0');
			for (;;) {
				c = m_buffer.m_data[pos++];
				if (c == 'e') break;
				if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
				if (c < '0' || c > '9') return seterror("expected digit or 'e' for number");
				if ((std::numeric_limits<int64_t>::max() - (c - '0')) / 10 < number) return seterror("number too large for int64_t"); /* overflow */
				number = 10*number + (c - '0');
			}
		}
		m_buffer.m_pos = pos;
		return true;
	}

	/* same syntax checks as read_number(), but the value is not computed and
	 * therefore not range checked */
	bool skip_number() {
		char c;
		int64_t pos = m_buffer.pos(), len = m_buffer.m_len;

		if (pos >= len) return seterror("expected number, found eof");
		c = m_buffer.m_data[pos++];
		if (c != 'i') return seterror("expected 'i' for number");
		if (pos >= len) return seterror("expected digit for number, found eof");
		c = m_buffer.m_data[pos++];
		if (pos >= len) return seterror("expected digit, '-' or 'e' for number, found eof");
		if (c == '-') {
			c = m_buffer.m_data[pos++];
			if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
			if (c == '0') return seterror("found leading zero in negative number");
			if (c < '1' || c > '9') return seterror("expected leading digit for negative number");
		} else {
			if (c == '0' && m_buffer.m_data[pos] != 'e') return seterror("found leading zero for non zero number");
			if (c < '0' || c > '9') return seterror("expected leading digit for number");
		}
		for (;;) {
			c = m_buffer.m_data[pos++];
			if (c == 'e') break;
			if (pos >= len) return seterror("expected digit or 'e' for number, found eof");
			if (c < '0' || c > '9') return seterror("expected digit or 'e' for number");
		}
		m_buffer.m_pos = pos;
		return true;
	}

	/* skip a complete list "l<values>e" */
	bool skip_list() {
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected list, found eof");
		if (m_buffer.m_data[m_buffer.pos()] != 'l') return seterror("expected 'l' for list");
		m_buffer.next();
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected list entry or 'e', found eof");
		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			if (!skip_value()) return errorcontext("parsing list entry failed");;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected list entry or 'e', found eof");
		}
		m_buffer.next();
		return true;
	}

	/* skip a complete dict "d<key><value>...e"; keys have to be valid UTF-8
	 * and strictly ascending */
	bool skip_dict() {
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict, found eof");
		if (m_buffer.m_data[m_buffer.pos()] != 'd') return seterror("expected 'd' for dict");
		m_buffer.next();
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		BufferString last, cur;
		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			if (!read_utf8(cur)) return errorcontext("parsing dict key failed");
			if (cur <= last) return seterror("(previous) dict entries in wrong order");
			last = cur;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict value, found eof");
			if (!skip_value()) return errorcontext("parsing dict value failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		}
		m_buffer.next();
		return true;
	}

	/**
	 * Inside a dict: skip entries until the key search is found or cannot
	 * occur any more.
	 *
	 * @param search  key to look for
	 * @param prev    lower bound for the next key (used for the order check)
	 * @param error   out: true if parsing failed, false otherwise
	 * @param skipped out (optional): raw bytes between the start position and
	 *                the cursor on return
	 * @return true if the key was found; the cursor is then behind the key,
	 *         at its value. false if parsing failed (error == true) or if the
	 *         key is not present (error == false); in the latter case the
	 *         cursor is at the first larger key or at the 'e' ending the dict
	 */
	bool try_next_dict_entry(BufferString search, BufferString prev, bool &error, BufferString *skipped = 0) {
		size_t start = m_buffer.pos();
		error = true;
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict, found eof");
		BufferString cur;
		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			size_t curpos = m_buffer.pos();;
			if (!read_utf8(cur)) return errorcontext("parsing dict key failed");
			if (cur <= prev) return seterror("(previous) dict entries in wrong order");
			if (cur == search) {
				error = false;
				if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);
				return true;
			}
			if (cur > search) {
				/* keys are sorted, so search cannot follow; un-read this key */
				m_buffer.m_pos = curpos;
				error = false;
				if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);
				return false;
			}
			prev = cur;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict value, found eof");
			if (!skip_value()) return errorcontext("parsing dict value failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		}
		/* dict end found, don't skip the 'e' */
		error = false;
		if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);
		return false;
	}

	/* skip all remaining entries of a dict including the terminating 'e';
	 * prev is the lower bound for the next key, skipped (optional) receives
	 * the raw bytes that were consumed */
	bool goto_dict_end(BufferString prev, BufferString *skipped = 0) {
		size_t start = m_buffer.pos();
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		BufferString cur;
		while (m_buffer.m_data[m_buffer.pos()] != 'e') {
			if (!read_utf8(cur)) return errorcontext("parsing dict key failed");
			if (cur <= prev) return seterror("(previous) dict entries in wrong order");
			prev = cur;
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict value, found eof");
			if (!skip_value()) return errorcontext("parsing dict value failed");
			if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected dict entry or 'e', found eof");
		}
		m_buffer.next();
		if (0 != skipped) *skipped = BufferString(m_buffer.data() + start, m_buffer.pos() - start);
		return true;
	}

	/* skip one value of any type, dispatching on its first byte */
	bool skip_value() {
		char c;
		if (m_buffer.pos() >= m_buffer.m_len) return seterror("expected value, found eof");
		c = m_buffer.m_data[m_buffer.pos()];
		if (c >= '0' && c <= '9') return skip_string();
		if (c == 'i') return skip_number();
		if (c == 'l' || c == 'd') {
			/* containers recurse back into skip_value(); bound the depth so
			 * that a crafted file cannot exhaust the stack */
			if (m_nesting >= static_cast<size_t>(max_nesting)) return seterror("value nested too deeply");
			m_nesting++;
			const bool ok = (c == 'l') ? skip_list() : skip_dict();
			m_nesting--;
			return ok;
		}
		return seterror("expected value");
	}
};
/**
 * Collects announce urls into groups, one group per domain as computed by
 * TorrentSanitize::validURL(). Groups appear in `list` in the order in which
 * their first url was added; a url is stored at most once per group.
 */
class AnnounceList {
public:
	AnnounceList(const TorrentSanitize &san) : m_san(san) { }

	/* add url unless it is invalid, or blacklisted without being whitelisted */
	void merge(const std::string &url) {
		std::string domain;
		if (!m_san.validURL(url, domain)) return;
		if (!m_san.whitelistURL(url) && m_san.blacklistURL(url)) return;
		add(domain, url);
	}
	/* works for lists of urls as well as for lists of such lists */
	template<typename T> void merge(const std::vector< T > &urllist) {
		for (size_t i = 0; i < urllist.size(); i++) merge(urllist[i]);
	}

	/* add url without consulting the white-/blacklist; it still has to be a valid url */
	void force_merge(const std::string &url) {
		std::string domain;
		if (!m_san.validURL(url, domain)) return;
		add(domain, url);
	}
	template<typename T> void force_merge(const std::vector< T > &urllist) {
		/* has to recurse into force_merge(): merge() would apply the filters again */
		for (size_t i = 0; i < urllist.size(); i++) force_merge(urllist[i]);
	}

	/* merge the announce url and the announce-list of a torrent */
	void merge(const TorrentBase &t) {
		merge(t.t_announce);
		merge(t.t_announce_list);
	}

	std::vector< std::vector< std::string > > list;

private:
	const TorrentSanitize &m_san;

	/* domain -> index of its group in list */
	typedef std::map< std::string, size_t > GroupIndex;
	GroupIndex m_index;

	void add(const std::string &domain, const std::string &url) {
		GroupIndex::iterator it;
		it = m_index.find(domain);
		if (m_index.end() == it || it->second >= list.size()) {
			/* unknown domain, or an index that no longer fits the public
			 * list: start a new group and (re)point the index at it */
			m_index[domain] = list.size();
			list.push_back(std::vector<std::string>(1, url));
		} else {
			std::vector<std::string> &l = list[it->second];
			if (l.end() == std::find(l.begin(), l.end(), url)) l.push_back(url);
		}
	}
};
class Torrent : public TorrentBase {
public:
Torrent(const TorrentSanitize &san) : m_san(san) {
m_check_info_utf8 = m_san.check_info_utf8;
}
/**
 * Load and parse a torrent file.
 *
 * The file has to be a bencoded dict that starts with the "announce" key.
 * "announce-list", "info" and "encoding" are parsed; every other entry is
 * either dropped or kept as raw bytes in m_raw_parts, depending on the meta
 * key filters of the sanitizer. Finally the sanitizer's new meta entries are
 * added to m_raw_parts.
 *
 * @return false on any parse error (see lasterror()), also if the file has no
 *         info dict or contains data after the top-level dict
 */
bool load(const std::string &filename) {
	if (!loadfile(filename)) return false;
	if (!m_buffer.tryNext("d8:announce")) return seterror("doesn't look like a valid torrent, expected 'd8:announce'");
	if (!read_utf8(t_announce)) return errorcontext("parsing torrent announce failed");
	/* the key "announce" sits at offset 3 ("d8:" precedes it) and is the lower bound for the next key */
	BufferString prevkey(m_buffer.m_data+3, 8), curkey;
	while (!m_buffer.eof() && !m_buffer.isNext('e')) {
		size_t curpos = m_buffer.pos(); /* start of the key: raw parts contain key and value */
		if (!read_utf8(curkey)) return errorcontext("parsing dict key in torrent failed");
		if (curkey <= prevkey) return seterror("wrong key order in torrent dict");
		prevkey = curkey;
		if (curkey == BufferString("announce-list")) {
			if (!parse_announce_list()) return errorcontext("parsing torrent announce-list failed");
		} else if (curkey == BufferString("info")) {
			curpos = m_buffer.pos(); /* the raw info part starts at the value */
			if (!parse_info()) return errorcontext("parsing torrent info failed");
			m_raw_info = BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos);
		} else if (curkey == BufferString("encoding")) {
			if (!read_utf8(t_encoding)) return errorcontext("parsing torrent encoding failed");
		} else {
			std::string content;
			int64_t number;
			/* a failed read_utf8()/read_number() below leaves the cursor at
			 * the value, so the next alternative starts from the same place */
			if (!m_san.validMetaKey(curkey)) {
				if (!skip_value()) return errorcontext("parsing torrent meta entry failed");
				if (m_san.debug) std::cerr << "Skipped entry '" << curkey.toString() << "'\n";
			} else if (m_san.validMetaTextKey(curkey) && read_utf8(content)) {
				if (m_san.debug) std::cerr << "Additional text entry '" << curkey.toString() << "': '" << content << "'\n";
				m_raw_parts.insert(std::make_pair(curkey.toString(), BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos).toString()));
			} else if (m_san.validMetaNumKey(curkey) && read_number(number)) {
				if (m_san.debug) std::cerr << "Additional numeric entry '" << curkey.toString() << "': " << number << "\n";
				m_raw_parts.insert(std::make_pair(curkey.toString(), BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos).toString()));
			} else if (m_san.validMetaOtherKey(curkey)) {
				if (!skip_value()) return errorcontext("parsing torrent meta entry failed");
				if (m_san.debug) std::cerr << "Additional raw entry '" << curkey.toString() << "\n";
				m_raw_parts.insert(std::make_pair(curkey.toString(), BufferString(m_buffer.m_data + curpos, m_buffer.pos() - curpos).toString()));
			} else if (skip_value()) {
				if (m_san.debug) std::cerr << "Skipped entry '" << curkey.toString() << "'\n";
			} else {
				return errorcontext("parsing torrent meta entry failed");
			}
		}
	}
	m_raw_parts.insert(m_san.new_meta_entries.begin(), m_san.new_meta_entries.end());
	/* the cursor has to be on the final 'e', which has to be the last byte of the file */
	if (m_buffer.m_len-1 != m_buffer.pos()) {
		if (m_buffer.eof()) return seterror("unexpected end of file while parsing torrent");
		return seterror("file contains garbage after torrent");
	}
	/* without an info dict there is nothing to hash and write() would emit an invalid file */
	if (0 == m_raw_info.length()) return seterror("torrent has no info dict");
	return true;
}
/**
 * Write the sanitized torrent as a bencoded dict to os.
 *
 * "announce", "announce-list", "encoding" and "info" are written from the
 * parsed data (info byte for byte as it was read); the entries kept in
 * m_raw_parts are interleaved so that the keys of the dict stay sorted.
 */
void write(std::ostream &os) const {
	TorrentOStream tos(os);
	os << "d8:announce";
	tos << t_announce;
	writerawkeys(os, bs_announce, bs_announce_list);
	if (!t_announce_list.empty()) {
		os << "13:announce-list";
		tos << t_announce_list;
	} else {
		writerawkey(os, bs_announce_list);
	}
	if (!t_encoding.empty()) {
		writerawkeys(os, bs_announce_list, bs_encoding);
		os << "8:encoding";
		tos << t_encoding; /* the parsed value, not the key name */
		writerawkeys(os, bs_encoding, bs_info);
	} else {
		writerawkeys(os, bs_announce_list, bs_info);
	}
	os << "4:info" << m_raw_info;
	/* an empty upper bound means: everything after "info" */
	writerawkeys(os, bs_info, BufferString());
	os << "e";
}
/* Print a human readable summary of the parsed torrent to std::cout; the
 * individual files are listed only if the sanitizer's show_paths is set. */
void print_details() {
	typedef std::vector< std::vector< std::string > >::const_iterator TierIterator;
	typedef std::vector< std::string >::const_iterator UrlIterator;

	std::cout << "Announce-url: " << t_announce << std::endl;
	std::cout << "Announce-list:\n";
	for (TierIterator tier = t_announce_list.begin(); tier != t_announce_list.end(); ++tier) {
		std::cout << " - [ ";
		for (UrlIterator url = tier->begin(); url != tier->end(); ++url) {
			if (url != tier->begin()) std::cout << ", ";
			std::cout << *url;
		}
		std::cout << " ]\n";
	}
	if (!t_encoding.empty()) {
		std::cout << "Encoding: " << t_encoding << std::endl;
	}
	std::cout << "Info name: " << t_info_name << std::endl;
	std::cout << "Info files: " << t_info_files.size() << std::endl;
	if (m_san.show_paths) {
		const size_t fileCount = t_info_files.size();
		for (size_t n = 0; n < fileCount; ++n) {
			std::cout << " - " << t_info_files[n].second << " bytes: '" << t_info_files[n].first << "'" << std::endl;
		}
	}
	std::cout << "Info complete length: " << t_info_complete_length << " bytes" << std::endl;
	std::cout << "Info pieces length: " << t_info_piece_length << " bytes" << std::endl;
	std::cout << "Info-Hash: " << infohash() << std::endl;
}
private:
/**
 * Parse the info dict at the cursor.
 *
 * Expected entries, in key order: either "files" (multi file torrent) or
 * "length" (single file), optionally "name", then "piece length" and "pieces"
 * (length has to be a multiple of 20), optionally "private" (0 or 1). Other
 * entries are skipped. On success the cursor is behind the dict.
 */
bool parse_info() {
	bool err;
	BufferString prev;
	/* ('length':number | 'files':...) ['name': string] 'piece length':number 'pieces':raw[%20] */
	if (m_buffer.eof()) return seterror("expected torrent info, found eof");
	if (!m_buffer.isNext('d')) return seterror("expected 'd' for dict");
	m_buffer.next();

	if (try_next_dict_entry(bs_files, bs_empty, err)) {
		t_info_complete_length = 0; /* summed up while parsing the file entries */
		if (!parse_info_files()) return errorcontext("couldn't parse files in torrent info");
		prev = bs_files;
	} else if (err) {
		return errorcontext("couldn't find torrent info key");
	} else if (try_next_dict_entry(bs_length, bs_files, err)) {
		if (!read_number(t_info_complete_length)) return errorcontext("couldn't parse length in torrent info");
		/* same rule as for the entries of a files list */
		if (t_info_complete_length < 0) return seterror("negative file length");
		prev = bs_length;
	} else if (err) {
		return errorcontext("couldn't find torrent info key");
	} else return seterror("expected files or length in torrent info");

	if (try_next_dict_entry(bs_name, prev, err)) {
		if (!read_info_utf8(t_info_name)) return errorcontext("couldn't parse name in torrent info");
	} else if (err) {
		return errorcontext("couldn't find torrent info key");
	} else {
		std::cerr << "torrent info has no name entry\n";
	}

	/* from here on the key searched for last serves as lower bound, whether
	 * it was present or not: the search stops in front of the first larger key */
	if (try_next_dict_entry(bs_piece_length, bs_name, err)) {
		if (!read_number(t_info_piece_length)) return errorcontext("couldn't parse piece length in torrent info");
	} else if (err) {
		return errorcontext("couldn't find torrent info key");
	} else {
		return seterror("expected piece length in torrent info");
	}

	if (try_next_dict_entry(bs_pieces, bs_piece_length, err)) {
		BufferString pieces;
		if (!read_string(pieces)) return errorcontext("couldn't parse pieces in torrent info");
		if (0 != pieces.m_len % 20) return seterror("pieces in torrent info has wrong length (not a multiple of 20)");
	} else if (err) {
		return errorcontext("couldn't find torrent info key");
	} else {
		return seterror("expected pieces in torrent info");
	}

	if (try_next_dict_entry(bs_private, bs_pieces, err)) {
		int64_t private_flag;
		if (!read_number(private_flag)) return errorcontext("couldn't parse torrent info private flag");
		if (0 != private_flag && 1 != private_flag) return seterror("torrent info private flag is neither 0 nor 1");
	} else if (err) {
		return errorcontext("couldn't find torrent info key");
	}

	if (!goto_dict_end(bs_private)) return errorcontext("couldn't find torrent info key");
	return true;
}
/* Parse the "files" list of the info dict: a non-empty list of file entry
 * dicts, each handled by parse_info_file(). */
bool parse_info_files() {
	if (!m_buffer.isNext('l')) return seterror("expected 'l' for list");
	m_buffer.next();

	for (;;) {
		if (m_buffer.eof() || m_buffer.isNext('e')) break;
		if (!parse_info_file()) return errorcontext("couldn't parse files entry");
	}

	if (!m_buffer.isNext('e')) return seterror("expected info files entry, found eof");
	m_buffer.next();

	if (t_info_files.size() == 0) return seterror("empty files list");
	return true;
}
/**
 * Parse one entry of the "files" list: a dict with a non-negative "length"
 * and a "path" list; other entries are skipped. The length is added to
 * t_info_complete_length and (path, length) is appended to t_info_files.
 */
bool parse_info_file() {
	if (!m_buffer.isNext('d')) return seterror("expected 'd' for dict");
	m_buffer.next();

	bool err;
	int64_t length;
	std::string path;

	if (try_next_dict_entry(bs_length, bs_empty, err)) {
		/* errorcontext keeps the detailed message of read_number() */
		if (!read_number(length)) return errorcontext("couldn't parse length");
		if (length < 0) return seterror("negative file length");
		/* the sum is non-negative here, so the subtraction cannot overflow */
		if (length > std::numeric_limits<int64_t>::max() - t_info_complete_length) return seterror("total length of files too large for int64_t");
		t_info_complete_length += length;
	} else if (err) {
		return errorcontext("couldn't find info file entry key");
	} else {
		return seterror("expected length in file entry");
	}

	if (try_next_dict_entry(bs_path, bs_length, err)) {
		if (!parse_info_file_path(path)) return errorcontext("couldn't parse path in file entry");
	} else if (err) {
		return errorcontext("couldn't find info file entry key");
	} else {
		return seterror("expected path in file entry");
	}

	t_info_files.push_back(File(path, length));

	/* "path" is the last key that was read, so it is the lower bound for the
	 * remaining keys; keys such as "path.utf-8" are valid here */
	if (!goto_dict_end(bs_path)) return errorcontext("expected end of info files entry");
	return true;
}
/* parse the "path" list of a file entry: 'l', one or more utf-8 path components, 'e'.
 * with m_san.show_paths the components are joined with '/' and stored in path;
 * otherwise the components are only skipped/verified and path ends up empty.
 * a list without any component is rejected. */
bool parse_info_file_path(std::string &path) {
	int components = 0;
	std::ostringstream buildpath;
	std::string part;
	if (!m_buffer.isNext('l')) return seterror("expected 'l' for list");
	m_buffer.next();
	while (!m_buffer.eof() && !m_buffer.isNext('e')) {
		if (m_san.show_paths) {
			if (!read_info_utf8(part)) return errorcontext("couldn't parse path component");
			if (components > 0) buildpath << '/';
			buildpath << part;
		} else {
			if (!skip_info_utf8()) return errorcontext("couldn't parse path component");
		}
		components++;
	}
	if (!m_buffer.isNext('e')) return seterror("expected path component, found eof");
	m_buffer.next();
	/* a path needs at least the file name; the BitTorrent spec calls a zero length list an error case */
	if (0 == components) return seterror("empty path in file entry");
	path = buildpath.str();
	return true;
}
void writerawkeys(std::ostream &os, BufferString prev, BufferString next) const {
std::string ns = next.toString();
TorrentRawParts::const_iterator it = m_raw_parts.upper_bound(prev.toString());
while (it != m_raw_parts.end() && (0 == next.length() || it->first < ns)) {
os << it->second;
it++;
}
}
void writerawkey(std::ostream &os, BufferString key) const {
TorrentRawParts::const_iterator it = m_raw_parts.find(key.toString());
if (it != m_raw_parts.end()) os << it->second;
}
/* settings for parsing; the path parser reads m_san.show_paths to decide whether to build path strings */
const TorrentSanitize &m_san;
/* NOTE(review): not used in the visible part of the class - presumably the torrent's "encoding" entry; confirm */
std::string t_encoding;
/* "name" from the torrent info; not set by the info parser when the torrent has no name entry */
std::string t_info_name;
/* "piece length" from the torrent info */
int64_t t_info_piece_length;
/* parse_info_file() adds the length of every files entry to this */
int64_t t_info_complete_length;
/* one File(path, length) per files entry; path is empty unless m_san.show_paths is set */
std::vector<File> t_info_files;
/* raw parts looked up by key string; written back by writerawkeys() / writerawkey() */
TorrentRawParts m_raw_parts;
};
/* stream output for Torrent: forwards to t.write(os) and returns the stream for chaining */
std::ostream& operator<<(std::ostream &os, const Torrent &t) {
	t.write(os);

	return os;
}
/* only load announce urls and info hash */
class TorrentAnnounceInfo : public TorrentBase {
public:
TorrentAnnounceInfo() {
}
bool load(const std::string &filename) {
bool err;
if (!loadfile(filename)) return false;
if (!m_buffer.tryNext("d8:announce")) return seterror("doesn't look like a valid torrent, expected 'd8:announce'");
if (!read_utf8(t_announce)) return errorcontext("parsing torrent announce failed");
if (try_next_dict_entry(bs_announce_list, bs_announce, err, &m_post_announce)) {
if (!parse_announce_list()) return errorcontext("parsing torrent announce-list failed");
} else if (err) {
return errorcontext("parsing dict key in torrent failed");
}
if (try_next_dict_entry(bs_info, bs_announce_list, err, &m_post_announce_list)) {
size_t curpos = m_buffer.pos();
if (!skip_value()) return errorcontext("parsing torrent info failed");
m_raw_info = BufferString(m_buffer.data() + curpos, m_buffer.pos() - curpos);
} else if (err) {
return errorcontext("parsing dict key in torrent failed");
} else {
return seterror("no info key in torrent");
}
if (!goto_dict_end(bs_info, &m_post_info)) return seterror("expected end of info files entry");
return true;
}
void write(std::ostream &os) const {
TorrentOStream tos(os);
os << "d8:announce";
tos << t_announce;
os << m_post_announce;
if (!t_announce_list.empty()) {
os << "13:announce-list";
tos << t_announce_list;
}
os << m_post_announce_list << "4:info" << m_raw_info << m_post_info;
}
void print_details() {
std::cout << "Announce-url: " << t_announce << std::endl;
std::cout << "Announce-list:\n";
for (size_t i = 0; i < t_announce_list.size(); i++) {
if (i > 0) std::cout << " -- \n";
for (size_t j = 0; j < t_announce_list[i].size(); j++) {
std::cout << " - " << t_announce_list[i][j] << "\n";
}
}
std::cout << "Info-Hash: " << infohash() << std::endl;
}
private:
BufferString m_post_announce, m_post_announce_list, m_post_info;
};
/* stream output for TorrentAnnounceInfo: forwards to t.write(os) and returns the stream for chaining */
std::ostream& operator<<(std::ostream &os, const TorrentAnnounceInfo &t) {
	t.write(os);

	return os;
}
/* only read announce urls, don't verify any data after announce-list */
/* very fast as the load method doesn't read the whole torrent from disk */
class TorrentAnnounce : public TorrentBase {
public:
	TorrentAnnounce() {
	}

	/* read announce and the optional announce-list; everything behind them is only remembered
	 * as one unparsed range. returns false (with the error set) on failure. */
	bool load(const std::string &filename) {
		if (!loadfile(filename)) return false;
		if (!m_buffer.tryNext("d8:announce")) return seterror("doesn't look like a valid torrent, expected 'd8:announce'");
		if (!read_utf8(t_announce)) return errorcontext("parsing torrent announce failed");

		bool keyerror;
		const bool have_list = try_next_dict_entry(bs_announce_list, bs_announce, keyerror, &m_post_announce);
		if (have_list) {
			if (!parse_announce_list()) return errorcontext("parsing torrent announce-list failed");
		} else if (keyerror) {
			return errorcontext("parsing dict key in torrent failed");
		}

		/* just remember which part we skipped - this doesn't actually read anything */
		const size_t consumed = m_buffer.pos();
		m_post_announce_list = BufferString(m_buffer.data() + consumed, m_buffer.len() - consumed);
		return true;
	}

	/* write announce, the remembered buffer after it, the announce-list (if not empty)
	 * and then the unparsed rest of the file */
	void write(std::ostream &os) const {
		TorrentOStream tos(os);
		os << "d8:announce";
		tos << t_announce;
		os << m_post_announce;

		const bool have_list = !t_announce_list.empty();
		if (have_list) {
			os << "13:announce-list";
			tos << t_announce_list;
		}

		os << m_post_announce_list;
	}

	/* print announce url and announce-list (groups separated by " -- ") to stdout */
	void print_details() {
		std::cout << "Announce-url: " << t_announce << std::endl;
		std::cout << "Announce-list:\n";
		for (size_t group = 0; group < t_announce_list.size(); ++group) {
			if (0 != group) std::cout << " -- \n";
			for (size_t url = 0; url < t_announce_list[group].size(); ++url) {
				std::cout << " - " << t_announce_list[group][url] << "\n";
			}
		}
	}

private:
	/* filled in by load(): the buffer after announce and the unparsed rest behind announce-list */
	BufferString m_post_announce, m_post_announce_list;
};
/* stream output for TorrentAnnounce: forwards to t.write(os) and returns the stream for chaining */
std::ostream& operator<<(std::ostream &os, const TorrentAnnounce &t) {
	t.write(os);

	return os;
}
void TorrentBase::sanitize_announce_urls(const TorrentSanitize &san, const TorrentBase *mergefromother) {
AnnounceList list(san);
list.force_merge(san.additional_announce_urls);
list.merge(*this);
if (0 != mergefromother) list.merge(*mergefromother);
t_announce_list.clear();
if (list.list.empty()) {
std::string hash = infohash();
for (size_t i = 0; i < hash.length(); i++) hash[i] = ::toupper(hash[i]);
t_announce = std::string("dht://") + hash;
} else {
t_announce = list.list[0][0];
t_announce_list = list.list;
}
}
/* atomically replace "filename" with the streamed representation of t:
 * write to a fresh "<filename>.XXXXXX" temp file (mkstemp) next to it, then rename() it over
 * the target.
 * returns true only if the content was written completely and the rename succeeded; on every
 * failure a message goes to stderr, the temp file is removed and the target is left untouched. */
template<typename T> bool writeAtomicFile(const std::string &filename, const T &t) {
	/* mkstemp() replaces the XXXXXX in place, so it needs a writable, NUL terminated buffer;
	 * the vector releases it on every return path */
	const std::string tmpl = filename + ".XXXXXX";
	std::vector<char> tmpbuf(tmpl.begin(), tmpl.end());
	tmpbuf.push_back('\0');
	char *tmpfname = &tmpbuf[0];

	int fd = ::mkstemp(tmpfname);
	if (-1 == fd) {
		int e = errno;
		std::cerr << "Cannot create secure tempfile '" << tmpfname << "': " << ::strerror(e) << std::endl;
		return false;
	}

	bool written;
	{
		std::ofstream os(tmpfname, std::ios_base::binary | std::ios_base::trunc | std::ios_base::out);
		if (os.fail()) {
			int e = errno;
			std::cerr << "Cannot open file '" << tmpfname << "': " << ::strerror(e) << std::endl;
			::close(fd);
			::unlink(tmpfname);
			return false;
		}
		os << t;
		os.close();
		/* close() flushes; a failed write or flush leaves the stream in a failed state */
		written = !os.fail();
	}
	if (!written) {
		/* never move a truncated/broken file over the target */
		std::cerr << "Cannot write file '" << tmpfname << "'" << std::endl;
		::close(fd);
		::unlink(tmpfname);
		return false;
	}

	/* mkstemp creates the file with restrictive permissions; changing them is best effort */
	::fchmod(fd, 0644);
	::close(fd);

	if (-1 == ::rename(tmpfname, filename.c_str())) {
		int e = errno;
		std::cerr << "Cannot rename tempfile '" << tmpfname << "' to '" << filename << "': " << ::strerror(e) << std::endl;
		::unlink(tmpfname);
		return false;
	}
	return true;
}
/* print the usage text to stderr and terminate the program with exit code 100 */
void syntax() {
	static const char usage[] =
		"Syntax: \n"
		"\tprint info (default command):\n"
		"\t\ttorrent-sanitize -i [-f] [-d] [-v] file.torrent\n"
		"\n"
		"\tsanitize:\n"
		"\t\ttorrent-sanitize -s [-d] [-v] infile.torrent outfile.torrent\n"
		"\n"
		"\tcalculate info hash / show announce urls:\n"
		"\t\ttorrent-sanitize [-h] [-u] file.torrent\n"
		"\n"
		"\t\t -f: show files\n"
		"\t\t -d: debug mode\n"
		"\t\t -v: verify strict: utf-8 checks (more may come)\n";

	std::cerr << usage;
	exit(100);
}
/* command line entry point; picks one of four modes from the options:
 *   -s          sanitize infile.torrent into outfile.torrent
 *   -i / none   print torrent details (the default as long as neither -h nor -u is given)
 *   -h          print the info hash (plus the announce urls with -u)
 *   -u          print only the announce urls
 * exit code: 0 on success, 1 if the torrent couldn't be loaded or the output couldn't be written;
 * syntax() exits with 100 on wrong usage. */
int main(int argc, char **argv) {
	int opt_sanitize = 0, opt_info_hash = 0, opt_show_urls = 0, opt_show_info = -1, opt_show_files = 0, opt_debug = 0, opt_verify = 0;
	int c;
	while (-1 != (c = getopt(argc, argv, "ifdvshu"))) {
		switch (c) {
		case '?':
			syntax();
			break;
		case 'i':
			opt_show_info = 1;
			break;
		case 'f':
			opt_show_files = 1;
			break;
		case 'd':
			opt_debug = 1;
			break;
		case 'v':
			opt_verify = 1;
			break;
		case 's':
			opt_sanitize = 1;
			break;
		case 'h':
			opt_info_hash = 1;
			/* -1 means "not decided yet": -h/-u turn the default info output off, an explicit -i wins */
			if (-1 == opt_show_info) opt_show_info = 0;
			break;
		case 'u':
			opt_show_urls = 1;
			if (-1 == opt_show_info) opt_show_info = 0;
			break;
		}
	}
	int filenames = argc - optind;
	if (opt_sanitize) {
		if (2 != filenames) syntax();
		TorrentSanitize san;
		san.show_paths = false;
		san.debug = opt_debug;
		san.filter_meta_text.load("comment|created by");
		san.filter_meta_num.load("creation date");
		san.filter_meta_other.load("");
		san.check_info_utf8 = opt_verify;
		san.add_new_meta_entry(std::string("comment"), std::string("Torrent downloaded from torrent cache at http://torrage.com"));
		san.additional_announce_urls.push_back(std::string("udp://tracker.thepiratebay.org:80/announce"));
		san.additional_announce_urls.push_back(std::string("udp://tracker.openbittorrent.com:80/announce"));
		Torrent t(san);
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		t.sanitize_announce_urls(san);
		/* writeAtomicFile already reported the reason; make sure the caller sees the failure too */
		if (!writeAtomicFile(std::string(argv[optind+1]), t)) return 1;
	} else if (opt_show_info) {
		/* show info */
		if (1 != filenames) syntax();
		TorrentSanitize san;
		san.show_paths = opt_show_files;
		san.debug = opt_debug;
		san.filter_meta_text.load("comment|created by");
		san.filter_meta_num.load("creation date");
		san.filter_meta_other.load("");
		san.check_info_utf8 = opt_verify;
		Torrent t(san);
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		t.print_details();
	} else if (opt_info_hash) {
		/* show info hash, optionally urls */
		if (1 != filenames) syntax();
		TorrentAnnounceInfo t;
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		std::cout << t.infohash() << std::endl;
		if (opt_show_urls) {
			std::cout << t.t_announce << "\n";
			for (size_t i = 0; i < t.t_announce_list.size(); i++) {
				for (size_t j = 0; j < t.t_announce_list[i].size(); j++) {
					std::cout << t.t_announce_list[i][j] << "\n";
				}
			}
		}
	} else {
		/* show urls */
		if (1 != filenames) syntax();
		TorrentAnnounce t;
		if (!t.load(std::string(argv[optind]))) {
			std::cerr << t.filename() << ": " << t.lasterror() << std::endl;
			return 1;
		}
		std::cout << t.t_announce << "\n";
		for (size_t i = 0; i < t.t_announce_list.size(); i++) {
			for (size_t j = 0; j < t.t_announce_list[i].size(); j++) {
				std::cout << t.t_announce_list[i][j] << "\n";
			}
		}
	}
	return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment