Skip to content

Instantly share code, notes, and snippets.

@alexanderGugel
Created July 19, 2014 21:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save alexanderGugel/a27ab36ac6ced1bcdb64 to your computer and use it in GitHub Desktop.
Save alexanderGugel/a27ab36ac6ced1bcdb64 to your computer and use it in GitHub Desktop.
BitTorrent DHT Crawler; No Redis DB required
var bencode = require('bencode'),
dgram = require('dgram'),
hat = require('hat'),
_ = require('lodash');
// Wraps `fn` in a function that never throws. The wrapper forwards all of its
// arguments to `fn` (invoked with a null `this`) and returns fn's result. If
// `fn` throws, the error is logged and `onFuckedUp` (undefined when omitted)
// is returned instead. Handy for parsing malformed messages.
var makeSafe = function (fn, onFuckedUp) {
  return function () {
    // Start from the fallback; it is only overwritten when fn succeeds.
    var result = onFuckedUp;
    try {
      result = fn.apply(null, arguments);
    } catch (err) {
      console.log(err);
    }
    return result;
  };
};
// Converts a compact binary address into a string; the callers in this file
// treat the result as "ip:port". See
// https://github.com/bencevans/node-compact2string.
// Wrapped in makeSafe without a fallback value, so a failed conversion is
// logged and yields undefined instead of throwing.
var compact2string = makeSafe(require('compact2string'));
// Serializes a numeric transaction id into a 2-byte big-endian buffer, the
// form in which it is sent as the `t` field of a query. Wrapped in makeSafe
// without a fallback: if writing the value throws (e.g. the id does not fit
// into an unsigned 16-bit integer) the error is logged and undefined is
// returned.
var transactionIdToBuffer = makeSafe(function (transactionId) {
  // Buffer.alloc replaces the deprecated `new Buffer(size)` constructor and
  // always returns zero-filled memory.
  var buf = Buffer.alloc(2);
  buf.writeUInt16BE(transactionId, 0);
  return buf;
});
// Converts a hex-encoded id (our node id or an info hash) into the raw bytes
// that are put into an outgoing query. Wrapped in makeSafe without a
// fallback: if the conversion throws, the error is logged and undefined is
// returned.
var idToBuffer = makeSafe(function (id) {
  // Buffer.from replaces the deprecated `new Buffer(string, encoding)`
  // constructor. Unlike the constructor it can never be tricked into
  // allocating raw memory when `id` happens to be a number.
  return Buffer.from(id, 'hex');
});
// Lifetime of a single crawl job in milliseconds (one minute).
var ttl = 60 * 1000;
// bencode codecs that never throw: on failure the error is logged and an
// empty object is returned instead.
var decode = makeSafe(bencode.decode, {});
var encode = makeSafe(bencode.encode, {});
// Well-known DHT routers, given as "host:port".
var ROUTERS = [
  'router.bittorrent.com:6881',
  'router.utorrent.com:6881',
  'dht.transmissionbt.com:6881'
];
// Nodes every crawl starts from. This is an independent shallow copy, so
// changing it does not touch ROUTERS.
var BOOTSTRAP_NODES = ROUTERS.concat();
// Our own id as announced in outgoing queries; it is passed to idToBuffer as
// a hex string. Presumably hat(160) yields 160 random bits - confirm against
// the hat documentation.
var nodeID = hat(160);
// UDP port to bind to; the UDP_PORT environment variable overrides the
// default of 6881.
var port = process.env.UDP_PORT || 6881;
// The single IPv4 UDP socket used for all traffic.
var socket = dgram.createSocket('udp4');
// Rotate our node id every ten seconds. We are essentially spamming the DHT
// network, and a changing id might keep other nodes from blocking us.
setInterval(function rotateNodeId() {
  nodeID = hat(160);
}, 10 * 1000);
// Key: infoHash; Value: Object representing the current results of this crawl
// job (peers and nodes, each used as a set by storing `true` under the key).
// Created without a prototype so that keys such as "constructor" or
// "toString" are not mistaken for existing entries.
var jobs = Object.create(null);
// Key: transactionId; Value: infoHash the transaction was issued for.
// Prototype-less for the same reason as `jobs`.
var transactions = Object.create(null);
// Invoked for every datagram received on the socket. Decodes the bencoded
// message, maps its transaction id back to the info hash that was asked for,
// records every peer and node contained in the response on the matching crawl
// job, and sends a further get_peers request to each of them.
socket.on('message', function (msg, rinfo) {
  // console.log('Received message from ' + rinfo.address);
  msg = decode(msg);
  // decode() may hand back something that is not a dictionary (a bare integer,
  // or null/undefined depending on the bencode version). Treat that as a
  // malformed message instead of letting `msg.t` throw inside this handler.
  var isDictionary = msg !== null && typeof msg === 'object';
  // Either the numeric transaction id (0 is valid) or `false` when missing.
  var transactionId = isDictionary && Buffer.isBuffer(msg.t) &&
    msg.t.length === 2 && msg.t.readUInt16BE(0);
  if (transactionId === false) {
    console.log('Malformed message from ' + rinfo.address + ':' + rinfo.port + '.');
    console.log(msg);
    return;
  }
  var infoHash = transactions[transactionId];
  if (infoHash === undefined) {
    console.log('Unknown transaction for ' + transactionId + ' from ' + rinfo.address + ':' + rinfo.port + '.');
    console.log(msg);
    return;
  }
  // Nothing to record if the job already expired or the message carries no
  // response dictionary.
  var job = jobs[infoHash];
  var response = msg.r;
  if (!job || !response) {
    return;
  }
  if (response.values) {
    _.each(response.values, function (peer) {
      peer = compact2string(peer);
      if (peer) {
        // console.log('Found new peer ' + peer + ' for ' + infoHash);
        job.peers[peer] = true;
        getPeers(infoHash, peer);
      }
    });
  }
  var nodes = response.nodes;
  if (Buffer.isBuffer(nodes)) {
    // Entries are 26 bytes each; the last 6 bytes of an entry hold the
    // compact address. Only complete entries are read, so a truncated tail is
    // skipped rather than passed to compact2string.
    for (var i = 0; i + 26 <= nodes.length; i += 26) {
      var node = compact2string(nodes.slice(i + 20, i + 26));
      if (node) {
        // console.log('Found new node ' + node + ' for ' + infoHash);
        job.nodes[node] = true;
        getPeers(infoHash, node);
      }
    }
  }
});
// Sends a get_peers query for `infoHash` to the node at `addr`.
//
// infoHash: hex-encoded info hash to ask for.
// addr:     target in the form "host:port".
//
// The query is silently dropped when the port is missing or out of range, or
// when the message could not be encoded. Send errors are logged.
var getPeers = function (infoHash, addr) {
  // console.log('Sending get_peers to ' + addr + ' for ' + infoHash);
  var parts = addr.split(':');
  var ip = parts[0];
  var targetPort = parseInt(parts[1], 10);
  // Written as a positive range test so that NaN (missing or non-numeric
  // port) is rejected as well; NaN would otherwise reach socket.send.
  if (!(targetPort > 0 && targetPort < 65536)) {
    return;
  }
  // Only ids in [0, 2^12] are used, although the wire format has 16 bits.
  var transactionId = _.random(Math.pow(2, 12));
  var message = encode({
    t: transactionIdToBuffer(transactionId),
    y: 'q',
    q: 'get_peers',
    a: {
      id: idToBuffer(nodeID),
      info_hash: idToBuffer(infoHash)
    }
  });
  // encode() falls back to a plain empty object on failure, which has no
  // length and cannot be sent.
  if (!message || typeof message.length !== 'number') {
    return;
  }
  transactions[transactionId] = infoHash;
  // With a callback, send errors (e.g. a failed DNS lookup for a router host)
  // are reported here instead of being emitted as an unhandled 'error' event
  // on the socket.
  socket.send(message, 0, message.length, targetPort, ip, function (err) {
    if (err) {
      console.log(err);
    }
  });
};
// Starts a crawl for `infoHash` and reports the result after `ttl` ms.
//
// infoHash: info hash to crawl.
// callback: function (err, result). Called synchronously with an Error when
//           a crawl for the same info hash is already running; otherwise
//           called once after `ttl` ms with
//           { peers: [...], nodes: [...] } holding the collected keys.
var crawl = function (infoHash, callback) {
  console.log('Crawling ' + infoHash + '...');
  if (jobs[infoHash]) {
    return callback(new Error('Crawljob already in progress'));
  }

  jobs[infoHash] = { peers: {}, nodes: {} };

  // Once the job's lifetime is over, collect what was found, drop the job and
  // hand the result to the caller.
  setTimeout(function () {
    console.log('Done crawling ' + infoHash + '.');
    var finished = jobs[infoHash];
    var peers = Object.keys(finished.peers);
    var nodes = Object.keys(finished.nodes);
    console.log('Found ' + peers.length + ' peers for ' + infoHash + '.');
    console.log('Found ' + nodes.length + ' nodes for ' + infoHash + '.');
    delete jobs[infoHash];
    console.log('Successfully deleted crawl job for ' + infoHash + '.');
    callback(null, { peers: peers, nodes: nodes });
  }, ttl);

  // Packets might get lost and individual routers are sometimes down, so the
  // initial get_peers request goes to every bootstrap node. This way we most
  // likely enter the DHT network correctly; otherwise we might not get a
  // single peer/node.
  BOOTSTRAP_NODES.forEach(function (bootstrapAddr) {
    getPeers(infoHash, bootstrapAddr);
  });
};
// Public API: the module itself is the `crawl` function. `crawl.init(callback)`
// binds the UDP socket to `port` and passes `callback` on to socket.bind; it
// has to be called before crawling.
crawl.init = function (callback) {
  socket.bind(port, callback);
};
module.exports = exports = crawl;
// Example usage:
// var crawl = require('./crawl');
// crawl.init(function () {
// crawl('8CA378DBC8F62E04DF4A4A0114B66018666C17CD', function (err, results) {
// console.log(results);
// process.exit(0);
// });
// });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment