Skip to content

Instantly share code, notes, and snippets.

@Tatsh
Created October 13, 2011 20:44
Show Gist options
  • Save Tatsh/1285466 to your computer and use it in GitHub Desktop.
Save Tatsh/1285466 to your computer and use it in GitHub Desktop.
Crawl a site and make an RSS feed
var http = require('http');
var select = require('soupselect').select;
var htmlparser = require('htmlparser');
// User-Agent header value sent with the outbound request made by updateFromTheSite.
var ua = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.83 Safari/535.2';
/**
 * Fetches /browse.php from the site and extracts torrent titles and links.
 *
 * The page is requested with the session cookie and the shared `ua`
 * User-Agent, parsed with htmlparser, and scanned for anchors inside
 * `td[align="left"]` cells. An anchor whose first child is a `b` tag supplies
 * the title; the anchor that follows it supplies the download href.
 *
 * @param {function(Array)} [callback] Called exactly once with an array of
 *   `{title, downloadUrl}` objects. On a network or parse failure the error
 *   is logged and the callback receives an empty array, because the callback
 *   signature has no error argument. Ignored when it is not a function.
 */
var updateFromTheSite = function (callback) {
  var cookies = 'uid=0000; pass=0000000000000000';
  var url = 'thedomain.com';
  var finished = false;

  // Guarantees a single callback invocation even if both the request and the
  // response report a problem.
  var finish = function (torrents) {
    if (finished) {
      return;
    }
    finished = true;
    if (typeof callback === 'function') {
      callback(torrents);
    }
  };

  var fail = function (error) {
    console.error('updateFromTheSite failed:', error);
    finish([]);
  };

  // http.createClient() no longer exists in Node; http.request() is the
  // supported way to issue the same GET.
  var request = http.request({
    host: url,
    port: 80,
    method: 'GET',
    path: '/browse.php',
    headers: {'Host': url, 'Cookie': cookies, 'User-Agent': ua}
  }, function (response) {
    response.setEncoding('utf8');
    var body = '';
    response.on('data', function (chunk) {
      body += chunk;
    });
    response.on('error', fail);
    response.on('end', function () {
      var handler = new htmlparser.DefaultHandler(function (error, dom) {
        if (error) {
          // Throwing here would be an uncaught exception in an I/O callback.
          fail(error);
          return;
        }
        var titles = select(dom, 'td[align="left"] a');
        var torrents = [];
        // NOTE(review): the scan starts at the fourth match; presumably the
        // first three anchors are not torrent rows - confirm against the page.
        for (var i = 3; i < titles.length; i++) {
          var anchorChildren = titles[i].children;
          var firstChild = anchorChildren && anchorChildren[0];
          if (!firstChild || firstChild.raw !== 'b') {
            continue;
          }
          var textNode = firstChild.children && firstChild.children[0];
          var linkAnchor = titles[i + 1];
          if (!textNode || !linkAnchor || !linkAnchor.attribs) {
            // Incomplete title/link pair: skip it instead of throwing.
            continue;
          }
          torrents.push({
            title: textNode.raw,
            downloadUrl: 'http://' + url + '/' + linkAnchor.attribs.href
          });
          // The following anchor was consumed as this entry's download link.
          i++;
        }
        finish(torrents);
      });
      var parser = new htmlparser.Parser(handler);
      parser.parseComplete(body);
    });
  });
  // Without a listener, a connection failure raises an unhandled 'error' event.
  request.on('error', fail);
  request.end();
};
// Serves the scraped listing as an RSS 0.91 document on 127.0.0.1:22221.
// Every request triggers a fresh call to updateFromTheSite; the response is
// finished once that call delivers its list of torrents.
http.createServer(function (req, res) {
  res.writeHead(200, {'Content-Type': 'application/xml'});
  var d = new Date();
  // Left-pads single-digit numbers with a zero.
  var pad = function (n) {
    return n < 10 ? '0' + n : n;
  };
  // example: 2011-10-13 19:31:30
  // All fields use the UTC getters; the date part was already UTC, and mixing
  // it with local-time hours/minutes/seconds gave an inconsistent timestamp
  // on hosts whose zone is not UTC.
  var added = d.getUTCFullYear() + '-' + pad(d.getUTCMonth() + 1) + '-' + pad(d.getUTCDate()) +
    ' ' + pad(d.getUTCHours()) + ':' + pad(d.getUTCMinutes()) + ':' + pad(d.getUTCSeconds());
  var xml = '<?xml version="1.0" encoding="utf-8"?>';
  xml += '<rss version="0.91">';
  xml += '<channel><title>The Site</title><link>http://thedomain.com</link>';
  xml += '<description></description>';
  updateFromTheSite(function (torrents) {
    // NOTE(review): title and downloadUrl are inserted as-is. They presumably
    // keep whatever entity encoding the scraped page used - confirm before
    // adding XML escaping here, to avoid double-encoding.
    torrents.forEach(function (obj) {
      xml += '<item>';
      xml += '<title>' + obj.title + '</title>';
      xml += '<link>' + obj.downloadUrl + '</link>';
      xml += '<description>Added: ' + added + '</description>';
      xml += '</item>';
    });
    xml += '</channel>';
    xml += '</rss>';
    res.end(xml);
  });
}).listen(22221, '127.0.0.1');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment