Skip to content

Instantly share code, notes, and snippets.

@kurokikaze
Created August 3, 2010 07:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kurokikaze/e37a7d8378822cfb8969 to your computer and use it in GitHub Desktop.
Save kurokikaze/e37a7d8378822cfb8969 to your computer and use it in GitHub Desktop.
var fs = require('fs'),
sys = require('sys');
// Reads leaders.json and prints the rows with the highest `value`, best first.
// Each row is expected to carry a `key` and a numeric `value`.
fs.readFile('leaders.json', 'utf-8', function(err, dataraw) {
    if (err) {
        sys.puts('Err!');
        throw new Error(JSON.stringify(err));
    }
    var data = JSON.parse(dataraw);
    // Ascending sort, so that pop() below always yields the largest remaining value.
    data.rows.sort(function(a, b) {
        return a.value - b.value;
    });
    // Never pop more rows than exist: with fewer than 30 rows pop() would
    // return undefined and reading `leader.key` would throw.
    var places = Math.min(30, data.rows.length);
    for (var i = 1; i <= places; i++) {
        var leader = data.rows.pop();
        sys.puts(i + ' place: ' + leader.key + ' with ' + leader.value + ' followers');
    }
});
// Crawler configuration — presumably the "./settings" module that the crawler
// scripts load with require("./settings").

// CouchDB server address as host:port; passed as the second argument to couch.db().
exports.couchhost = '192.168.175.128:5984';
// Name of the CouchDB database the crawler scripts open as `db`.
exports.couchbase = 'habrausers';
// Host name of the site being crawled; also used to tell internal links from outbound ones.
exports.targethost = 'habrahabr.ru';
// Upper bound for the number of crawl streams (compared with num_of_streams in crawl_page).
exports.max_streams = 1;
// Delay handed to setTimeout between two page fetches (setTimeout takes milliseconds).
exports.crawl_timeout = 800;
/**
 * Returns a new array without duplicates. Values are compared with ===;
 * of several equal values only the last occurrence is kept, and the
 * survivors stay in their original relative order.
 */
var unique = function(arr) {
    var total = arr.length;
    var result = [];
    // True when the value at `position` shows up again further right.
    var repeatsLater = function(position) {
        for (var next = position + 1; next < total; next++) {
            if (arr[next] === arr[position]) {
                return true;
            }
        }
        return false;
    };
    for (var position = 0; position < total; position++) {
        if (!repeatsLater(position)) {
            result.push(arr[position]);
        }
    }
    return result;
};
/**
 * Tells whether `val` occurs in `arr`. Despite the name it returns a boolean,
 * not an index. Elements are compared with loose equality (==).
 */
function indexInArray(arr, val) {
    var position = 0;
    var found = false;
    while (!found && position < arr.length) {
        found = (arr[position] == val);
        position += 1;
    }
    return found;
}
/**
 * Extracts the media type from a response header map, dropping parameters
 * such as "; charset=utf-8".
 *
 * @param headers object with lower-cased header names as keys
 * @returns the part of the content-type header before the first ';', or ''
 *          when the header is missing (crawl_page treats '' as acceptable)
 */
function get_content_type(headers) {
    var raw = headers && headers['content-type'];
    if (typeof raw !== 'string') {
        // Responses without a content-type used to crash here with a TypeError.
        return '';
    }
    return raw.split(';')[0];
}
var libxml = {}, // require("libxmljs"),
http = require("http"),
url = require("url"),
settings = require("./settings"),
couch = require("./node-couch").CouchDB,
sys = require("sys");
//var Riak = require('riak-js');
// HTTP client for the crawled site on port 80 (only referenced by the
// commented-out crawl_page call in this script).
var target_site = http.createClient(80, settings.targethost);
// CouchDB database handles:
//   db  - database named by settings.couchbase; written by save_page / save_user
//   db2 - 'habratwitter'; written by save_twitter and read back to build the `twitters` queue
//   db3 - 'lead_new'; written by save_leader
var db = couch.db(settings.couchbase, settings.couchhost);
var db2 = couch.db('habratwitter', settings.couchhost);
var db3 = couch.db('lead_new', settings.couchhost);
// var db = new Riak.Client({host: '127.0.0.1', port: 8098, debug: false});
/**
 * Parses an HTML string with libxml.parseHtmlString.
 * When parsing throws, the offending input is logged and an empty object is
 * returned, so callers can detect the failure by the missing `find` method.
 */
var parsePage = function(string) {
    try {
        return libxml.parseHtmlString(string);
    } catch (parseError) {
        sys.puts('Cannot parse: ' + string);
    }
    return {};
};
/**
 * Collects the site-internal link targets of a parsed page.
 *
 * @param parsed_html parsed document exposing find(xpath)
 * @param baseURL     URL the page was fetched from; relative hrefs are resolved against it
 * @returns array of "pathname[?search]" strings for links that have no host
 *          name or whose host name contains settings.targethost
 */
var getLinks = function(parsed_html, baseURL) {
    var anchors = parsed_html.find('//a');
    var destinations = [];
    // Index loop with a declared counter: the former `for (link in links)`
    // leaked a global `link` and walked the array with for...in.
    for (var i = 0; i < anchors.length; i++) {
        var attr = anchors[i].attr('href');
        if (!attr || !attr.value) {
            continue; // anchor without href
        }
        var url_parts = url.parse(url.resolve(baseURL, attr.value()));
        if (url_parts.hostname && url_parts.hostname.indexOf(settings.targethost) === -1) {
            continue; // outbound link
        }
        var destination = url_parts.pathname;
        if (url_parts.search) {
            destination = destination + url_parts.search;
        }
        destinations.push(destination);
    }
    return destinations;
};
/**
 * Extracts the users listed in the rating table of a parsed page and stores
 * each one through save_user(profile URL, name, karma, rating).
 *
 * Rows with fewer than five <td> cells (for example header rows) are reported
 * as 'Broken user' and skipped. A page without the table is logged and ignored.
 *
 * @param parsed_html parsed document exposing find(xpath)
 */
var getUsers = function(parsed_html) {
    sys.puts('Getting users table');
    var usertable = parsed_html.find("//div[@id='rate-table-wrap']/table");
    sys.puts('Got users table');
    // find() yields an empty list when the table is absent; indexing it blindly
    // used to throw a TypeError.
    if (!usertable || !usertable[0] || !usertable[0].find) {
        sys.puts('Users table not found');
        return;
    }
    sys.puts('Getting user rows');
    var users = usertable[0].find('tr');
    sys.puts('Got user rows');
    // Declared index instead of the former implicit global `user` in a for...in loop.
    for (var i = 0; i < users.length; i++) {
        var row = users[i];
        var cells = (row && row.find) ? row.find('td') : [];
        // Cells 2, 3 and 4 are read below, so at least five must be present.
        if (cells.length < 5) {
            sys.puts('Broken user');
            continue;
        }
        var profile_dl = cells[2].find('dl')[0];
        var karma = cells[3].text();
        var rating = cells[4].text();
        var profile_link = 'none';
        if (profile_dl && profile_dl.find) {
            var profile_anchor = profile_dl.find('dt')[0].find('a')[0];
            var href = profile_anchor.attr('href');
            // Store the attribute's string value, not the attribute object itself.
            if (href && href.value) {
                profile_link = String(href.value());
            }
            save_user(profile_link, profile_anchor.text(), karma, rating);
        }
        sys.puts('User URL: ' + profile_link);
    }
};
/**
 * Issues a GET for `url` over `connection`, always sending the Host header
 * 'api.twitter.com', and collects the response body as UTF-8 text.
 *
 * @param url        request path (including the query string)
 * @param connection client object providing request(method, path, headers)
 * @param callback   called once as callback(statusCode, bodyText, headers)
 *                   when the response has ended
 */
var getPage = function(url, connection, callback) {
    // Accumulates the body chunks of one response and reports the result at the end.
    var collectBody = function (response) {
        response.setEncoding("utf8");
        var text = '';
        response.addListener("data", function (chunk) {
            text += chunk;
        });
        response.addListener('end', function () {
            callback(response.statusCode, text, response.headers);
        });
    };
    var request = connection.request("GET", url, {"host": 'api.twitter.com'});
    request.addListener('response', collectBody);
    request.end();
};
/**
 * Removes every <script> and <style> element from the parsed page (the
 * document is modified in place) and returns the text of its <body>.
 *
 * @param parsed_html parsed document exposing find(xpath) and get(xpath)
 * @returns body text, or '' (with a log line) when the page has no body
 */
var cleanPage = function(parsed_html) {
    // Declared index loop; the original for...in loops leaked the globals
    // `script` and `style`.
    var removeAll = function (xpath) {
        var nodes = parsed_html.find(xpath);
        for (var i = 0; i < nodes.length; i++) {
            nodes[i].remove();
        }
    };
    removeAll('//script');
    removeAll('//style');

    var body = parsed_html.get('/html/body');
    if (body && body.text) {
        return body.text();
    }
    sys.puts('Body is empty?');
    return '';
};
/**
 * Returns the text of the page's <title> element.
 *
 * @param parsed_html parsed document exposing get(xpath)
 * @returns the title text, or '' when the page has no <title>
 *          (calling text() on the missing node used to throw)
 */
var pageTitle = function(parsed_html) {
    var title = parsed_html.get('//head/title');
    if (!title || !title.text) {
        return '';
    }
    return title.text();
};
// Page paths discovered so far; crawl_page appends redirect targets and de-duplicates the list.
var known_pages = [];
// Paths that get_next_page has already handed out for crawling.
var visited_pages = [];
// Number of crawl streams started so far; crawl_page compares it with settings.max_streams.
var num_of_streams = 0;
/**
 * Hands out the next page to crawl: the first truthy entry of known_pages
 * that is not yet in visited_pages. The entry is marked as visited before it
 * is returned. When nothing is left the process is terminated.
 *
 * @returns the path of the next page
 */
var get_next_page = function() {
    // Declared index loop: the former `for (page in known_pages)` created a
    // global `page`. The extra typeof check was redundant, because undefined
    // entries already fail the truthiness test.
    for (var i = 0; i < known_pages.length; i++) {
        var candidate = known_pages[i];
        if (candidate && !indexInArray(visited_pages, candidate)) {
            visited_pages.push(candidate);
            return candidate;
        }
    }
    process.exit(); // End of list
};
/**
 * Fetches one page, processes it and schedules the next fetch of this stream.
 *
 * - 200 with an HTML/plain/empty content type: the page is parsed, stripped
 *   of scripts and styles (cleanPage) and its user table is stored (getUsers).
 * - 301 / 303: the redirect target is added to known_pages.
 * - other codes are only logged.
 *
 * After every response the next page is requested settings.crawl_timeout
 * later on the same connection; an extra stream is started while fewer than
 * settings.max_streams exist and unvisited pages remain.
 *
 * To store page text again, pass pageTitle(parsed_page) and the result of
 * cleanPage(parsed_page) to save_page at the marked spot.
 *
 * @param URL        path of the page to fetch
 * @param connection HTTP client handed through to getPage
 * @param stream_id  number used only in log messages
 */
var crawl_page = function (URL, connection, stream_id) {
    sys.puts('Stream ' + stream_id + ' visiting ' + URL);
    getPage(URL, connection, function(code, text, headers) {
        sys.puts('Got ' + code + ' answer from ' + URL);
        var links = [];
        if (code == 200) {
            var content_type = get_content_type(headers);
            if (content_type == 'text/html' || content_type == 'text/plain' || content_type == '') {
                // Local variable; it used to leak as an implicit global.
                var parsed_page = parsePage(text);
                if (parsed_page.find) {
                    // Called for its side effect (scripts/styles are removed
                    // before the table is read); the text itself is unused
                    // while save_page is disabled.
                    cleanPage(parsed_page);
                    getUsers(parsed_page);
                } else {
                    sys.puts('Bad parsed page: ' + URL);
                }
            } else {
                // Was `content-type`, i.e. `content - type`, which raised a
                // ReferenceError instead of logging.
                sys.puts('Strange content type: ' + content_type);
            }
        } else if (code == 301 || code == 303) {
            // Return redirect location to known pages
            links = [headers.location];
        } else if (code == 404) {
            // Do nothing, maybe add some sort of log entry
        } else if (code == 400) {
            sys.puts('Bad request: ' + URL);
        } else {
            sys.puts('Unknown code: ' + code + '\nHeaders is: ' + JSON.stringify(headers));
        }
        known_pages = unique(known_pages.concat(links));
        setTimeout(function() {
            crawl_page(get_next_page(), connection, stream_id);
        }, settings.crawl_timeout);
        // Create new stream if available and have unvisited pages
        if (num_of_streams < settings.max_streams && known_pages.length > visited_pages.length) {
            num_of_streams++;
            var new_connection = http.createClient(80, settings.targethost);
            crawl_page(get_next_page(), new_connection, num_of_streams);
            sys.puts('Starting another stream: ' + num_of_streams + ' of ' + settings.max_streams);
        }
    });
};
// Running document id for stored pages; starts at 1 and grows by one per save.
var doc_id = 1;
/**
 * Stores a crawled page in `db` under the current doc_id, then advances the id.
 */
var save_page = function (URL, title, text) {
    var page_doc = {'url' : URL, 'title' : title, 'text' : text, '_id': doc_id};
    db.saveDoc(page_doc);
    doc_id += 1;
};
/**
 * Stores one user record (profile URL, name, karma, rating) in `db`.
 * No _id is supplied with the document.
 */
var save_user = function (URL, name, karma, rating) {
    var user_doc = {'url' : URL, 'name' : name, 'karma' : karma, 'rating': rating};
    db.saveDoc(user_doc);
};
/**
 * Stores the Twitter account name found for a user in `db2`.
 */
var save_twitter = function (user, accname) {
    var account_doc = {'name' : user, 'account' : accname};
    db2.saveDoc(account_doc);
};
/**
 * Stores the list of accounts a user follows ("leaders") in `db3`.
 *
 * @param user   Twitter account name of the follower
 * @param leader value stored under the 'leaders' key (the parsed API response)
 */
var save_leader = function (user, leader) {
    var leader_doc = {'user' : user, 'leaders' : leader};
    db3.saveDoc(leader_doc);
};
// crawl_page('/people/', target_site, 1);
/*for (var i = 1; i < 100; i++) {
known_pages.push('/people/page'+ i + '/');
} */
num_of_streams = 1;
// Queue of document ids from the 'habratwitter' database (db2) that still
// have to be processed by getleaders.
var twitters = [];
// Load every document id, then start the leader lookup chain.
db2.allDocs({
    success: function(docs) {
        sys.puts('Got ' + docs.rows.length + ' twitterers');
        // Declared index loop: `for (row in docs.rows)` created a global `row`
        // and iterated the array with for...in.
        for (var i = 0; i < docs.rows.length; i++) {
            twitters.push(docs.rows[i].id);
        }
        getleaders();
    },
    error: function() {
        sys.puts('Error getting docs');
    }
});
var querystring = require('querystring');
/**
 * Looks for a profile entry of the form <dl><dt>Twitter:</dt><dd><a>name</a></dd></dl>
 * and reports the link text.
 *
 * @param parsed   parsed profile page exposing find(xpath); a failed parse
 *                 (object without find) is treated as "nothing found"
 * @param callback called exactly once with the account name, or null when
 *                 there is none; if several entries match, the last one wins
 */
var catchTwitter = function(parsed, callback) {
    var twitter = null;
    var lists = (parsed && parsed.find) ? parsed.find("//dl") : [];
    // Declared index loop; `for (data in dl)` leaked a global `data`.
    for (var i = 0; i < lists.length; i++) {
        var entry = lists[i];
        if (!entry.find) {
            continue;
        }
        var terms = entry.find('dt');
        if (terms.length === 0 || terms[0].text() !== 'Twitter:') {
            continue;
        }
        // A Twitter entry without <dd> or without a link is skipped instead of throwing.
        var definitions = entry.find('dd');
        var anchors = definitions.length > 0 ? definitions[0].find('a') : [];
        if (anchors.length > 0) {
            twitter = anchors[0].text();
        }
    }
    callback(twitter);
};
// HTTP client for api.twitter.com on port 80; getleaders passes it to getPage for every lookup.
var client = http.createClient(80, 'api.twitter.com');
/**
 * Processes one entry of the `twitters` queue: loads the user document from
 * db2, asks the Twitter API for the ids the account follows and stores them
 * with save_leader. Afterwards it re-schedules itself for the next entry.
 *
 * Outcome per HTTP status:
 *   200        ids are saved (an unparsable body is logged and skipped)
 *   401 / 404  logged, user is dropped
 *   400        treated as "hourly limit reached": the process exits
 *   otherwise  user is put back on the queue and retried after a shorter delay
 *
 * The process also exits once the queue is empty.
 */
var getleaders = function() {
    var ROUND_DELAY = 25710; // spacing between lookups: about 140 requests per hour
    var RETRY_DELAY = 12715; // roughly half of the regular spacing

    if (twitters.length == 0) {
        sys.puts('List finished');
        process.exit();
        return;
    }
    var twi_user = twitters.pop();

    // Schedules the next queue entry after `delay` milliseconds.
    var next_round = function(delay) {
        setTimeout(function() {
            sys.puts('New round!');
            getleaders();
        }, delay);
    };

    db2.openDoc(twi_user, {
        success: function(user) {
            sys.puts('Twi-user: ' + user.account);
            sys.puts('Only ' + twitters.length + ' to go');
            // The account name comes from scraped pages, so it is escaped
            // before it goes into the query string.
            var path = "/1/friends/ids.json?screen_name=" + encodeURIComponent(user.account);
            getPage(path, client, function(code, pagetext, headers) {
                sys.puts('Code is ' + code);
                if (code == 400) {
                    // The run stops here. The former requeue/retry code after
                    // process.exit() was unreachable and has been removed.
                    sys.puts('Hourly limit reached');
                    process.exit();
                    return;
                }
                if (code == 200) {
                    var parsed = null;
                    try {
                        parsed = JSON.parse(pagetext);
                    } catch (parseError) {
                        // A malformed body must not abort the whole run.
                        sys.puts('Cannot parse leaders of ' + user.account);
                    }
                    if (parsed !== null) {
                        sys.puts(parsed.length + ' leaders found');
                        save_leader(user.account, parsed);
                        sys.puts('leaders saved');
                    }
                    next_round(ROUND_DELAY);
                } else if (code == 401) {
                    sys.puts('Authorization required');
                    next_round(ROUND_DELAY);
                } else if (code == 404) {
                    sys.puts('Non-existing user');
                    next_round(ROUND_DELAY);
                } else {
                    sys.puts('Something is wrong here: ' + code);
                    twitters.push(twi_user); // return lad to queue
                    next_round(RETRY_DELAY);
                }
            });
        },
        error: function() {
            sys.puts('Error happened.');
            // Keep going with the next entry; previously the chain silently
            // stopped after the first unreadable document.
            next_round(ROUND_DELAY);
        }
    });
};
/**
 * Returns a new array without duplicates. Values are compared with ===;
 * of several equal values only the last occurrence is kept, and the
 * survivors stay in their original relative order.
 */
var unique = function(arr) {
    var total = arr.length;
    var result = [];
    // True when the value at `position` shows up again further right.
    var repeatsLater = function(position) {
        for (var next = position + 1; next < total; next++) {
            if (arr[next] === arr[position]) {
                return true;
            }
        }
        return false;
    };
    for (var position = 0; position < total; position++) {
        if (!repeatsLater(position)) {
            result.push(arr[position]);
        }
    }
    return result;
};
/**
 * Tells whether `val` occurs in `arr`. Despite the name it returns a boolean,
 * not an index. Elements are compared with loose equality (==).
 */
function indexInArray(arr, val) {
    var position = 0;
    var found = false;
    while (!found && position < arr.length) {
        found = (arr[position] == val);
        position += 1;
    }
    return found;
}
/**
 * Extracts the media type from a response header map, dropping parameters
 * such as "; charset=utf-8".
 *
 * @param headers object with lower-cased header names as keys
 * @returns the part of the content-type header before the first ';', or ''
 *          when the header is missing (crawl_page treats '' as acceptable)
 */
function get_content_type(headers) {
    var raw = headers && headers['content-type'];
    if (typeof raw !== 'string') {
        // Responses without a content-type used to crash here with a TypeError.
        return '';
    }
    return raw.split(';')[0];
}
var libxml = require("libxmljs"),
http = require("http"),
url = require("url"),
settings = require("./settings"),
couch = require("./node-couch").CouchDB,
sys = require("sys");
// HTTP client for the crawled site on port 80 (only referenced by the
// commented-out crawl_page call in this script).
var target_site = http.createClient(80, settings.targethost);
// CouchDB database handles:
//   db  - database named by settings.couchbase; source of the `people` queue, written by save_page / save_user
//   db2 - 'habratwitter'; written by save_twitter
//   db3 - 'leaders'; not referenced elsewhere in the visible part of this script
var db = couch.db(settings.couchbase, settings.couchhost);
var db2 = couch.db('habratwitter', settings.couchhost);
var db3 = couch.db('leaders', settings.couchhost);
/**
 * Parses an HTML string with libxml.parseHtmlString.
 * When parsing throws, the offending input is logged and an empty object is
 * returned, so callers can detect the failure by the missing `find` method.
 */
var parsePage = function(string) {
    try {
        return libxml.parseHtmlString(string);
    } catch (parseError) {
        sys.puts('Cannot parse: ' + string);
    }
    return {};
};
/**
 * Collects the site-internal link targets of a parsed page.
 *
 * @param parsed_html parsed document exposing find(xpath)
 * @param baseURL     URL the page was fetched from; relative hrefs are resolved against it
 * @returns array of "pathname[?search]" strings for links that have no host
 *          name or whose host name contains settings.targethost
 */
var getLinks = function(parsed_html, baseURL) {
    var anchors = parsed_html.find('//a');
    var destinations = [];
    // Index loop with a declared counter: the former `for (link in links)`
    // leaked a global `link` and walked the array with for...in.
    for (var i = 0; i < anchors.length; i++) {
        var attr = anchors[i].attr('href');
        if (!attr || !attr.value) {
            continue; // anchor without href
        }
        var url_parts = url.parse(url.resolve(baseURL, attr.value()));
        if (url_parts.hostname && url_parts.hostname.indexOf(settings.targethost) === -1) {
            continue; // outbound link
        }
        var destination = url_parts.pathname;
        if (url_parts.search) {
            destination = destination + url_parts.search;
        }
        destinations.push(destination);
    }
    return destinations;
};
/**
 * Extracts the users listed in the rating table of a parsed page and stores
 * each one through save_user(profile URL, name, karma, rating).
 *
 * Rows with fewer than five <td> cells (for example header rows) are reported
 * as 'Broken user' and skipped. A page without the table is logged and ignored.
 *
 * @param parsed_html parsed document exposing find(xpath)
 */
var getUsers = function(parsed_html) {
    sys.puts('Getting users table');
    var usertable = parsed_html.find("//div[@id='rate-table-wrap']/table");
    sys.puts('Got users table');
    // find() yields an empty list when the table is absent; indexing it blindly
    // used to throw a TypeError.
    if (!usertable || !usertable[0] || !usertable[0].find) {
        sys.puts('Users table not found');
        return;
    }
    sys.puts('Getting user rows');
    var users = usertable[0].find('tr');
    sys.puts('Got user rows');
    // Declared index instead of the former implicit global `user` in a for...in loop.
    for (var i = 0; i < users.length; i++) {
        var row = users[i];
        var cells = (row && row.find) ? row.find('td') : [];
        // Cells 2, 3 and 4 are read below, so at least five must be present.
        if (cells.length < 5) {
            sys.puts('Broken user');
            continue;
        }
        var profile_dl = cells[2].find('dl')[0];
        var karma = cells[3].text();
        var rating = cells[4].text();
        var profile_link = 'none';
        if (profile_dl && profile_dl.find) {
            var profile_anchor = profile_dl.find('dt')[0].find('a')[0];
            var href = profile_anchor.attr('href');
            // Store the attribute's string value, not the attribute object itself.
            if (href && href.value) {
                profile_link = String(href.value());
            }
            save_user(profile_link, profile_anchor.text(), karma, rating);
        }
        sys.puts('User URL: ' + profile_link);
    }
};
/**
 * Requests the root path '/' over `connection`, sending `host` as the Host
 * header, and collects the response body as UTF-8 text.
 *
 * @param host       value of the Host header
 * @param connection client object providing request(method, path, headers)
 * @param callback   called once as callback(statusCode, bodyText, headers)
 *                   when the response has ended
 */
var getPage = function(host, connection, callback) {
    // Accumulates the body chunks of one response and reports the result at the end.
    var collectBody = function (response) {
        response.setEncoding("utf8");
        var text = '';
        response.addListener("data", function (chunk) {
            text += chunk;
        });
        response.addListener('end', function () {
            callback(response.statusCode, text, response.headers);
        });
    };
    var request = connection.request("GET", '/', {"host": host});
    request.addListener('response', collectBody);
    request.end();
};
/**
 * Removes every <script> and <style> element from the parsed page (the
 * document is modified in place) and returns the text of its <body>.
 *
 * @param parsed_html parsed document exposing find(xpath) and get(xpath)
 * @returns body text, or '' (with a log line) when the page has no body
 */
var cleanPage = function(parsed_html) {
    // Declared index loop; the original for...in loops leaked the globals
    // `script` and `style`.
    var removeAll = function (xpath) {
        var nodes = parsed_html.find(xpath);
        for (var i = 0; i < nodes.length; i++) {
            nodes[i].remove();
        }
    };
    removeAll('//script');
    removeAll('//style');

    var body = parsed_html.get('/html/body');
    if (body && body.text) {
        return body.text();
    }
    sys.puts('Body is empty?');
    return '';
};
/**
 * Returns the text of the page's <title> element.
 *
 * @param parsed_html parsed document exposing get(xpath)
 * @returns the title text, or '' when the page has no <title>
 *          (calling text() on the missing node used to throw)
 */
var pageTitle = function(parsed_html) {
    var title = parsed_html.get('//head/title');
    if (!title || !title.text) {
        return '';
    }
    return title.text();
};
// Page paths discovered so far; crawl_page appends redirect targets and de-duplicates the list.
var known_pages = [];
// Paths that get_next_page has already handed out for crawling.
var visited_pages = [];
// Number of crawl streams started so far; crawl_page compares it with settings.max_streams.
var num_of_streams = 0;
/**
 * Hands out the next page to crawl: the first truthy entry of known_pages
 * that is not yet in visited_pages. The entry is marked as visited before it
 * is returned. When nothing is left the process is terminated.
 *
 * @returns the path of the next page
 */
var get_next_page = function() {
    // Declared index loop: the former `for (page in known_pages)` created a
    // global `page`. The extra typeof check was redundant, because undefined
    // entries already fail the truthiness test.
    for (var i = 0; i < known_pages.length; i++) {
        var candidate = known_pages[i];
        if (candidate && !indexInArray(visited_pages, candidate)) {
            visited_pages.push(candidate);
            return candidate;
        }
    }
    process.exit(); // End of list
};
/**
 * Fetches one page, processes it and schedules the next fetch of this stream.
 *
 * - 200 with an HTML/plain/empty content type: the page is parsed, stripped
 *   of scripts and styles (cleanPage) and its user table is stored (getUsers).
 * - 301 / 303: the redirect target is added to known_pages.
 * - other codes are only logged.
 *
 * After every response the next page is requested settings.crawl_timeout
 * later on the same connection; an extra stream is started while fewer than
 * settings.max_streams exist and unvisited pages remain.
 *
 * To store page text again, pass pageTitle(parsed_page) and the result of
 * cleanPage(parsed_page) to save_page at the marked spot.
 *
 * @param URL        path of the page to fetch
 * @param connection HTTP client handed through to getPage
 * @param stream_id  number used only in log messages
 */
var crawl_page = function (URL, connection, stream_id) {
    sys.puts('Stream ' + stream_id + ' visiting ' + URL);
    getPage(URL, connection, function(code, text, headers) {
        sys.puts('Got ' + code + ' answer from ' + URL);
        var links = [];
        if (code == 200) {
            var content_type = get_content_type(headers);
            if (content_type == 'text/html' || content_type == 'text/plain' || content_type == '') {
                // Local variable; it used to leak as an implicit global.
                var parsed_page = parsePage(text);
                if (parsed_page.find) {
                    // Called for its side effect (scripts/styles are removed
                    // before the table is read); the text itself is unused
                    // while save_page is disabled.
                    cleanPage(parsed_page);
                    getUsers(parsed_page);
                } else {
                    sys.puts('Bad parsed page: ' + URL);
                }
            } else {
                // Was `content-type`, i.e. `content - type`, which raised a
                // ReferenceError instead of logging.
                sys.puts('Strange content type: ' + content_type);
            }
        } else if (code == 301 || code == 303) {
            // Return redirect location to known pages
            links = [headers.location];
        } else if (code == 404) {
            // Do nothing, maybe add some sort of log entry
        } else if (code == 400) {
            sys.puts('Bad request: ' + URL);
        } else {
            sys.puts('Unknown code: ' + code + '\nHeaders is: ' + JSON.stringify(headers));
        }
        known_pages = unique(known_pages.concat(links));
        setTimeout(function() {
            crawl_page(get_next_page(), connection, stream_id);
        }, settings.crawl_timeout);
        // Create new stream if available and have unvisited pages
        if (num_of_streams < settings.max_streams && known_pages.length > visited_pages.length) {
            num_of_streams++;
            var new_connection = http.createClient(80, settings.targethost);
            crawl_page(get_next_page(), new_connection, num_of_streams);
            sys.puts('Starting another stream: ' + num_of_streams + ' of ' + settings.max_streams);
        }
    });
};
// Running document id for stored pages; starts at 1 and grows by one per save.
var doc_id = 1;
/**
 * Stores a crawled page in `db` under the current doc_id, then advances the id.
 */
var save_page = function (URL, title, text) {
    var page_doc = {'url' : URL, 'title' : title, 'text' : text, '_id': doc_id};
    db.saveDoc(page_doc);
    doc_id += 1;
};
/**
 * Stores one user record (profile URL, name, karma, rating) in `db`.
 * No _id is supplied with the document.
 */
var save_user = function (URL, name, karma, rating) {
    var user_doc = {'url' : URL, 'name' : name, 'karma' : karma, 'rating': rating};
    db.saveDoc(user_doc);
};
/**
 * Stores the Twitter account name found for a user in `db2`.
 */
var save_twitter = function (user, accname) {
    var account_doc = {'name' : user, 'account' : accname};
    db2.saveDoc(account_doc);
};
// crawl_page('/people/', target_site, 1);
/*for (var i = 1; i < 100; i++) {
known_pages.push('/people/page'+ i + '/');
} */
num_of_streams = 1;
// Queue of document keys from `db` (the crawled users) that getpeople still
// has to visit.
var people = [];
// Load every document key, then start the profile lookup chain.
db.allDocs({
    success: function(docs) {
        // Declared index loop: `for (row in docs.rows)` created a global `row`
        // and iterated the array with for...in.
        for (var i = 0; i < docs.rows.length; i++) {
            people.push(docs.rows[i].key);
        }
        getpeople();
    },
    error: function() {
        sys.puts('Error getting docs');
    }
});
var querystring = require('querystring');
/**
 * Looks for a profile entry of the form <dl><dt>Twitter:</dt><dd><a>name</a></dd></dl>
 * and reports the link text.
 *
 * @param parsed   parsed profile page exposing find(xpath); a failed parse
 *                 (object without find) is treated as "nothing found"
 * @param callback called exactly once with the account name, or null when
 *                 there is none; if several entries match, the last one wins
 */
var catchTwitter = function(parsed, callback) {
    var twitter = null;
    var lists = (parsed && parsed.find) ? parsed.find("//dl") : [];
    // Declared index loop; `for (data in dl)` leaked a global `data`.
    for (var i = 0; i < lists.length; i++) {
        var entry = lists[i];
        if (!entry.find) {
            continue;
        }
        var terms = entry.find('dt');
        if (terms.length === 0 || terms[0].text() !== 'Twitter:') {
            continue;
        }
        // A Twitter entry without <dd> or without a link is skipped instead of throwing.
        var definitions = entry.find('dd');
        var anchors = definitions.length > 0 ? definitions[0].find('a') : [];
        if (anchors.length > 0) {
            twitter = anchors[0].text();
        }
    }
    callback(twitter);
};
/**
 * Processes one entry of the `people` queue: loads the user document from
 * `db`, fetches the root page of the host found in user.url, looks for a
 * Twitter account on it (catchTwitter) and stores a hit with save_twitter.
 * Afterwards it re-schedules itself for the next entry.
 *
 * A non-200 answer puts the user back at the FRONT of the queue (entries are
 * taken from the back), so the retry happens only after the remaining users;
 * note that a profile that never answers 200 is still retried indefinitely.
 * The process exits once the queue is empty.
 */
var getpeople = function() {
    // Same termination rule as getleaders; without it an exhausted queue
    // led to db.openDoc(undefined).
    if (people.length == 0) {
        sys.puts('List finished');
        process.exit();
        return;
    }
    var habrauser = people.pop();

    // Schedules the next queue entry after `delay` milliseconds.
    var next_round = function(delay) {
        setTimeout(function() {
            sys.puts('New round!');
            getpeople();
        }, delay);
    };

    db.openDoc(habrauser, {
        success: function(user) {
            sys.puts('habrauser: ' + user.name);
            var url_parts = url.parse(user.url);
            sys.puts('Connecting to ' + url_parts.host);
            // One client per profile host. The former `delete client` /
            // `delete parsed` statements were removed: delete has no effect on
            // var-declared locals and is a syntax error in strict mode.
            var profile_client = http.createClient(80, url_parts.host);
            getPage(url_parts.host, profile_client, function(code, pagetext, headers) {
                sys.puts('Code is ' + code);
                if (code == 200) {
                    var parsed = parsePage(pagetext);
                    catchTwitter(parsed, function(twiacc) {
                        if (twiacc) {
                            sys.puts('Account found:' + twiacc);
                            save_twitter(user.name, twiacc);
                        }
                        next_round(settings.crawl_timeout);
                    });
                } else {
                    // push() followed by pop() retried the very same user
                    // immediately and starved everyone else.
                    people.unshift(habrauser);
                    next_round(settings.crawl_timeout * 2);
                }
            });
        },
        error: function() {
            sys.puts('Error happened.');
            // Keep going with the next entry; previously the chain silently
            // stopped after the first unreadable document.
            next_round(settings.crawl_timeout);
        }
    });
};
/**
 * Returns a new array without duplicates. Values are compared with ===;
 * of several equal values only the last occurrence is kept, and the
 * survivors stay in their original relative order.
 */
var unique = function(arr) {
    var total = arr.length;
    var result = [];
    // True when the value at `position` shows up again further right.
    var repeatsLater = function(position) {
        for (var next = position + 1; next < total; next++) {
            if (arr[next] === arr[position]) {
                return true;
            }
        }
        return false;
    };
    for (var position = 0; position < total; position++) {
        if (!repeatsLater(position)) {
            result.push(arr[position]);
        }
    }
    return result;
};
/**
 * Tells whether `val` occurs in `arr`. Despite the name it returns a boolean,
 * not an index. Elements are compared with loose equality (==).
 */
function indexInArray(arr, val) {
    var position = 0;
    var found = false;
    while (!found && position < arr.length) {
        found = (arr[position] == val);
        position += 1;
    }
    return found;
}
/**
 * Extracts the media type from a response header map, dropping parameters
 * such as "; charset=utf-8".
 *
 * @param headers object with lower-cased header names as keys
 * @returns the part of the content-type header before the first ';', or ''
 *          when the header is missing (crawl_page treats '' as acceptable)
 */
function get_content_type(headers) {
    var raw = headers && headers['content-type'];
    if (typeof raw !== 'string') {
        // Responses without a content-type used to crash here with a TypeError.
        return '';
    }
    return raw.split(';')[0];
}
var libxml = require("libxmljs"),
http = require("http"),
url = require("url"),
settings = require("./settings"),
couch = require("./node-couch").CouchDB,
sys = require("sys");
// HTTP client for the crawled site on port 80; handed to the first crawl_page call.
var target_site = http.createClient(80, settings.targethost);
// CouchDB database named by settings.couchbase; save_page and save_user write to it.
var db = couch.db(settings.couchbase, settings.couchhost);
/**
 * Parses an HTML string with libxml.parseHtmlString.
 * When parsing throws, the offending input is logged and an empty object is
 * returned, so callers can detect the failure by the missing `find` method.
 */
var parsePage = function(string) {
    try {
        return libxml.parseHtmlString(string);
    } catch (parseError) {
        sys.puts('Cannot parse: ' + string);
    }
    return {};
};
/**
 * Collects the site-internal link targets of a parsed page.
 *
 * @param parsed_html parsed document exposing find(xpath)
 * @param baseURL     URL the page was fetched from; relative hrefs are resolved against it
 * @returns array of "pathname[?search]" strings for links that have no host
 *          name or whose host name contains settings.targethost
 */
var getLinks = function(parsed_html, baseURL) {
    var anchors = parsed_html.find('//a');
    var destinations = [];
    // Index loop with a declared counter: the former `for (link in links)`
    // leaked a global `link` and walked the array with for...in.
    for (var i = 0; i < anchors.length; i++) {
        var attr = anchors[i].attr('href');
        if (!attr || !attr.value) {
            continue; // anchor without href
        }
        var url_parts = url.parse(url.resolve(baseURL, attr.value()));
        if (url_parts.hostname && url_parts.hostname.indexOf(settings.targethost) === -1) {
            continue; // outbound link
        }
        var destination = url_parts.pathname;
        if (url_parts.search) {
            destination = destination + url_parts.search;
        }
        destinations.push(destination);
    }
    return destinations;
};
/**
 * Extracts the users listed in the rating table of a parsed page and stores
 * each one through save_user(profile URL, name, karma, rating).
 *
 * Rows with fewer than five <td> cells (for example header rows) are reported
 * as 'Broken user' and skipped. A page without the table is logged and ignored.
 *
 * @param parsed_html parsed document exposing find(xpath)
 */
var getUsers = function(parsed_html) {
    sys.puts('Getting users table');
    var usertable = parsed_html.find("//div[@id='rate-table-wrap']/table");
    sys.puts('Got users table');
    // find() yields an empty list when the table is absent; indexing it blindly
    // used to throw a TypeError.
    if (!usertable || !usertable[0] || !usertable[0].find) {
        sys.puts('Users table not found');
        return;
    }
    sys.puts('Getting user rows');
    var users = usertable[0].find('tr');
    sys.puts('Got user rows');
    // Declared index instead of the former implicit global `user` in a for...in loop.
    for (var i = 0; i < users.length; i++) {
        var row = users[i];
        var cells = (row && row.find) ? row.find('td') : [];
        // Cells 2, 3 and 4 are read below, so at least five must be present.
        if (cells.length < 5) {
            sys.puts('Broken user');
            continue;
        }
        var profile_dl = cells[2].find('dl')[0];
        var karma = cells[3].text();
        var rating = cells[4].text();
        var profile_link = 'none';
        if (profile_dl && profile_dl.find) {
            var profile_anchor = profile_dl.find('dt')[0].find('a')[0];
            var href = profile_anchor.attr('href');
            // A link without an href no longer throws; it is stored as 'none'.
            if (href && href.value) {
                profile_link = String(href.value());
            }
            save_user(profile_link, profile_anchor.text(), karma, rating);
        }
        sys.puts('User URL: ' + profile_link);
    }
};
/**
 * Issues a GET for `URL` over `connection`, sending settings.targethost as
 * the Host header, and collects the response body as UTF-8 text.
 *
 * @param URL        request path
 * @param connection client object providing request(method, path, headers)
 * @param callback   called once as callback(statusCode, bodyText, headers)
 *                   when the response has ended
 */
var getPage = function(URL, connection, callback) {
    // Accumulates the body chunks of one response and reports the result at the end.
    var collectBody = function (response) {
        response.setEncoding("utf8");
        var text = '';
        response.addListener("data", function (chunk) {
            text += chunk;
        });
        response.addListener('end', function () {
            callback(response.statusCode, text, response.headers);
        });
    };
    var request = connection.request("GET", URL, {"host": settings.targethost});
    request.addListener('response', collectBody);
    request.end();
};
/**
 * Removes every <script> and <style> element from the parsed page (the
 * document is modified in place) and returns the text of its <body>.
 *
 * @param parsed_html parsed document exposing find(xpath) and get(xpath)
 * @returns body text, or '' (with a log line) when the page has no body
 */
var cleanPage = function(parsed_html) {
    // Declared index loop; the original for...in loops leaked the globals
    // `script` and `style`.
    var removeAll = function (xpath) {
        var nodes = parsed_html.find(xpath);
        for (var i = 0; i < nodes.length; i++) {
            nodes[i].remove();
        }
    };
    removeAll('//script');
    removeAll('//style');

    var body = parsed_html.get('/html/body');
    if (body && body.text) {
        return body.text();
    }
    sys.puts('Body is empty?');
    return '';
};
/**
 * Returns the text of the page's <title> element.
 *
 * @param parsed_html parsed document exposing get(xpath)
 * @returns the title text, or '' when the page has no <title>
 *          (calling text() on the missing node used to throw)
 */
var pageTitle = function(parsed_html) {
    var title = parsed_html.get('//head/title');
    if (!title || !title.text) {
        return '';
    }
    return title.text();
};
// Page paths discovered so far; seeded at the bottom of the script, extended by crawl_page with redirect targets.
var known_pages = [];
// Paths that get_next_page has already handed out for crawling.
var visited_pages = [];
// Number of crawl streams started so far; crawl_page compares it with settings.max_streams.
var num_of_streams = 0;
/**
 * Hands out the next page to crawl: the first truthy entry of known_pages
 * that is not yet in visited_pages. The entry is marked as visited before it
 * is returned. When nothing is left the process is terminated.
 *
 * @returns the path of the next page
 */
var get_next_page = function() {
    // Declared index loop: the former `for (page in known_pages)` created a
    // global `page`. The extra typeof check was redundant, because undefined
    // entries already fail the truthiness test.
    for (var i = 0; i < known_pages.length; i++) {
        var candidate = known_pages[i];
        if (candidate && !indexInArray(visited_pages, candidate)) {
            visited_pages.push(candidate);
            return candidate;
        }
    }
    process.exit(); // End of list
};
/**
 * Fetches one page, processes it and schedules the next fetch of this stream.
 *
 * - 200 with an HTML/plain/empty content type: the page is parsed and its
 *   user table is stored (getUsers).
 * - 301 / 303: the redirect target is added to known_pages.
 * - other codes are only logged.
 *
 * After every response the next page is requested settings.crawl_timeout
 * later on the same connection; an extra stream is started while fewer than
 * settings.max_streams exist and unvisited pages remain.
 *
 * To store page text again, pass pageTitle(parsed_page) and
 * cleanPage(parsed_page) to save_page at the marked spot.
 *
 * @param URL        path of the page to fetch
 * @param connection HTTP client handed through to getPage
 * @param stream_id  number used only in log messages
 */
var crawl_page = function (URL, connection, stream_id) {
    sys.puts('Stream ' + stream_id + ' visiting ' + URL);
    getPage(URL, connection, function(code, text, headers) {
        sys.puts('Got ' + code + ' answer from ' + URL);
        var links = [];
        if (code == 200) {
            var content_type = get_content_type(headers);
            if (content_type == 'text/html' || content_type == 'text/plain' || content_type == '') {
                // Local variable; it used to leak as an implicit global.
                var parsed_page = parsePage(text);
                if (parsed_page.find) {
                    // Page storage (save_page) is disabled; only users are stored.
                    getUsers(parsed_page);
                    // Drop the reference explicitly. The former
                    // `delete parsed_page` is illegal in strict mode and does
                    // nothing for declared variables.
                    parsed_page = null;
                } else {
                    sys.puts('Bad parsed page: ' + URL);
                }
            } else {
                // Was `content-type`, i.e. `content - type`, which raised a
                // ReferenceError instead of logging.
                sys.puts('Strange content type: ' + content_type);
            }
        } else if (code == 301 || code == 303) {
            // Return redirect location to known pages
            links = [headers.location];
        } else if (code == 404) {
            // Do nothing, maybe add some sort of log entry
        } else if (code == 400) {
            sys.puts('Bad request: ' + URL);
        } else {
            sys.puts('Unknown code: ' + code + '\nHeaders is: ' + JSON.stringify(headers));
        }
        known_pages = unique(known_pages.concat(links));
        setTimeout(function() {
            crawl_page(get_next_page(), connection, stream_id);
        }, settings.crawl_timeout);
        // Create new stream if available and have unvisited pages
        if (num_of_streams < settings.max_streams && known_pages.length > visited_pages.length) {
            num_of_streams++;
            var new_connection = http.createClient(80, settings.targethost);
            crawl_page(get_next_page(), new_connection, num_of_streams);
            sys.puts('Starting another stream: ' + num_of_streams + ' of ' + settings.max_streams);
        }
    });
};
// Running document id for stored pages; starts at 1 and grows by one per save.
var doc_id = 1;
/**
 * Stores a crawled page in `db` under the current doc_id, then advances the id.
 */
var save_page = function (URL, title, text) {
    var page_doc = {'url' : URL, 'title' : title, 'text' : text, '_id': doc_id};
    db.saveDoc(page_doc);
    doc_id += 1;
};
/**
 * Stores one user record (profile URL, name, karma, rating) in `db`.
 * No _id is supplied with the document.
 */
var save_user = function (URL, name, karma, rating) {
    var user_doc = {'url' : URL, 'name' : name, 'karma' : karma, 'rating': rating};
    db.saveDoc(user_doc);
};
// Entry point of the crawl: request the first listing page as stream 1.
// The response is handled in a callback registered by getPage, so the
// statements below are expected to run before crawl_page first consults
// known_pages (assumes the HTTP client delivers responses asynchronously).
crawl_page('/people/', target_site, 1);
// Seed the queue with the numbered listing pages /people/page1/ ... /people/page99/.
for (var i = 1; i < 100; i++) {
known_pages.push('/people/page'+ i + '/');
}
// One stream (the call above) is now running; crawl_page checks this
// counter against settings.max_streams before starting another one.
num_of_streams = 1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment