-
-
Save kurokikaze/e37a7d8378822cfb8969 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Reads leaders.json and prints the rows with the largest `value`,
// best first. Each row is expected to carry `key` and `value` fields
// (presumably a CouchDB view dump of account -> follower count; verify
// against whatever produces leaders.json).
var fs = require('fs'),
    sys = require('sys');

// Maximum number of places to print.
var TOP_COUNT = 30;

fs.readFile('leaders.json', 'utf-8', function(err, dataraw) {
    if (err) {
        sys.puts('Err!');
        // Rethrow the original error so its message and stack survive.
        throw err;
    }
    var data = JSON.parse(dataraw);
    // Ascending sort: the biggest values end up at the tail and are
    // taken off with pop() below.
    data.rows.sort(function(a, b) {
        return a.value - b.value;
    });
    // Never pop more rows than the file actually contains.
    var places = Math.min(TOP_COUNT, data.rows.length);
    for (var i = 1; i <= places; i++) {
        var leader = data.rows.pop();
        sys.puts(i + ' place: ' + leader.key + ' with ' + leader.value + ' followers');
    }
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Shared crawler configuration, loaded by the scripts via require("./settings").
var config = {
    // host:port handed to couch.db() as the server address
    couchhost: '192.168.175.128:5984',
    // name of the database handed to couch.db()
    couchbase: 'habrausers',
    // site the crawler connects to and treats as "internal" for links
    targethost: 'habrahabr.ru',
    // upper bound compared against the number of running crawl streams
    max_streams: 1,
    // delay passed to setTimeout() between requests
    crawl_timeout: 800
};

Object.keys(config).forEach(function (name) {
    exports[name] = config[name];
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a new array holding the elements of `arr` without duplicates.
 * Elements are compared with ===. When a value occurs more than once,
 * only its last occurrence is kept, so the result follows the order of
 * those last occurrences.
 */
var unique = function(arr) {
    var total = arr.length;
    var result = [];
    var pos = 0;
    while (pos < total) {
        var repeatedLater = false;
        for (var next = pos + 1; next < total; next++) {
            if (arr[next] === arr[pos]) {
                repeatedLater = true;
                break;
            }
        }
        // Keep an element only when nothing equal follows it.
        if (!repeatedLater) {
            result.push(arr[pos]);
        }
        pos += 1;
    }
    return result;
};
/**
 * Tells whether `val` occurs in `arr`.
 * Despite its name this returns a boolean, not an index. Elements are
 * compared with loose equality (==), so 2 matches '2'.
 */
function indexInArray(arr, val) {
    var pos = 0;
    while (pos < arr.length) {
        if (arr[pos] == val) {
            return true;
        }
        pos += 1;
    }
    return false;
}
/**
 * Extracts the media type from a response header map, i.e. the part of
 * the `content-type` header before the first ';'.
 *
 * @param headers object keyed by lower-case header name
 * @returns the media type, or '' when the header is missing
 */
function get_content_type(headers) {
    var raw = headers ? headers['content-type'] : undefined;
    // A response without the header must not crash the caller, which
    // already treats '' as an acceptable content type.
    if (typeof raw !== 'string') {
        return '';
    }
    return raw.split(';')[0];
}
var libxml = {}, // require("libxmljs"), | |
http = require("http"), | |
url = require("url"), | |
settings = require("./settings"), | |
couch = require("./node-couch").CouchDB, | |
sys = require("sys"); | |
//var Riak = require('riak-js'); | |
// HTTP client for the crawled site (port 80) and handles to the CouchDB
// databases this script writes to, all living on settings.couchhost.
var target_site = http.createClient(80, settings.targethost),
    db = couch.db(settings.couchbase, settings.couchhost),   // used by save_page / save_user
    db2 = couch.db('habratwitter', settings.couchhost),      // used by save_twitter and the startup listing
    db3 = couch.db('lead_new', settings.couchhost);          // used by save_leader
// var db = new Riak.Client({host: '127.0.0.1', port: 8098, debug: false}); | |
/**
 * Parses an HTML string with libxml.parseHtmlString.
 * If parsing throws, the failure is logged together with the input and
 * an empty object is returned, so callers can detect failure by the
 * missing `find` method.
 */
var parsePage = function(string) {
    var document = {};
    try {
        document = libxml.parseHtmlString(string);
    } catch (parseError) {
        sys.puts('Cannot parse: ' + string);
    }
    return document;
};
/**
 * Collects the site-internal link targets of a parsed page.
 *
 * Every <a href> is resolved against `baseURL`. Links without a host,
 * or whose host contains settings.targethost, are returned as
 * path + query string; all other links are treated as outbound and
 * skipped.
 *
 * @param parsed_html parsed document exposing find(xpath)
 * @param baseURL     URL the hrefs are resolved against
 * @returns array of path strings
 */
var getLinks = function(parsed_html, baseURL) {
    var links = parsed_html.find('//a');
    var destinations = [];
    // Indexed loop with a local counter: for-in would walk inherited
    // keys too and the undeclared loop variable leaked into the globals.
    for (var i = 0; i < links.length; i++) {
        var attr = links[i].attr('href');
        if (!attr || !attr.value) {
            continue;
        }
        var url_parts = url.parse(url.resolve(baseURL, attr.value()));
        var internal = !url_parts.hostname ||
            url_parts.hostname.indexOf(settings.targethost) > -1;
        if (!internal) {
            continue;
        }
        var destination = url_parts.pathname;
        if (url_parts.search) {
            destination = destination + url_parts.search;
        }
        destinations.push(destination);
    }
    return destinations;
};
/**
 * Extracts the users listed in the rating table of a parsed page and
 * stores each of them through save_user(url, name, karma, rating).
 *
 * The table is looked up as //div[@id='rate-table-wrap']/table. In each
 * row, cell 2 holds the profile <dl>, cell 3 the karma and cell 4 the
 * rating. Rows with fewer than five cells (e.g. header rows) are logged
 * as 'Broken user' and skipped.
 */
var getUsers = function(parsed_html) {
    sys.puts('Getting users table');
    var usertable = parsed_html.find("//div[@id='rate-table-wrap']/table");
    sys.puts('Got users table');
    // A page without the table must not crash the crawler.
    if (!usertable || !usertable[0] || !usertable[0].find) {
        return;
    }
    sys.puts('Getting user rows');
    var users = usertable[0].find('tr');
    sys.puts('Got user rows');
    for (var i = 0; i < users.length; i++) {
        var row = users[i];
        var cells = row.find ? row.find('td') : [];
        if (cells.length < 5) {
            sys.puts('Broken user');
            continue;
        }
        var profile_dl = cells[2].find('dl')[0];
        var karma = cells[3].text();
        var rating = cells[4].text();
        var profile_link = 'none';
        if (profile_dl && profile_dl.find) {
            var anchor = profile_dl.find('dt')[0].find('a')[0];
            // Store the attribute's string value, not the attribute node.
            profile_link = String(anchor.attr('href').value());
            save_user(profile_link, anchor.text(), karma, rating);
        }
        sys.puts('User URL: ' + profile_link);
    }
};
/**
 * Issues a GET for `url` over `connection`, always sending
 * 'api.twitter.com' as the Host header, and buffers the response.
 * Once the response ends, `callback(statusCode, bodyText, headers)` is
 * invoked. Note that the `url` parameter shadows the url module inside
 * this function.
 */
var getPage = function(url, connection, callback) {
    var requestHeaders = {"host": 'api.twitter.com'};
    var request = connection.request("GET", url, requestHeaders);

    function onResponse(response) {
        var body = '';
        response.setEncoding("utf8");
        response.addListener("data", function (chunk) {
            body = body + chunk;
        });
        response.addListener('end', function () {
            callback(response.statusCode, body, response.headers);
        });
    }

    request.addListener('response', onResponse);
    request.end();
};
/**
 * Removes every <script> and <style> node from the parsed document
 * (mutating it) and returns the text of /html/body.
 * When the body is missing, 'Body is empty?' is logged and '' returned.
 */
var cleanPage = function(parsed_html) {
    var unwanted = ['//script', '//style'];
    for (var x = 0; x < unwanted.length; x++) {
        var nodes = parsed_html.find(unwanted[x]);
        // Indexed loop with local counters instead of for-in over an
        // array with undeclared (global) loop variables.
        for (var n = 0; n < nodes.length; n++) {
            nodes[n].remove();
        }
    }
    var body = parsed_html.get('/html/body');
    if (body && body.text) {
        return body.text();
    }
    sys.puts('Body is empty?');
    return '';
};
/**
 * Returns the text of the document's //head/title element, or '' when
 * the page has no title (instead of throwing on the missing node).
 */
var pageTitle = function(parsed_html) {
    var title = parsed_html.get('//head/title');
    if (!title || !title.text) {
        return '';
    }
    return title.text();
};
// Crawl state shared by the functions below.
var known_pages = [];     // every path discovered so far
var visited_pages = [];   // paths already handed out for crawling
var num_of_streams = 0;   // number of crawl streams started

/**
 * Returns the first known page that has not been visited yet and marks
 * it as visited. Falsy entries in known_pages are skipped.
 * When no unvisited page is left the whole process is terminated.
 */
var get_next_page = function() {
    // Indexed loop with a local counter; the original for-in used an
    // undeclared (global) loop variable.
    for (var i = 0; i < known_pages.length; i++) {
        var candidate = known_pages[i];
        if (candidate && !indexInArray(visited_pages, candidate)) {
            visited_pages.push(candidate);
            return candidate;
        }
    }
    process.exit(); // End of list
};
/**
 * Fetches `URL` over `connection`, processes the answer and schedules
 * the next page for this stream after settings.crawl_timeout.
 *
 * - 200 with an HTML/plain/empty content type: the page is parsed and
 *   its users are extracted via getUsers().
 * - 301 / 303: the redirect location is added to known_pages.
 * - 404 is ignored; 400 and any other code are logged.
 *
 * If fewer than settings.max_streams streams are running and unvisited
 * pages exist, an additional stream with its own connection is started.
 *
 * @param URL        path to request
 * @param connection HTTP client used for the request
 * @param stream_id  number used only in log output
 */
var crawl_page = function (URL, connection, stream_id) {
    sys.puts('Stream ' + stream_id + ' visiting ' + URL);
    getPage(URL, connection, function(code, text, headers) {
        sys.puts('Got ' + code + ' answer from ' + URL);
        var links = [];
        if (code == 200) {
            var content_type = get_content_type(headers);
            if (content_type == 'text/html' || content_type == 'text/plain' || content_type == '') {
                // Declared locally; it used to leak into the global scope.
                var parsed_page = parsePage(text);
                if (parsed_page.find) {
                    // title / page_text are only needed by the disabled
                    // save_page() call; cleanPage() also strips
                    // script/style nodes from the document.
                    var title = pageTitle(parsed_page);
                    var page_text = cleanPage(parsed_page);
                    getUsers(parsed_page);
                } else {
                    sys.puts('Bad parsed page: ' + URL);
                }
            } else {
                // Was `content-type`, i.e. the subtraction content - type,
                // which threw a ReferenceError.
                sys.puts('Strange content type: ' + content_type);
            }
        } else if (code == 301 || code == 303) {
            // Return redirect location to known pages
            links = [headers.location];
        } else if (code == 404) {
            // Do nothing, maybe add some sort of log entry
        } else if (code == 400) {
            sys.puts('Bad request: ' + URL);
        } else {
            sys.puts('Unknown code: ' + code + '\nHeaders is: ' + JSON.stringify(headers));
        }
        known_pages = unique(known_pages.concat(links));
        setTimeout(function() {
            crawl_page(get_next_page(), connection, stream_id);
        }, settings.crawl_timeout);
        // Create new stream if available and have unvisited pages
        if (num_of_streams < settings.max_streams && known_pages.length > visited_pages.length) {
            num_of_streams++;
            var new_connection = http.createClient(80, settings.targethost);
            crawl_page(get_next_page(), new_connection, num_of_streams);
            sys.puts('Starting another stream: ' + num_of_streams + ' of ' + settings.max_streams);
        }
    });
};
// Running counter used as the _id of the next saved page document.
var doc_id = 1;

/**
 * Stores one crawled page (url, title, text) in `db` under the current
 * doc_id and then advances the counter.
 */
var save_page = function (URL, title, text) {
    var page_doc = {};
    page_doc.url = URL;
    page_doc.title = title;
    page_doc.text = text;
    page_doc._id = doc_id;
    db.saveDoc(page_doc);
    doc_id += 1;
};
/**
 * Stores one user record (url, name, karma, rating) in `db`.
 * No _id is supplied with the document.
 */
var save_user = function (URL, name, karma, rating) {
    var user_doc = {};
    user_doc.url = URL;
    user_doc.name = name;
    user_doc.karma = karma;
    user_doc.rating = rating;
    db.saveDoc(user_doc);
};
/**
 * Stores the mapping from a site user name to a Twitter account name
 * in `db2`.
 */
var save_twitter = function (user, accname) {
    var link_doc = {};
    link_doc.name = user;
    link_doc.account = accname;
    db2.saveDoc(link_doc);
};
/**
 * Stores, in `db3`, the accounts a Twitter user follows: `user` is the
 * account name and `leader` the parsed friends/ids response.
 */
var save_leader = function (user, leader) {
    var leaders_doc = {};
    leaders_doc.user = user;
    leaders_doc.leaders = leader;
    db3.saveDoc(leaders_doc);
};
// Page crawling is disabled in this script:
// crawl_page('/people/', target_site, 1);
num_of_streams = 1;

// Queue of document ids from db2 that still have to be processed by
// getleaders().
var twitters = [];

// Load every document id from db2, then start working through the queue.
db2.allDocs({
    success: function(docs) {
        sys.puts('Got ' + docs.rows.length + ' twitterers');
        // Indexed loop with a local counter; the original for-in used an
        // undeclared (global) loop variable.
        for (var i = 0; i < docs.rows.length; i++) {
            twitters.push(docs.rows[i].id);
        }
        getleaders();
    },
    error: function() {
        sys.puts('Error getting docs');
    }
});
var querystring = require('querystring'); | |
/**
 * Looks for a Twitter account on a parsed profile page.
 *
 * Every <dl> is inspected; when its first <dt> reads 'Twitter:' the
 * text of the first link inside its first <dd> is taken. If several
 * match, the last one wins.
 *
 * @param parsed   parsed document (or the {} returned on parse failure)
 * @param callback called exactly once with the account name or null
 */
var catchTwitter = function(parsed, callback) {
    var twitter = null;
    // A failed parse yields an object without find(); report "nothing
    // found" instead of throwing.
    var lists = (parsed && parsed.find) ? parsed.find("//dl") : [];
    for (var i = 0; i < lists.length; i++) {
        var entry = lists[i];
        if (!entry.find) {
            continue;
        }
        var terms = entry.find('dt');
        if (terms.length === 0 || terms[0].text() != 'Twitter:') {
            continue;
        }
        var definition = entry.find('dd')[0];
        var anchor = definition ? definition.find('a')[0] : undefined;
        if (anchor) {
            twitter = anchor.text();
        }
    }
    callback(twitter);
};
// Single HTTP client reused for every Twitter API request.
var client = http.createClient(80, 'api.twitter.com');

/**
 * Processes one entry of the `twitters` queue: loads its document from
 * db2, requests the ids of the accounts that user follows and stores
 * them with save_leader(). The next entry is scheduled with a delay so
 * that roughly 140 requests per hour are issued.
 *
 * Terminates the process when the queue is empty or when the API
 * answers 400, which is logged as the hourly limit being reached.
 */
var getleaders = function() {
    // 3600 s / 140 requests is about 25.7 s between requests.
    var ROUND_DELAY = 25710;
    // Half of that interval, used when retrying after an unexpected code.
    var RETRY_DELAY = 12715;

    // Schedules the next queue entry after `delay` milliseconds.
    function nextRound(delay) {
        setTimeout(function() {
            sys.puts('New round!');
            getleaders();
        }, delay);
    }

    if (twitters.length == 0) {
        sys.puts('List finished');
        process.exit();
    }
    var twi_user = twitters.pop();
    db2.openDoc(twi_user, {
        success: function(user) {
            sys.puts('Twi-user: ' + user.account);
            sys.puts('Only ' + twitters.length + ' to go');
            // The account name comes from scraped pages, so escape it.
            var path = "/1/friends/ids.json?screen_name=" + encodeURIComponent(user.account);
            getPage(path, client, function(code, pagetext, headers) {
                sys.puts('Code is ' + code);
                if (code == 400) {
                    sys.puts('Hourly limit reached');
                    // The script stops here. The re-queue/retry statements
                    // that followed this call could never run and are gone.
                    process.exit();
                } else if (code == 200) {
                    var parsed;
                    try {
                        parsed = JSON.parse(pagetext);
                    } catch (parseError) {
                        // A malformed body must not kill the whole run.
                        sys.puts('Cannot parse response for ' + user.account);
                        nextRound(ROUND_DELAY);
                        return;
                    }
                    sys.puts(parsed.length + ' leaders found');
                    save_leader(user.account, parsed);
                    sys.puts('leaders saved');
                    nextRound(ROUND_DELAY);
                } else if (code == 401) {
                    sys.puts('Authorization required');
                    nextRound(ROUND_DELAY);
                } else if (code == 404) {
                    sys.puts('Non-existing user');
                    nextRound(ROUND_DELAY);
                } else {
                    sys.puts('Something is wrong here: ' + code);
                    twitters.push(twi_user); // return the entry to the queue
                    nextRound(RETRY_DELAY);
                }
            });
        },
        error: function() {
            // NOTE(review): processing stops here; no further round is scheduled.
            sys.puts('Error happened.');
        }
    });
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a new array holding the elements of `arr` without duplicates.
 * Elements are compared with ===. When a value occurs more than once,
 * only its last occurrence is kept, so the result follows the order of
 * those last occurrences.
 */
var unique = function(arr) {
    var total = arr.length;
    var result = [];
    var pos = 0;
    while (pos < total) {
        var repeatedLater = false;
        for (var next = pos + 1; next < total; next++) {
            if (arr[next] === arr[pos]) {
                repeatedLater = true;
                break;
            }
        }
        // Keep an element only when nothing equal follows it.
        if (!repeatedLater) {
            result.push(arr[pos]);
        }
        pos += 1;
    }
    return result;
};
/**
 * Tells whether `val` occurs in `arr`.
 * Despite its name this returns a boolean, not an index. Elements are
 * compared with loose equality (==), so 2 matches '2'.
 */
function indexInArray(arr, val) {
    var pos = 0;
    while (pos < arr.length) {
        if (arr[pos] == val) {
            return true;
        }
        pos += 1;
    }
    return false;
}
/**
 * Extracts the media type from a response header map, i.e. the part of
 * the `content-type` header before the first ';'.
 *
 * @param headers object keyed by lower-case header name
 * @returns the media type, or '' when the header is missing
 */
function get_content_type(headers) {
    var raw = headers ? headers['content-type'] : undefined;
    // A response without the header must not crash the caller, which
    // already treats '' as an acceptable content type.
    if (typeof raw !== 'string') {
        return '';
    }
    return raw.split(';')[0];
}
var libxml = require("libxmljs"), | |
http = require("http"), | |
url = require("url"), | |
settings = require("./settings"), | |
couch = require("./node-couch").CouchDB, | |
sys = require("sys"); | |
// HTTP client for the crawled site (port 80) and handles to the CouchDB
// databases on settings.couchhost.
var target_site = http.createClient(80, settings.targethost),
    db = couch.db(settings.couchbase, settings.couchhost),   // user records; read at startup, written by save_page / save_user
    db2 = couch.db('habratwitter', settings.couchhost),      // written by save_twitter
    db3 = couch.db('leaders', settings.couchhost);
/**
 * Parses an HTML string with libxml.parseHtmlString.
 * If parsing throws, the failure is logged together with the input and
 * an empty object is returned, so callers can detect failure by the
 * missing `find` method.
 */
var parsePage = function(string) {
    var document = {};
    try {
        document = libxml.parseHtmlString(string);
    } catch (parseError) {
        sys.puts('Cannot parse: ' + string);
    }
    return document;
};
/**
 * Collects the site-internal link targets of a parsed page.
 *
 * Every <a href> is resolved against `baseURL`. Links without a host,
 * or whose host contains settings.targethost, are returned as
 * path + query string; all other links are treated as outbound and
 * skipped.
 *
 * @param parsed_html parsed document exposing find(xpath)
 * @param baseURL     URL the hrefs are resolved against
 * @returns array of path strings
 */
var getLinks = function(parsed_html, baseURL) {
    var links = parsed_html.find('//a');
    var destinations = [];
    // Indexed loop with a local counter: for-in would walk inherited
    // keys too and the undeclared loop variable leaked into the globals.
    for (var i = 0; i < links.length; i++) {
        var attr = links[i].attr('href');
        if (!attr || !attr.value) {
            continue;
        }
        var url_parts = url.parse(url.resolve(baseURL, attr.value()));
        var internal = !url_parts.hostname ||
            url_parts.hostname.indexOf(settings.targethost) > -1;
        if (!internal) {
            continue;
        }
        var destination = url_parts.pathname;
        if (url_parts.search) {
            destination = destination + url_parts.search;
        }
        destinations.push(destination);
    }
    return destinations;
};
/**
 * Extracts the users listed in the rating table of a parsed page and
 * stores each of them through save_user(url, name, karma, rating).
 *
 * The table is looked up as //div[@id='rate-table-wrap']/table. In each
 * row, cell 2 holds the profile <dl>, cell 3 the karma and cell 4 the
 * rating. Rows with fewer than five cells (e.g. header rows) are logged
 * as 'Broken user' and skipped.
 */
var getUsers = function(parsed_html) {
    sys.puts('Getting users table');
    var usertable = parsed_html.find("//div[@id='rate-table-wrap']/table");
    sys.puts('Got users table');
    // A page without the table must not crash the crawler.
    if (!usertable || !usertable[0] || !usertable[0].find) {
        return;
    }
    sys.puts('Getting user rows');
    var users = usertable[0].find('tr');
    sys.puts('Got user rows');
    for (var i = 0; i < users.length; i++) {
        var row = users[i];
        var cells = row.find ? row.find('td') : [];
        if (cells.length < 5) {
            sys.puts('Broken user');
            continue;
        }
        var profile_dl = cells[2].find('dl')[0];
        var karma = cells[3].text();
        var rating = cells[4].text();
        var profile_link = 'none';
        if (profile_dl && profile_dl.find) {
            var anchor = profile_dl.find('dt')[0].find('a')[0];
            // Store the attribute's string value, not the attribute node.
            profile_link = String(anchor.attr('href').value());
            save_user(profile_link, anchor.text(), karma, rating);
        }
        sys.puts('User URL: ' + profile_link);
    }
};
/**
 * Requests the root path '/' over `connection`, sending `host` as the
 * Host header, and buffers the response. Once the response ends,
 * `callback(statusCode, bodyText, headers)` is invoked.
 */
var getPage = function(host, connection, callback) {
    var requestHeaders = {"host": host};
    var request = connection.request("GET", '/', requestHeaders);

    function onResponse(response) {
        var body = '';
        response.setEncoding("utf8");
        response.addListener("data", function (chunk) {
            body = body + chunk;
        });
        response.addListener('end', function () {
            callback(response.statusCode, body, response.headers);
        });
    }

    request.addListener('response', onResponse);
    request.end();
};
/**
 * Removes every <script> and <style> node from the parsed document
 * (mutating it) and returns the text of /html/body.
 * When the body is missing, 'Body is empty?' is logged and '' returned.
 */
var cleanPage = function(parsed_html) {
    var unwanted = ['//script', '//style'];
    for (var x = 0; x < unwanted.length; x++) {
        var nodes = parsed_html.find(unwanted[x]);
        // Indexed loop with local counters instead of for-in over an
        // array with undeclared (global) loop variables.
        for (var n = 0; n < nodes.length; n++) {
            nodes[n].remove();
        }
    }
    var body = parsed_html.get('/html/body');
    if (body && body.text) {
        return body.text();
    }
    sys.puts('Body is empty?');
    return '';
};
/**
 * Returns the text of the document's //head/title element, or '' when
 * the page has no title (instead of throwing on the missing node).
 */
var pageTitle = function(parsed_html) {
    var title = parsed_html.get('//head/title');
    if (!title || !title.text) {
        return '';
    }
    return title.text();
};
// Crawl state shared by the functions below.
var known_pages = [];     // every path discovered so far
var visited_pages = [];   // paths already handed out for crawling
var num_of_streams = 0;   // number of crawl streams started

/**
 * Returns the first known page that has not been visited yet and marks
 * it as visited. Falsy entries in known_pages are skipped.
 * When no unvisited page is left the whole process is terminated.
 */
var get_next_page = function() {
    // Indexed loop with a local counter; the original for-in used an
    // undeclared (global) loop variable.
    for (var i = 0; i < known_pages.length; i++) {
        var candidate = known_pages[i];
        if (candidate && !indexInArray(visited_pages, candidate)) {
            visited_pages.push(candidate);
            return candidate;
        }
    }
    process.exit(); // End of list
};
/**
 * Fetches `URL` over `connection`, processes the answer and schedules
 * the next page for this stream after settings.crawl_timeout.
 *
 * - 200 with an HTML/plain/empty content type: the page is parsed and
 *   its users are extracted via getUsers().
 * - 301 / 303: the redirect location is added to known_pages.
 * - 404 is ignored; 400 and any other code are logged.
 *
 * If fewer than settings.max_streams streams are running and unvisited
 * pages exist, an additional stream with its own connection is started.
 *
 * @param URL        value handed to getPage() and used in log output
 * @param connection HTTP client used for the request
 * @param stream_id  number used only in log output
 */
var crawl_page = function (URL, connection, stream_id) {
    sys.puts('Stream ' + stream_id + ' visiting ' + URL);
    getPage(URL, connection, function(code, text, headers) {
        sys.puts('Got ' + code + ' answer from ' + URL);
        var links = [];
        if (code == 200) {
            var content_type = get_content_type(headers);
            if (content_type == 'text/html' || content_type == 'text/plain' || content_type == '') {
                // Declared locally; it used to leak into the global scope.
                var parsed_page = parsePage(text);
                if (parsed_page.find) {
                    // title / page_text are only needed by the disabled
                    // save_page() call; cleanPage() also strips
                    // script/style nodes from the document.
                    var title = pageTitle(parsed_page);
                    var page_text = cleanPage(parsed_page);
                    getUsers(parsed_page);
                } else {
                    sys.puts('Bad parsed page: ' + URL);
                }
            } else {
                // Was `content-type`, i.e. the subtraction content - type,
                // which threw a ReferenceError.
                sys.puts('Strange content type: ' + content_type);
            }
        } else if (code == 301 || code == 303) {
            // Return redirect location to known pages
            links = [headers.location];
        } else if (code == 404) {
            // Do nothing, maybe add some sort of log entry
        } else if (code == 400) {
            sys.puts('Bad request: ' + URL);
        } else {
            sys.puts('Unknown code: ' + code + '\nHeaders is: ' + JSON.stringify(headers));
        }
        known_pages = unique(known_pages.concat(links));
        setTimeout(function() {
            crawl_page(get_next_page(), connection, stream_id);
        }, settings.crawl_timeout);
        // Create new stream if available and have unvisited pages
        if (num_of_streams < settings.max_streams && known_pages.length > visited_pages.length) {
            num_of_streams++;
            var new_connection = http.createClient(80, settings.targethost);
            crawl_page(get_next_page(), new_connection, num_of_streams);
            sys.puts('Starting another stream: ' + num_of_streams + ' of ' + settings.max_streams);
        }
    });
};
// Running counter used as the _id of the next saved page document.
var doc_id = 1;

/**
 * Stores one crawled page (url, title, text) in `db` under the current
 * doc_id and then advances the counter.
 */
var save_page = function (URL, title, text) {
    var page_doc = {};
    page_doc.url = URL;
    page_doc.title = title;
    page_doc.text = text;
    page_doc._id = doc_id;
    db.saveDoc(page_doc);
    doc_id += 1;
};
/**
 * Stores one user record (url, name, karma, rating) in `db`.
 * No _id is supplied with the document.
 */
var save_user = function (URL, name, karma, rating) {
    var user_doc = {};
    user_doc.url = URL;
    user_doc.name = name;
    user_doc.karma = karma;
    user_doc.rating = rating;
    db.saveDoc(user_doc);
};
/**
 * Stores the mapping from a site user name to a Twitter account name
 * in `db2`.
 */
var save_twitter = function (user, accname) {
    var link_doc = {};
    link_doc.name = user;
    link_doc.account = accname;
    db2.saveDoc(link_doc);
};
// Page crawling is disabled in this script:
// crawl_page('/people/', target_site, 1);
num_of_streams = 1;

// Queue of row keys from db that still have to be processed by
// getpeople(), which passes each one to db.openDoc().
var people = [];

// Load every row key from db, then start working through the queue.
db.allDocs({
    success: function(docs) {
        // Indexed loop with a local counter; the original for-in used an
        // undeclared (global) loop variable.
        for (var i = 0; i < docs.rows.length; i++) {
            people.push(docs.rows[i].key);
        }
        getpeople();
    },
    error: function() {
        sys.puts('Error getting docs');
    }
});
var querystring = require('querystring'); | |
/**
 * Looks for a Twitter account on a parsed profile page.
 *
 * Every <dl> is inspected; when its first <dt> reads 'Twitter:' the
 * text of the first link inside its first <dd> is taken. If several
 * match, the last one wins.
 *
 * @param parsed   parsed document (or the {} returned on parse failure)
 * @param callback called exactly once with the account name or null
 */
var catchTwitter = function(parsed, callback) {
    var twitter = null;
    // A failed parse yields an object without find(); report "nothing
    // found" instead of throwing.
    var lists = (parsed && parsed.find) ? parsed.find("//dl") : [];
    for (var i = 0; i < lists.length; i++) {
        var entry = lists[i];
        if (!entry.find) {
            continue;
        }
        var terms = entry.find('dt');
        if (terms.length === 0 || terms[0].text() != 'Twitter:') {
            continue;
        }
        var definition = entry.find('dd')[0];
        var anchor = definition ? definition.find('a')[0] : undefined;
        if (anchor) {
            twitter = anchor.text();
        }
    }
    callback(twitter);
};
/**
 * Processes one entry of the `people` queue: loads the user document
 * from db, fetches the root page of the host in the user's URL and, if
 * a Twitter account is found on it, stores it with save_twitter().
 * The next entry is scheduled after settings.crawl_timeout; when the
 * page request does not answer 200 the user is put back on the queue
 * and the next round starts after twice that delay.
 */
var getpeople = function() {
    // Schedules the next queue entry after `delay` milliseconds.
    function nextRound(delay) {
        setTimeout(function() {
            sys.puts('New round!');
            getpeople();
        }, delay);
    }

    // Nothing left: stop instead of asking the database for `undefined`.
    if (people.length === 0) {
        sys.puts('List finished');
        return;
    }
    var habrauser = people.pop();
    db.openDoc(habrauser, {
        success: function(user) {
            sys.puts('habrauser: ' + user.name);
            var url_parts = url.parse(user.url);
            sys.puts('Connecting to ' + url_parts.host);
            var client = http.createClient(80, url_parts.host);
            getPage(url_parts.host, client, function(code, pagetext, headers) {
                sys.puts('Code is ' + code);
                if (code == 200) {
                    var parsed = parsePage(pagetext);
                    // parsePage() returns {} when parsing failed.
                    if (!parsed.find) {
                        nextRound(settings.crawl_timeout);
                        return;
                    }
                    catchTwitter(parsed, function(twiacc) {
                        if (twiacc) {
                            sys.puts('Account found:' + twiacc);
                            save_twitter(user.name, twiacc);
                        }
                        nextRound(settings.crawl_timeout);
                    });
                } else {
                    // NOTE(review): pop() takes the most recently pushed
                    // entry, so a user that never answers 200 is retried
                    // indefinitely.
                    people.push(habrauser);
                    nextRound(settings.crawl_timeout * 2);
                }
            });
        },
        error: function() {
            // NOTE(review): processing stops here; no further round is scheduled.
            sys.puts('Error happened.');
        }
    });
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Returns a new array holding the elements of `arr` without duplicates.
 * Elements are compared with ===. When a value occurs more than once,
 * only its last occurrence is kept, so the result follows the order of
 * those last occurrences.
 */
var unique = function(arr) {
    var total = arr.length;
    var result = [];
    var pos = 0;
    while (pos < total) {
        var repeatedLater = false;
        for (var next = pos + 1; next < total; next++) {
            if (arr[next] === arr[pos]) {
                repeatedLater = true;
                break;
            }
        }
        // Keep an element only when nothing equal follows it.
        if (!repeatedLater) {
            result.push(arr[pos]);
        }
        pos += 1;
    }
    return result;
};
/**
 * Tells whether `val` occurs in `arr`.
 * Despite its name this returns a boolean, not an index. Elements are
 * compared with loose equality (==), so 2 matches '2'.
 */
function indexInArray(arr, val) {
    var pos = 0;
    while (pos < arr.length) {
        if (arr[pos] == val) {
            return true;
        }
        pos += 1;
    }
    return false;
}
/**
 * Extracts the media type from a response header map, i.e. the part of
 * the `content-type` header before the first ';'.
 *
 * @param headers object keyed by lower-case header name
 * @returns the media type, or '' when the header is missing
 */
function get_content_type(headers) {
    var raw = headers ? headers['content-type'] : undefined;
    // A response without the header must not crash the caller, which
    // already treats '' as an acceptable content type.
    if (typeof raw !== 'string') {
        return '';
    }
    return raw.split(';')[0];
}
var libxml = require("libxmljs"), | |
http = require("http"), | |
url = require("url"), | |
settings = require("./settings"), | |
couch = require("./node-couch").CouchDB, | |
sys = require("sys"); | |
// HTTP client for the crawled site (port 80) and the CouchDB database
// that receives the crawled records.
var target_site = http.createClient(80, settings.targethost),
    db = couch.db(settings.couchbase, settings.couchhost);
/**
 * Parses an HTML string with libxml.parseHtmlString.
 * If parsing throws, the failure is logged together with the input and
 * an empty object is returned, so callers can detect failure by the
 * missing `find` method.
 */
var parsePage = function(string) {
    var document = {};
    try {
        document = libxml.parseHtmlString(string);
    } catch (parseError) {
        sys.puts('Cannot parse: ' + string);
    }
    return document;
};
/**
 * Collects the site-internal link targets of a parsed page.
 *
 * Every <a href> is resolved against `baseURL`. Links without a host,
 * or whose host contains settings.targethost, are returned as
 * path + query string; all other links are treated as outbound and
 * skipped.
 *
 * @param parsed_html parsed document exposing find(xpath)
 * @param baseURL     URL the hrefs are resolved against
 * @returns array of path strings
 */
var getLinks = function(parsed_html, baseURL) {
    var links = parsed_html.find('//a');
    var destinations = [];
    // Indexed loop with a local counter: for-in would walk inherited
    // keys too and the undeclared loop variable leaked into the globals.
    for (var i = 0; i < links.length; i++) {
        var attr = links[i].attr('href');
        if (!attr || !attr.value) {
            continue;
        }
        var url_parts = url.parse(url.resolve(baseURL, attr.value()));
        var internal = !url_parts.hostname ||
            url_parts.hostname.indexOf(settings.targethost) > -1;
        if (!internal) {
            continue;
        }
        var destination = url_parts.pathname;
        if (url_parts.search) {
            destination = destination + url_parts.search;
        }
        destinations.push(destination);
    }
    return destinations;
};
/**
 * Extracts the users listed in the rating table of a parsed page and
 * stores each of them through save_user(url, name, karma, rating).
 *
 * The table is looked up as //div[@id='rate-table-wrap']/table. In each
 * row, cell 2 holds the profile <dl>, cell 3 the karma and cell 4 the
 * rating. Rows with fewer than five cells (e.g. header rows) are logged
 * as 'Broken user' and skipped.
 */
var getUsers = function(parsed_html) {
    sys.puts('Getting users table');
    var usertable = parsed_html.find("//div[@id='rate-table-wrap']/table");
    sys.puts('Got users table');
    // A page without the table must not crash the crawler.
    if (!usertable || !usertable[0] || !usertable[0].find) {
        return;
    }
    sys.puts('Getting user rows');
    var users = usertable[0].find('tr');
    sys.puts('Got user rows');
    // Indexed loop with a local counter; the original for-in used an
    // undeclared (global) loop variable.
    for (var i = 0; i < users.length; i++) {
        var row = users[i];
        var cells = row.find ? row.find('td') : [];
        if (cells.length < 5) {
            sys.puts('Broken user');
            continue;
        }
        var profile_dl = cells[2].find('dl')[0];
        var karma = cells[3].text();
        var rating = cells[4].text();
        var profile_link = 'none';
        if (profile_dl && profile_dl.find) {
            var anchor = profile_dl.find('dt')[0].find('a')[0];
            profile_link = String(anchor.attr('href').value());
            save_user(profile_link, anchor.text(), karma, rating);
        }
        sys.puts('User URL: ' + profile_link);
    }
};
/**
 * Issues a GET for `URL` over `connection`, sending
 * settings.targethost as the Host header, and buffers the response.
 * Once the response ends, `callback(statusCode, bodyText, headers)` is
 * invoked.
 */
var getPage = function(URL, connection, callback) {
    var requestHeaders = {"host": settings.targethost};
    var request = connection.request("GET", URL, requestHeaders);

    function onResponse(response) {
        var body = '';
        response.setEncoding("utf8");
        response.addListener("data", function (chunk) {
            body = body + chunk;
        });
        response.addListener('end', function () {
            callback(response.statusCode, body, response.headers);
        });
    }

    request.addListener('response', onResponse);
    request.end();
};
/**
 * Removes every <script> and <style> node from the parsed document
 * (mutating it) and returns the text of /html/body.
 * When the body is missing, 'Body is empty?' is logged and '' returned.
 */
var cleanPage = function(parsed_html) {
    var unwanted = ['//script', '//style'];
    for (var x = 0; x < unwanted.length; x++) {
        var nodes = parsed_html.find(unwanted[x]);
        // Indexed loop with local counters instead of for-in over an
        // array with undeclared (global) loop variables.
        for (var n = 0; n < nodes.length; n++) {
            nodes[n].remove();
        }
    }
    var body = parsed_html.get('/html/body');
    if (body && body.text) {
        return body.text();
    }
    sys.puts('Body is empty?');
    return '';
};
/**
 * Returns the text of the document's //head/title element, or '' when
 * the page has no title (instead of throwing on the missing node).
 */
var pageTitle = function(parsed_html) {
    var title = parsed_html.get('//head/title');
    if (!title || !title.text) {
        return '';
    }
    return title.text();
};
// Crawl state shared by the functions below.
var known_pages = [];     // every path discovered so far
var visited_pages = [];   // paths already handed out for crawling
var num_of_streams = 0;   // number of crawl streams started

/**
 * Returns the first known page that has not been visited yet and marks
 * it as visited. Falsy entries in known_pages are skipped.
 * When no unvisited page is left the whole process is terminated.
 */
var get_next_page = function() {
    // Indexed loop with a local counter; the original for-in used an
    // undeclared (global) loop variable.
    for (var i = 0; i < known_pages.length; i++) {
        var candidate = known_pages[i];
        if (candidate && !indexInArray(visited_pages, candidate)) {
            visited_pages.push(candidate);
            return candidate;
        }
    }
    process.exit(); // End of list
};
/**
 * Fetches `URL` over `connection`, processes the answer and schedules
 * the next page for this stream after settings.crawl_timeout.
 *
 * - 200 with an HTML/plain/empty content type: the page is parsed and
 *   its users are extracted via getUsers().
 * - 301 / 303: the redirect location is added to known_pages.
 * - 404 is ignored; 400 and any other code are logged.
 *
 * If fewer than settings.max_streams streams are running and unvisited
 * pages exist, an additional stream with its own connection is started.
 *
 * @param URL        path to request
 * @param connection HTTP client used for the request
 * @param stream_id  number used only in log output
 */
var crawl_page = function (URL, connection, stream_id) {
    sys.puts('Stream ' + stream_id + ' visiting ' + URL);
    getPage(URL, connection, function(code, text, headers) {
        sys.puts('Got ' + code + ' answer from ' + URL);
        var links = [];
        if (code == 200) {
            var content_type = get_content_type(headers);
            if (content_type == 'text/html' || content_type == 'text/plain' || content_type == '') {
                // Local variable: it goes out of scope with this callback,
                // so the former `delete parsed_page` (a SyntaxError in
                // strict code) is not needed.
                var parsed_page = parsePage(text);
                if (parsed_page.find) {
                    // title is only needed by the disabled save_page() call.
                    var title = pageTitle(parsed_page);
                    getUsers(parsed_page);
                } else {
                    sys.puts('Bad parsed page: ' + URL);
                }
            } else {
                // Was `content-type`, i.e. the subtraction content - type,
                // which threw a ReferenceError.
                sys.puts('Strange content type: ' + content_type);
            }
        } else if (code == 301 || code == 303) {
            // Return redirect location to known pages
            links = [headers.location];
        } else if (code == 404) {
            // Do nothing, maybe add some sort of log entry
        } else if (code == 400) {
            sys.puts('Bad request: ' + URL);
        } else {
            sys.puts('Unknown code: ' + code + '\nHeaders is: ' + JSON.stringify(headers));
        }
        known_pages = unique(known_pages.concat(links));
        setTimeout(function() {
            crawl_page(get_next_page(), connection, stream_id);
        }, settings.crawl_timeout);
        // Create new stream if available and have unvisited pages
        if (num_of_streams < settings.max_streams && known_pages.length > visited_pages.length) {
            num_of_streams++;
            var new_connection = http.createClient(80, settings.targethost);
            crawl_page(get_next_page(), new_connection, num_of_streams);
            sys.puts('Starting another stream: ' + num_of_streams + ' of ' + settings.max_streams);
        }
    });
};
// Running counter used as the _id of the next saved page document.
var doc_id = 1;

/**
 * Stores one crawled page (url, title, text) in `db` under the current
 * doc_id and then advances the counter.
 */
var save_page = function (URL, title, text) {
    var page_doc = {};
    page_doc.url = URL;
    page_doc.title = title;
    page_doc.text = text;
    page_doc._id = doc_id;
    db.saveDoc(page_doc);
    doc_id += 1;
};
/**
 * Stores one user record (url, name, karma, rating) in `db`.
 * No _id is supplied with the document.
 */
var save_user = function (URL, name, karma, rating) {
    var user_doc = {};
    user_doc.url = URL;
    user_doc.name = name;
    user_doc.karma = karma;
    user_doc.rating = rating;
    db.saveDoc(user_doc);
};
// Start the first stream on the people index, then queue the listing
// pages /people/page1/ .. /people/page99/ for the crawler to pick up.
crawl_page('/people/', target_site, 1);
var i = 1;
while (i < 100) {
    known_pages.push('/people/page' + i + '/');
    i += 1;
}
// One stream is running from here on.
num_of_streams = 1;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment