Create a gist now

Instantly share code, notes, and snippets.

@boxedfish /ps.js
Last active Dec 28, 2015

I love nodeJS and have played with crawling in nodeJS before, but ditched it for PHP and used DOMDocument for the actual HTML processing. These tests were to see if I could beat some times and examples given to me by Phil Sturgeon. What I realised was that I have worked out a lot about node in recent years from my experiences, but there is plenty to …
// Benchmark: request every standings page, parse it with cheerio, walk the
// standings table and print how long after start-up each page finished.
var start = +new Date();
var request = require('request');
var cheerio = require('cheerio');
var Seq = require('seq');
var total_page = 200;
var page_array = [];
var page = 1;
// Column titles of the standings table, indexed by <td> position.
var header = ['', 'Rank', 'Team', 'Name', 'Point', 'Total'];
console.log("Page number, Time taken");
while (page <= total_page) {
    page_array.push('http://fantasy.premierleague.com/my-leagues/303/standings/?ls-page=' + page);
    page++;
}
Seq(page_array)
    .forEach(function (url, i) {
        var that = this;
        // `i` (the index handed to this callback) is already captured by the
        // closure, so no extra wrapper function is needed to bind it.
        request({url: url, maxSockets: 8 /* tweak this for max performance */ }, function (error, response, body) {
            if (error) {
                // Without a body there is nothing to parse; report and skip
                // instead of handing undefined to cheerio.
                console.error(i + ", request failed: " + error.message);
                return;
            }
            // Local to this response so nothing leaks into the global scope.
            var $ = cheerio.load(body);
            $('.ismStandingsTable').find('tr').each(function (index, elem) {
                $(this).find('td').each(function (head) {
                    if (head === 2) {
                        // Deliberately no output: only the traversal is being
                        // timed. Log header[head] + ' : ' + $(this).text()
                        // here to inspect the cell.
                    }
                });
            });
            var end = +new Date();
            console.log(i + ", " + (end - start) / 1000);
        });
        // Called as soon as the request has been issued, not when it completes.
        that();
    });
// Benchmark: fire a request for every standings page from a plain loop,
// parse each response with cheerio and print the elapsed time per page.
var start = +new Date();
var request = require('request');
var cheerio = require('cheerio');
var total_page = 200;
var page = 1;
// Column titles of the standings table, indexed by <td> position.
var header = ['', 'Rank', 'Team', 'Name', 'Point', 'Total'];
console.log("Page number, Time taken");
while (page <= total_page) {
    var url = 'http://fantasy.premierleague.com/my-leagues/303/standings/?ls-page=' + page;
    // `page` keeps changing while responses are still in flight, so its
    // current value is frozen into `i` by the immediately-invoked wrapper.
    request({url: url, maxSockets: 8 /* tweak this for max performance */ }, (function (i) {
        return function (error, response, body) {
            if (error) {
                // Without a body there is nothing to parse; report and skip
                // instead of handing undefined to cheerio.
                console.error(i + ", request failed: " + error.message);
                return;
            }
            // Local to this response so nothing leaks into the global scope.
            var $ = cheerio.load(body);
            $('.ismStandingsTable').find('tr').each(function (index, elem) {
                $(this).find('td').each(function (head) {
                    if (head === 2) {
                        // Deliberately no output: only the traversal is being
                        // timed. Log header[head] + ' : ' + $(this).text()
                        // here to inspect the cell.
                    }
                });
            });
            var end = +new Date();
            console.log(i + ", " + (end - start) / 1000);
        };
    })(page)); //bind everything with page number
    page++;
}
// Benchmark in two stages: first download every standings page with the core
// http module, then parse all collected pages with cheerio and print the
// elapsed time per page.
var start = +new Date();
var http = require('http');
var cheerio = require('cheerio');
var Seq = require('seq');
var Seq2 = require('seq');
var total_page = 200;
var page_array = [];
var page = 1;
// Column titles of the standings table, indexed by <td> position.
var header = ['', 'Rank', 'Team', 'Name', 'Point', 'Total'];
// Downloaded pages as {index, html}. Responses arrive in any order, so the
// originating index is stored with the markup instead of relying on position.
var data = [];
console.log("Page number, Time taken");
http.globalAgent.maxSockets = 8;
while (page <= total_page) {
    page_array.push('http://fantasy.premierleague.com/my-leagues/303/standings/?ls-page=' + page);
    page++;
}
Seq(page_array)
    .parEach(function (url, i) {
        var that = this;
        http.get(url, function (res) {
            var pageData = "";
            res.setEncoding('utf8');
            res.on('data', function (chunk) {
                pageData += chunk;
            });
            res.on('end', function () {
                data.push({index: i, html: pageData});
                console.log(i);
                that();
            });
        }).on('error', function (e) {
            console.log("Got error: " + e.message);
            // Still report this item as finished; otherwise the parse stage
            // below would never start after a single failed download.
            that();
        });
    })
    .seq(function () {
        var that = this;
        Seq2(data)
            .parEach(function (d) {
                var that2 = this;
                // Local to this page so nothing leaks into the global scope.
                var $ = cheerio.load(d.html);
                $('.ismStandingsTable').find('tr').each(function (index, elem) {
                    $(this).find('td').each(function (head) {
                        if (head === 2) {
                            // Deliberately no output: only the traversal is
                            // being timed. Log header[head] + ' : ' +
                            // $(this).text() here to inspect the cell.
                        }
                    });
                });
                var end = +new Date();
                console.log(d.index + ", " + (end - start) / 1000);
                that2();
            })
            .seq(function () {
                that();
                this();
            });
    });
// Worker script, invoked as: node child.js <index> <url>
// Fetches one standings page, walks its table and writes a marker to stdout.
var request = require('request');
var cheerio = require('cheerio');
// process.argv[0] is the node executable and process.argv[1] is the path of
// this script; the arguments supplied by the caller start at index 2.
var page = process.argv[2];
var url = process.argv[3];
if (!url) {
    console.error("usage: node child.js <index> <url>");
    process.exit(1);
}
request(url, function (error, response, body) {
    if (error) {
        // Exit non-zero so the parent can tell the fetch failed.
        console.error("page " + page + ": " + error.message);
        process.exit(1);
        return;
    }
    // Local to this response so nothing leaks into the global scope.
    var $ = cheerio.load(body);
    $('.ismStandingsTable').find('tr').each(function (index, elem) {
        $(this).find('td').each(function (head) {
            if (head === 2) {
                // Deliberately no output: only the traversal is being timed.
                // Log $(this).text() here to inspect the cell.
            }
        });
    });
    // Exit only once the write has been handed off, so the marker is not
    // lost when stdout is a pipe.
    process.stdout.write("WRITE DATA", function () {
        process.exit(0);
    });
});
// Benchmark: hand every standings page to its own `node child.js` process
// and print how long after start-up each child finished, plus its stdout.
var start = +new Date();
var request = require('request');
var cheerio = require('cheerio');
var Seq = require('seq');
// Required once up front rather than on every iteration.
var execFile = require('child_process').execFile;
var total_page = 200;
var page_array = [];
var page = 1;
console.log("Page number, Time taken");
while (page <= total_page) {
    page_array.push('http://fantasy.premierleague.com/my-leagues/303/standings/?ls-page=' + page);
    page++;
}
Seq(page_array)
    .parEach(function (url, i) {
        var that = this;
        // Arguments are passed as an array and no shell is involved, so the
        // '?' and '=' in the URL reach the child exactly as written.
        execFile("node", ["child.js", String(i), url], function (err, stdout, stderr) {
            var end = +new Date();
            if (err) {
                console.error(i + ", child failed: " + err.message);
            }
            console.log(i + ", " + (end - start) / 1000);
            console.log(stdout);
            // Report completion only after the child has actually exited.
            that();
        });
    });
ps.js - using Seq and parEach to try to speed things up. Result: fail.
196, 55.163
198, 55.37
199, 55.582
197, 55.735
real 0m55.972s
user 0m50.101s
sys 0m1.520s
ps2.js is the original script, a little slower:
193, 68.154
192, 68.317
200, 68.505
198, 68.672
199, 68.886
real 1m9.103s
user 0m54.261s
sys 0m1.763s
ps3.js is slightly off the wall: I just wanted to see how processing the HTML outside the request would play out. It was not any better; in fact, script one is still better!
195, 62.689
196, 62.827
197, 62.937
198, 63.072
199, 63.202
real 1m3.434s
user 0m30.608s
sys 0m1.027s
ps4.js is our winner by a long way, but it is still nowhere close to Phil's React PHP script!
192, 41.717
WRITE DATA
197, 41.771
WRITE DATA
199, 41.772
WRITE DATA
198, 41.791
WRITE DATA
real 0m41.992s
user 3m30.943s
sys 0m52.061s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment