Skip to content

Instantly share code, notes, and snippets.

@keenwon
Last active August 29, 2015 14:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save keenwon/cf6efc6e8d928b66ac63 to your computer and use it in GitHub Desktop.
Save keenwon/cf6efc6e8d928b66ac63 to your computer and use it in GitHub Desktop.
Node.js 实现的简易网页抓取程序，支持代理（可抓取墙外的资源）
// Core modules used by the crawler, plus the path of the file that collects
// the matched links.
var fs = require('fs'),
path = require('path'),
filePath = path.join(__dirname, 'log.txt'), // crawl results are appended to log.txt next to this script
http = require('http');
// Crawl state shared by the functions below. `search`, `next` and `run` are
// only declared here; each one is assigned its function further down.
var page = 1; // start page number
var maxPage = 100; // end page number
var search;
var next;
var run;
/**
 * Fetch the listing page numbered by the shared `page` counter through the
 * local HTTP proxy, append every link whose title matches to the log file,
 * then continue with the following page via `next()`.
 *
 * A failed request is retried a bounded number of times with a short pause;
 * once the retries are used up the page is skipped so the crawl can go on.
 *
 * @param {number} [attempt] Number of failed attempts already made for the
 *     current page. Callers normally omit it; it is used for internal retries.
 */
search = function(attempt) {
    var MAX_RETRIES = 5; // failed attempts tolerated per page before skipping it
    var RETRY_DELAY_MS = 1000; // pause between attempts so a dead proxy is not hammered
    var tries = attempt || 0;
    var settled = false; // guards against handling both a request and a response error
    var opt = {
        host: '127.0.0.1', // proxy IP
        port: '1080', // proxy port
        method: 'GET',
        path: 'http://xxxxxx.com/?page=' + page, // absolute URL of the target page, sent to the proxy
        headers: {
            'Cookie': 'language=cn_CN;' // send the language cookie with the request
        }
    };
    var body = '';

    // Log the failure, then either retry the same page or give up on it.
    function fail(e) {
        if (settled) {
            return;
        }
        settled = true;
        console.log("Got error: " + e.message);
        if (tries < MAX_RETRIES) {
            setTimeout(function() {
                search(tries + 1);
            }, RETRY_DELAY_MS);
            return;
        }
        console.log('Giving up on page ' + page);
        next();
    }

    // Run the request.
    var req = http.request(opt, function(res) {
        // Decode as a stream: appending raw Buffer chunks would corrupt a
        // multibyte character that happens to be split across two chunks.
        res.setEncoding('utf8');
        res.on('data', function(chunk) {
            body += chunk;
        }).on('end', function() {
            if (settled) {
                return;
            }
            settled = true;
            // [^"]* keeps each capture inside its own attribute, so several
            // anchors on one line are matched individually.
            var linkPattern = /<a href="([^"]*)" title="([^"]*)">/ig;
            var match;
            while ((match = linkPattern.exec(body)) !== null) {
                // Keep only links whose title matches.
                if (/测试/.test(match[2])) {
                    fs.appendFileSync(filePath, match[1] + '【' + match[2] + '】' + '\r\n');
                }
            }
            console.log(page);
            next();
        }).on('error', fail);
    }).on('error', fail);
    req.end();
};
/**
 * Advance the shared `page` counter and fetch that page with `search()`,
 * stopping once the counter has moved past `maxPage`.
 */
next = function() {
    page++;
    // maxPage is the end page and must itself be fetched, so stop only after it.
    if (page > maxPage) {
        return;
    }
    search();
};
/**
 * Entry point: announce the start of the crawl and fetch the first page.
 */
run = function() {
    console.log('start');
    search();
};

// Start crawling as soon as the script is loaded.
run();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment