Last active
August 29, 2015 14:17
-
-
Save keenwon/cf6efc6e8d928b66ac63 to your computer and use it in GitHub Desktop.
Node.js 实现的简易网页抓取程序,支持代理(抓取墙外的资源)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Module dependencies and the output location. Trailing table-extraction
// residue that made this declaration unparsable has been removed.
var fs = require('fs'),
    path = require('path'),
    filePath = path.join(__dirname, 'log.txt'), // scrape results are written to log.txt
    http = require('http');
// Paging state shared by the functions below, plus forward declarations of
// the three function variables that are assigned later in the file.
var page = 1, // first page number to fetch
    maxPage = 100, // end page number
    search,
    next,
    run;
/**
 * Fetches the listing page for the current `page` number through the proxy,
 * scans the response for `<a href="..." title="...">` tags and appends every
 * link whose title contains the keyword to `filePath`. Calls `next()` once
 * the response has been fully processed.
 *
 * On a request error the message is logged, `page` is decremented and
 * `next()` is called, which re-requests the same page.
 * NOTE(review): that retry has no limit or delay, so a permanently failing
 * proxy makes the script retry forever.
 */
search = function() {
    // The connection goes to the proxy; the absolute URL in `path` names
    // the page that should actually be fetched.
    var opt = {
        host: '127.0.0.1', // proxy IP
        port: '1080', // proxy port
        method: 'GET',
        path: 'http://xxxxxx.com/?page=' + page, // target page URL
        headers: {
            'Cookie': 'language=cn_CN;' // send a cookie in the request headers
        }
    };

    // Perform the fetch.
    var req = http.request(opt, function(res) {
        var body = '';

        // Let the stream decode the bytes so a multi-byte character that is
        // split across two chunks is not corrupted by per-chunk conversion.
        // Assumes the page is UTF-8 encoded -- TODO confirm for the target site.
        res.setEncoding('utf8');

        res.on('data', function(chunk) {
            body += chunk;
        }).on('end', function() {
            // `[^"]*` keeps each capture inside its own attribute value; a
            // greedy `.*` would merge several anchors on one line into one match.
            var reg = /<a href="([^"]*)" title="([^"]*)">/ig;
            var match;

            while ((match = reg.exec(body)) !== null) {
                var url = match[1];
                var title = match[2];

                // Keep only links whose title contains the keyword.
                if (/测试/.test(title)) {
                    fs.appendFileSync(filePath, url + '【' + title + '】' + '\r\n');
                }
            }

            console.log(page);
            next();
        });
    }).on('error', function(e) {
        console.log("Got error: " + e.message);
        // Undo the increment that next() is about to apply so the same page
        // is requested again.
        page--;
        next();
    });

    req.end();
};
/**
 * Advances `page` by one and starts fetching it with `search()`; stops the
 * crawl once `page` has reached `maxPage`.
 *
 * NOTE(review): because the check is `page >= maxPage`, the page whose number
 * equals `maxPage` is never fetched -- confirm whether the end page is meant
 * to be inclusive.
 */
next = function() {
    page++;
    if (page >= maxPage) {
        return;
    }
    search();
};
/**
 * Entry point: logs a start marker and kicks off the crawl at the current
 * `page`.
 */
run = function() {
    console.log('start');
    search();
};

// Start crawling as soon as the script is loaded.
run();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment