Skip to content

Instantly share code, notes, and snippets.

@vadimii
Created March 13, 2014 21:59
Show Gist options
  • Save vadimii/9537916 to your computer and use it in GitHub Desktop.
Save vadimii/9537916 to your computer and use it in GitHub Desktop.
utkonos.ru crawler
var Crawler = require('crawler').Crawler
var fs = require('fs')
// Origin of the site being crawled; catalogue paths found in links are appended to it before queueing.
var base = 'http://www.utkonos.ru'
// Entry page that seeds the crawl.
var start = base + '/'
// URLs already handed to the crawler, stored as object keys so a page is never queued twice.
var queued = {}
// Number of pages processed so far; incremented in statstep and shown in the progress log.
var crawledCount = 0
// Number of pages queued but not yet processed; shown next to crawledCount in the progress log.
var queuedCount = 0
/**
 * Rewrite a catalogue link to the address of that catalogue's first page.
 *
 * Both "/cat/<id>" and "/cat/catalogue/<id>" (each with an optional trailing
 * slash) become "/cat/catalogue/<id>/page/1". Anything else, including a
 * missing href, is returned untouched.
 *
 * @param {string|undefined} url - Site-relative link taken from an anchor.
 * @returns {string|undefined} The canonical first-page path, or the input as is.
 */
var canonical = function (url) {
  var firstPage = '/cat/catalogue/$1/page/1'
  var shortForms = [
    /^\/cat\/(\d+)\/?$/,
    /^\/cat\/catalogue\/(\d+)\/?$/
  ]
  for (var i = 0; i < shortForms.length; i += 1) {
    var pattern = shortForms[i]
    if (pattern.test(url)) {
      return url.replace(pattern, firstPage)
    }
  }
  return url
}
/**
 * Tell whether a link points at a paginated catalogue listing,
 * i.e. has the exact form "/cat/catalogue/<id>/page/<n>".
 *
 * @param {string|undefined|null} url - Site-relative link, possibly missing.
 * @returns {boolean} true only for a catalogue page path; false otherwise,
 *   including for a missing or empty link.
 */
var isTarget = function (url) {
  var tt = /^\/cat\/catalogue\/\d+\/page\/\d+$/
  // Coerce the guard so the predicate always yields a real boolean instead of
  // leaking the falsy argument itself (undefined, null or '').
  return Boolean(url) && tt.test(url)
}
/**
 * Write a crawled page into ./downloads under a file name derived from its URL.
 *
 * Slashes, commas and colons in the URL become underscores, runs of
 * underscores are collapsed into one, a trailing underscore is dropped and
 * ".html" is appended. The write is asynchronous; a failure is logged and
 * does not interrupt the crawl. The ./downloads directory is not created
 * here and must already exist.
 *
 * @param {string} url - Address of the page; used only to build the file name.
 * @param {*} content - Data passed to fs.writeFile unchanged.
 */
var saveToDisk = function (url, content) {
  var fname = url.replace(/[\/,:]/g, '_')
  // Collapse runs first and trim afterwards: in the opposite order a URL that
  // ends in several separators keeps a stray trailing underscore.
  fname = fname.replace(/_+/g, '_')
  fname = fname.replace(/_$/, '')
  var target = './downloads/' + fname + '.html'
  // fs.writeFile requires a completion callback (Node.js 10+ throws a
  // TypeError without one), and it is the only place a write error surfaces.
  fs.writeFile(target, content, function (error) {
    if (error) console.log('Failed to save ' + target + ': ' + error.message)
  })
}
/**
 * Record one processed page and print a progress line for it.
 *
 * The line starts with a newline and the text of the page's first <h1>,
 * followed by the number that ends the URL (when there is one) and the
 * "[crawled:queued]" counters. As a side effect crawledCount is incremented
 * and queuedCount is decremented.
 *
 * @param {string|undefined} url - Address of the page that was just fetched.
 * @param {Function} $ - jQuery-style selector function bound to the page.
 */
var statstep = function (url, $) {
  var pieces = ['\n' + $('h1:first').text()]
  var trailingNumber = url && url.match(/\/(\d+)$/)
  if (trailingNumber && trailingNumber.length > 1) {
    pieces.push(' ' + trailingNumber[1])
  }
  crawledCount += 1
  queuedCount -= 1
  pieces.push(' [' + crawledCount + ':' + queuedCount + ']')
  console.log(pieces.join(''))
}
/**
 * The crawler instance that drives the whole run.
 *
 * For every fetched page the callback prints a progress line, stores the body
 * on disk, then scans all anchors: each href is canonicalised, and every
 * catalogue page path not seen before is marked in `queued`, queued for
 * download (prefixed with `base`) and counted in `queuedCount`.
 */
var crawler = new Crawler({
  maxConnections: 2,
  callback: function (error, result, $) {
    if (error) {
      // A failed request has left the queue as well; without this the pending
      // figure in the progress log grows by one for every failure.
      queuedCount -= 1
      console.log(error)
      return
    }
    statstep(result.uri, $)
    saveToDisk(result.uri, result.body)
    $('a').each(function (index, a) {
      var $a = $(a)
      var url = canonical($a.attr('href'))
      // Only paginated catalogue listings are followed.
      if (!isTarget(url)) return
      // Skip pages that were already queued earlier in the run.
      if (queued[url]) return
      queued[url] = true
      crawler.queue(base + url)
      queuedCount += 1
      console.log('-> ' + $a.text())
    })
  }
})
// Seed the crawl: mark the start page as known, count it as pending, then queue it.
// Note: this key is the absolute start URL, whereas links discovered later are
// recorded in `queued` by their site-relative path.
queued[start] = true
queuedCount += 1
crawler.queue(start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment