Skip to content

Instantly share code, notes, and snippets.

@mohamedmansour
Created December 6, 2017 04:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mohamedmansour/02f5c356ab1bda68637249708defe591 to your computer and use it in GitHub Desktop.
Save mohamedmansour/02f5c356ab1bda68637249708defe591 to your computer and use it in GitHub Desktop.
Node Crawler
const cheerio = require('cheerio')
const request = require('request')
const { URL } = require('url')
const async = require('async')
// Map passed to fetchAllLinks as its `linkMap`; its keys are the URLs printed below.
const links = {}

// Crawl outward from the seed page; once the crawl reports completion,
// print every collected URL as a bracketed list of double-quoted strings.
fetchAllLinks('https://crawl.codeferret.net/1.html', links, function printCollectedLinks() {
  const quoted = []
  for (const href of Object.keys(links)) {
    quoted.push('"' + href + '"')
  }
  const separator = ',\n '
  console.log('[\n ' + quoted.join(separator) + '\n ]')
})
/**
 * Parse `input` (optionally resolved against `base`) into a URL the crawler can fetch.
 *
 * @param {string} input - Absolute URL, or an href to resolve against `base`.
 * @param {string} [base] - Absolute URL used to resolve a relative `input`.
 * @returns {URL|null} The parsed URL, or null when `input` is malformed or not http(s).
 */
function parseCrawlableUrl(input, base) {
  let parsed
  try {
    parsed = new URL(input, base)
  } catch (err) {
    // `new URL` throws on malformed input; one bad href must not abort the whole crawl.
    return null
  }
  // Skip mailto:, javascript:, tel:, etc. — only http(s) pages are crawled.
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return null
  }
  return parsed
}

/**
 * Recursively crawl `urlString` and every page reachable from it through `<a href>` links.
 *
 * Each page that answers with HTTP 200 is recorded as a key of `linkMap` (value `true`).
 * Malformed URLs, non-http(s) URLs, failed requests and already-seen pages are skipped.
 *
 * @param {string} urlString - Absolute URL of the page to crawl.
 * @param {Object<string, boolean>} linkMap - Visited set, mutated in place; keyed by normalized URL.
 * @param {Function} done - Called once, with no arguments, when this page and all of its
 *   descendants have been processed (or skipped).
 */
function fetchAllLinks(urlString, linkMap, done) {
  const url = urlString ? parseCrawlableUrl(urlString) : null
  if (!url || linkMap[url.href]) {
    done()
    return
  }

  // Claim the URL before the request goes out: links are followed in parallel, so
  // marking it only after the response would let several branches fetch the same page.
  linkMap[url.href] = true

  request(url.href, (error, response, body) => {
    if (error || !response || response.statusCode !== 200) {
      // Release the claim so that linkMap ends up holding only pages fetched successfully.
      delete linkMap[url.href]
      done()
      return
    }

    const $ = cheerio.load(body)
    const hrefs = $('a[href]').toArray().map(node => $(node).attr('href'))

    // async.each invokes its final callback right away for an empty list,
    // so a page without links needs no special case.
    async.each(hrefs, (href, callback) => {
      // Resolve relative hrefs against the page they were found on.
      const target = href ? parseCrawlableUrl(href, url.href) : null
      if (!target) {
        callback()
        return
      }
      fetchAllLinks(target.href, linkMap, callback)
    }, () => {
      done()
    })
  })
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment