Skip to content

Instantly share code, notes, and snippets.

@monkeym4ster
Created January 15, 2016 03:54
Show Gist options
  • Save monkeym4ster/6206802401c32c13ebc9 to your computer and use it in GitHub Desktop.
Save monkeym4ster/6206802401c32c13ebc9 to your computer and use it in GitHub Desktop.
Simple crawler for HTML links and JavaScript files
request = require('request')
cheerio = require('cheerio')
async = require('async')
# Page to crawl: the first command-line argument, or
# http://127.0.0.1:3000/ when no argument is supplied.
URL = process.argv[2] || 'http://127.0.0.1:3000/'
# Fetch `url`, parse the returned HTML, and print every distinct script
# source and anchor target it references, as absolute URLs.
#
# url - String address of the page to fetch.
#
# Output goes to stdout: one 'crawler' line for the page itself, then one
# 'script' line per distinct <script src> and one 'link' line per distinct
# <a href>. A failed request rethrows the error from inside the request
# callback. Returns whatever `request` returns.
crawler = (url) ->
  # The file-level `URL` constant shadows the global WHATWG URL class, so the
  # class is taken from the core `url` module under a different name.
  WhatwgURL = require('url').URL

  console.log 'crawler', url
  request url, (err, res, body) ->
    if err
      throw err

    # Resolve references against the address recorded on the response's
    # request object, falling back to the requested URL if it is missing.
    base = res.request.uri.href or url

    # Turn a raw href/src value into an absolute URL string, or return false
    # when the reference should not be reported.
    format = (link) ->
      # A bare '/' (the site root) is deliberately not reported.
      return false if link is '/'
      try
        resolved = new WhatwgURL(link, base)
      catch parseError
        # Unparseable references are skipped instead of aborting the crawl.
        return false
      # The parser lower-cases the scheme, so this also catches 'JavaScript:'.
      return false if resolved.protocol is 'javascript:'
      resolved.href

    $ = cheerio.load(body)

    # Print `label` and the absolute URL for each distinct `attribute` value
    # found on the elements matching `selector`.
    report = (selector, attribute, label) ->
      # Prototype-less object used as a set of already printed URLs.
      seen = Object.create(null)
      $(selector).each (index, element) ->
        raw = $(element).attr(attribute)
        return unless raw
        absolute = format(raw)
        return unless absolute
        return if absolute of seen
        seen[absolute] = true
        console.log label, absolute
        # Bare return on purpose: cheerio stops iterating when the callback
        # returns false, so the implicit return value must not leak out.
        return
      return

    report 'script', 'src', 'script'
    report 'a', 'href', 'link'
    return
# Kick off the crawl of the configured start page.
crawler(URL)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment