Skip to content

Instantly share code, notes, and snippets.

@ahmednuaman
Last active December 26, 2015 01:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ahmednuaman/7074040 to your computer and use it in GitHub Desktop.
Save ahmednuaman/7074040 to your computer and use it in GitHub Desktop.
A script that indexes your single-page application (in this case one using AngularJS and RequireJS, hence the removal of script and style tags from the `head`) and creates `.html` files of the generated code.
# PhantomJS modules: file-system access, CLI arguments, and a headless page.
fs = require('fs')
system = require('system')
page = require('webpage').create()

# First command-line argument after the script name; only links that start
# with this prefix are followed by the crawler.
base = system.args[1]

# Addresses that have already been opened, and addresses still waiting.
[indexed, queue] = [[], []]
getHTML = () ->
  # Handed to page.evaluate, so it only touches `document` and its own locals.
  # Strips every <script> and <style> found under <head>, then returns the
  # inner HTML of the root element.
  removable = document.querySelectorAll 'head script, head style'
  idx = removable.length - 1
  while idx >= 0
    node = removable[idx]
    node.parentNode.removeChild node
    idx -= 1
  document.documentElement.innerHTML
findLinks = () ->
  # Handed to page.evaluate. Collects the resolved `href` of every anchor that
  # carries an href attribute; the result is in reverse document order.
  anchors = document.querySelectorAll 'a[href]'
  hrefs = Array::map.call anchors, (anchor) -> anchor.href
  hrefs.reverse()
index = (address) ->
  # Capture the cleaned markup first, then collect the page's link targets.
  html = page.evaluate getHTML
  found = page.evaluate findLinks

  # Visit the collected links from last to first, queueing each one that
  # lives under `base` and has been neither opened nor queued already.
  for href in found.slice().reverse()
    continue unless href.indexOf(base) is 0
    queue.push href if href not in indexed and href not in queue

  # Persist this page's snapshot before moving on.
  save address, html

  # Continue with the most recently queued address, or stop when none remain.
  if queue.length then next queue.pop() else phantom.exit()
next = (address) ->
  # Opens `address` in the shared page and, once it has loaded, schedules
  # `index` to snapshot it. A failed load is reported and skipped instead of
  # being snapshotted as though it were a real page.
  page.open address, (status) ->
    # Record the address whatever the outcome so it is never queued again.
    indexed.push address

    if status is 'success'
      # Wait three seconds before indexing — presumably to give the
      # single-page app time to render; confirm the delay suits your app.
      setTimeout index, 3000, address
    else
      console.log "Failed to load #{address} (status: #{status}); skipping"
      # Keep the crawl moving: take the next queued address or finish.
      if queue.length then next queue.pop() else phantom.exit()
save = (address, html) ->
  # Writes `html` to `<path>/index.html`, where <path> is `address` with the
  # `base` prefix removed. The base page itself maps to `index.html` in the
  # working directory.
  #
  # Leading slashes are stripped from the path: when `base` is given without
  # a trailing slash the remainder starts with '/', which would otherwise
  # send makeTree/write to the filesystem root instead of the working
  # directory.
  name = address.replace(base, '').replace(/^\/+/, '')

  if name
    # Create the directory chain for this page before writing into it.
    fs.makeTree name
    name += '/' if name.charAt(name.length - 1) isnt '/'

  fs.write name + 'index.html', html, 'w'
# Entry point: start the crawl at the base URL read from the command line.
next base
@ahmednuaman
Copy link
Author

Run like this: phantomjs crawler.coffee 'http://localhost:8000/'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment