Skip to content

Instantly share code, notes, and snippets.

@akrawchyk
Last active June 20, 2016 18:40
Show Gist options
  • Save akrawchyk/0e87ee70e1ddc1bbeacb7836f76ef44c to your computer and use it in GitHub Desktop.
Save akrawchyk/0e87ee70e1ddc1bbeacb7836f76ef44c to your computer and use it in GitHub Desktop.
Streams HTML from phantomjs after document load as ndjson
/**
* Crawler
* Opens each URL in PhantomJS and, once the document has loaded, writes the
* page's HTML to a readable stream as newline-delimited JSON (ndjson).
*/
'use strict'
const ndjson = require('ndjson')
const phantom = require('phantom')
const Readable = require('readable-stream').Readable
// Shared output stream. Data is pushed into it explicitly with rs.push(),
// so the pull-side read hook has nothing to do.
const rs = new Readable({
  read: function read() {},
})
/**
 * Loads a single URL in a fresh PhantomJS process and pushes the resulting
 * document onto the shared `rs` stream as one ndjson line of the form
 * `{"url": ..., "response": ...}`.
 *
 * Failures (PhantomJS could not be started, the page did not load, ...) are
 * written to stderr instead of being propagated, so a caller chaining several
 * workers keeps going after a bad URL. No line is pushed for a failed URL.
 *
 * @param {string} url - Address handed to `page.open`.
 * @returns {Promise<void>} Settles once the URL has been handled or has failed.
 */
function worker(url) {
  let phInstance
  let sitepage
  return phantom.create(['--ignore-ssl-errors=yes'/*, '--load-images=no'*/])
    .then((instance) => {
      // Remember the instance before touching it, so the catch handler can
      // still shut the process down if anything below throws.
      phInstance = instance
      instance.process.stderr.pipe(process.stderr)
      return instance.createPage()
    })
    .then((page) => {
      sitepage = page
      return page.open(url)
    })
    .then((status) => {
      // Anything other than 'success' means the page did not load; treat it
      // as an error rather than emitting the content of a failed load.
      if (status !== 'success') {
        throw new Error(`failed to open ${url} (status: ${status})`)
      }
      return sitepage.property('content')
    })
    .then((content) => {
      sitepage.close()
      phInstance.exit()
      rs.push(`${JSON.stringify({ url: url, response: content })}\n`)
    })
    .catch((err) => {
      const message = err instanceof Error ? err.message : String(err)
      // Newline-terminated so consecutive failures do not run together
      process.stderr.write(`${message}\n`)
      // phInstance is still undefined when phantom.create() itself rejected
      if (phInstance) {
        phInstance.exit()
      }
    })
}
/**
 * Works through `urls` strictly one at a time: the first entry is handed to
 * `worker`, and only after that promise fulfils is the remainder of the list
 * processed by a recursive call. Does nothing for an empty list.
 *
 * @param {string[]} urls - Remaining addresses to crawl, in order.
 * @returns {undefined} Nothing is returned; progress is not observable here.
 */
function spawn(urls) {
  // TODO queue up spawns for worker pool w/max worker count
  const hasWork = urls.length > 0
  if (!hasWork) {
    return
  }

  const current = urls[0]
  worker(current).then(() => {
    const remaining = urls.slice(1)
    spawn(remaining)
  })
}
/**
 * Entry point: kicks off crawling of the given URL(s) and returns the stream
 * the results are pushed onto.
 *
 * @param {string|string[]} [urls=[]] - One address or a list of addresses;
 *   a non-array value is wrapped in a single-element list.
 * @param {Object} [options={}] - Accepted but not read by this function.
 * @returns {Readable} The module-level `rs` stream (the same object on every call).
 */
function run(urls = [], options = {}) {
  let queue = urls
  if (!Array.isArray(queue)) {
    queue = [queue]
  }

  spawn(queue)
  return rs
}
// Example invocation: crawl two sites, parse the ndjson stream back into
// objects and dump every record (or parser error) to the console.
const crawled = run([
  'https://serviceyr.org',
  'https://google.com',
])
const records = crawled.pipe(ndjson.parse())

records.on('error', (err) => {
  console.dir(err)
})
records.on('data', (obj) => {
  console.dir(obj)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment