Created
July 7, 2015 19:06
-
-
Save kurtroberts/c7ffb93b0822c07508f3 to your computer and use it in GitHub Desktop.
Crawl a site and create a tab-separated text file with a basic SEO evaluation (URL, title, and meta description of each page)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Crawl a single domain and record basic SEO metadata for every HTML page.
 *
 * Command-line options (parsed with nomnom):
 *   -d, --domain    Domain to scan; only URIs on this host are handled.
 *   -s, --startUrl  Url the crawl starts at.
 *
 * Output: one line per page in "<domain>.txt", formatted as
 *   uri <TAB> title <TAB> description <CR><LF>
 */
var huntsman = require('huntsman'),
    spider = huntsman.spider(),
    fs = require('fs'),
    opts = require('nomnom')
        .option('domain', {
            abbr: 'd',
            help: 'Domain to scan.',
            required: true
        })
        .option('startUrl', {
            abbr: 's',
            help: 'Url to start at.',
            required: true
        })
        .parse(),
    out = fs.createWriteStream(opts.domain + '.txt'), //TODO: convert to using Mongo backend
    // The domain is escaped so that its dots match literally instead of
    // acting as regex wildcards (e.g. "example.com" must not match "exampleXcom").
    eventExpr = new RegExp('^https?://' + escapeRegExp(opts.domain) + '/.*');

/**
 * Escape every regular-expression metacharacter in a string so it can be
 * embedded in a RegExp and matched literally.
 *
 * @param {string} text - Raw text, e.g. a domain name.
 * @returns {string} The text with metacharacters backslash-escaped.
 */
function escapeRegExp(text) {
    return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Turn a scraped value into a single tab-separated field: a missing value
 * becomes an empty string, and embedded tabs / line breaks are replaced by a
 * space so each page stays on exactly one output row.
 *
 * @param {string|undefined|null} value - Scraped text, possibly absent.
 * @returns {string} A trimmed, single-line string.
 */
function toField(value) {
    if (value === undefined || value === null) {
        return '';
    }
    return String(value).replace(/[\t\r\n]+/g, ' ').trim();
}

spider.extensions = [
    huntsman.extension( 'recurse' ), // load recurse extension & follow anchor links
    huntsman.extension( 'cheerio' ) // load cheerio extension
];

console.log('Using expression: %s', eventExpr.toString());

// follow pages which match this uri regex
spider.on( eventExpr , function ( err, res ){
    // Without a usable response there is nothing to inspect; report and skip
    // instead of throwing on `res.uri`.
    if ( err || !res ) {
        console.error('Error fetching page: %s', err);
        return;
    }

    console.log('Found page: %s', res.uri);

    // use jquery-style selectors & functions
    var $ = res.extension.cheerio;
    if ( !$ ) {
        console.log('NOT HTML: ', res.uri);
        return; // content is not html
    }

    // extract information from page body
    var metadata = {
        uri: res.uri,
        title: toField($('title').text()),
        // attr() yields undefined when the meta tag is absent; toField maps
        // that to '' so the literal text "undefined" is never written.
        description: toField($('meta[name=description]').attr('content'))
    };

    console.log( metadata );
    out.write(metadata.uri + "\t" + metadata.title + "\t" + metadata.description + "\r\n");
});

spider.queue.add( opts.startUrl );
spider.start();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment