Skip to content

Instantly share code, notes, and snippets.

@kurtroberts
Created July 7, 2015 19:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kurtroberts/c7ffb93b0822c07508f3 to your computer and use it in GitHub Desktop.
Save kurtroberts/c7ffb93b0822c07508f3 to your computer and use it in GitHub Desktop.
Crawl a domain and create a tab-separated text file with some basic SEO evaluation (URL, title and meta description of each page)
// Crawler setup: load dependencies, parse the command-line options, open the
// output file and build the URL pattern that selects which pages are handled.
var huntsman = require('huntsman'),
    spider = huntsman.spider(),
    fs = require('fs'),
    // Both options are mandatory: --domain/-d and --startUrl/-s.
    opts = require('nomnom')
        .option('domain', {
            abbr: 'd',
            help: 'Domain to scan.',
            required: true
        })
        .option('startUrl', {
            abbr: 's',
            help: 'Url to start at.',
            required: true
        })
        .parse(),
    // Results are written to "<domain>.txt", relative to the working directory.
    out = fs.createWriteStream(opts.domain + '.txt'), //TODO: convert to using Mongo backend
    // Escape every regex metacharacter in the domain so that its dots (and any
    // other special characters) match literally. The earlier
    // `replace('.', '\.')` had no effect: '\.' inside a string literal is just
    // '.', and a string pattern only replaces the first occurrence anyway.
    escapedDomain = opts.domain.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
    // Matches any http/https URL on exactly this domain, followed by a path.
    eventExpr = new RegExp('^https?://' + escapedDomain + '/.*');
// Extensions are loaded by name, in this order:
//   recurse - follows anchor links found on each page
//   cheerio - provides jquery-style selectors for the page body
spider.extensions = [ 'recurse', 'cheerio' ].map(function ( extensionName ) {
    return huntsman.extension( extensionName );
});
// Show the URL pattern that decides which pages get processed.
console.log('Using expression: %s', String(eventExpr));
// Handle every crawled page whose uri matches eventExpr: pull out the title
// and meta description and append one tab-separated row to the output file.
spider.on( eventExpr , function ( err, res ){
    // A failed request may not carry a usable response; report it and move on
    // instead of crashing on `res.uri`.
    if ( err || !res ) {
        console.error('Request failed: %s', err && err.message ? err.message : err);
        return;
    }
    console.log('Found page: %s', res.uri);
    // use jquery-style selectors & functions
    var $ = res.extension && res.extension.cheerio;
    if ( !$ ) {
        console.log('NOT HTML: ', res.uri);
        return; // content is not html
    }
    // Normalise a field for the tab-separated output: a missing value becomes
    // an empty string (not the text "undefined"), and runs of whitespace
    // (including tabs and newlines) collapse to one space so a value can never
    // split a row or a column.
    var clean = function ( value ) {
        return String( value == null ? '' : value ).replace(/\s+/g, ' ').trim();
    };
    // extract information from page body
    var metadata = {
        uri: res.uri,
        // Only the first <title> is used; inline SVG can contain its own
        // <title> elements whose text would otherwise be concatenated.
        title: clean( $('title').first().text() ),
        description: clean( $('meta[name=description]').attr('content') )
    };
    console.log( metadata );
    // One row per page: uri <TAB> title <TAB> description, CRLF terminated.
    out.write([ metadata.uri, metadata.title, metadata.description ].join("\t") + "\r\n");
});
// Seed the crawl queue with the user-supplied start url, then begin crawling.
var firstUrl = opts.startUrl;
spider.queue.add( firstUrl );
spider.start();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment