SEO Site Crawler
/**
* @author Eric Tucker <tucker.ericm@gmail.com>
* Written for Node.js
* Requires PhantomJS and Nightmare.js
*
* Accessible through command line as:
* node seo-scraper.js www.YOUR-WEBSITE.com
*
*/
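/**
 * Setup sketch (package names and versions here are an assumption, not pinned
 * by the author; Nightmare 1.x is the PhantomJS-backed generation of the
 * library, matching the PhantomJS requirement above):
 *
 *   npm install nightmare@1 phantomjs
 */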
var Nightmare = require('nightmare');
var nightmare = new Nightmare();
var fs = require('fs');
// This is our start page; prepend http:// when no protocol was supplied
var siteRoot = process.argv[2].indexOf('://') === -1 ? 'http://' + process.argv[2] : process.argv[2];
console.log(siteRoot);
// This is going to be our pages CSV - we set the headers here
var pageStream = fs.createWriteStream('pages.csv', {flags: 'a', encoding: 'utf8'});
pageStream.write('"Page","Title","Keywords","Description","H1","H2","H3","H4","H5"\n');
// This is going to be all anchors across all pages
var anchorStream = fs.createWriteStream('anchors.csv', {flags: 'a', encoding: 'utf8'});
anchorStream.write('"Page","Link","Inner Text/HTML"\n');
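// CSV convention used throughout this script: every field is wrapped in double
// quotes, and a literal double quote inside a field is escaped by doubling it:
//   he said "hi"  ->  "he said ""hi"""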
var pagesCrawled = [];
var pagesNotCrawled = [];
// Crawl the current page and extract anchors, heading tags, and meta info.
// Nightmare serializes this function and runs it inside the page, so
// `document` here refers to the page being crawled.
function crawlPage(site) {
    var anchorTags = document.getElementsByTagName('a');
    var anchors = [];
    for (var i = 0; i < anchorTags.length; i++) {
        var innerText = anchorTags[i].innerText.replace(/\r?\n|\r/g, '').trim();
        var row = {url: anchorTags[i].href.trim()};
        // Prefer the visible text; fall back to the inner HTML with
        // whitespace collapsed. Double quotes are doubled for CSV output.
        row.inner = innerText != '' ? innerText.replace(/"/gm, '""') : anchorTags[i].innerHTML.trim().replace(/\s\s|\t|\r?\n|\r/gm, ' ').replace(/"/gm, '""');
        anchors.push(row);
    }
    var hTags = {
        h1: [],
        h2: [],
        h3: [],
        h4: [],
        h5: []
    };
    for (var tag in hTags) {
        var tagCollection = document.getElementsByTagName(tag);
        if (tagCollection.length != 0) {
            for (var t = 0; t < tagCollection.length; t++) {
                hTags[tag].push(tagCollection[t].innerHTML.trim().replace(/\s\s/gm, '').replace(/"/gm, '""'));
            }
        } else {
            hTags[tag].push('**NONE**');
        }
    }
    // Meta tags may be absent; fall back to a marker so the CSV row stays aligned
    var keywordsTag = document.getElementsByName('keywords')[0];
    var descriptionTag = document.getElementsByName('description')[0];
    return {
        url: document.URL.replace(site, ''),
        title: document.title,
        keywords: keywordsTag ? keywordsTag.getAttribute('content') : '**NONE**',
        description: descriptionTag ? descriptionTag.getAttribute('content') : '**NONE**',
        anchors: anchors,
        hTags: hTags
    };
}
Array.prototype.getUnique = function() {
    var u = [];
    for (var i = 0; i < this.length; i++) {
        if (
            // Standard check for uniqueness
            u.indexOf(this[i]) === -1
            // Get rid of blanks, file downloads, and in-page fragments
            && this[i] != ''
            && this[i].indexOf('.pdf') === -1
            && this[i].indexOf('#') === -1
        ) {
            u.push(this[i]);
        }
    }
    return u;
};
Array.prototype.removeArray = function(arrayToRemove) {
    for (var i = 0; i < arrayToRemove.length; i++) {
        var index = this.indexOf(arrayToRemove[i]);
        if (index != -1) {
            this.splice(index, 1);
        }
    }
    return this;
};
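// Illustration (hypothetical values) of how the two helpers maintain the
// crawl frontier:
//   ['/about', '/about', '', '/report.pdf', '#top'].getUnique()  -> ['/about']
//   ['/about', '/contact'].removeArray(['/about'])               -> ['/contact']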
// Set our evaluate function. Nightmare 1.x's evaluate() takes the function to
// run in the page context, a callback that receives its return value, and any
// extra arguments to pass through (here the site root).
Nightmare.prototype.seoScrape = function() {
    return this.evaluate(crawlPage, function(value) {
        var colDelim = '","';
        console.log('crawling ' + value.url);
        if (pagesCrawled.indexOf(value.url) === -1) {
            pagesCrawled.push(value.url);
        }
        for (var i = 0; i < value.anchors.length; i++) {
            // Only queue links that stay on our start domain
            if (value.anchors[i].url.indexOf(siteRoot) != -1) {
                // Remove trailing slash for our crawling purposes (won't affect output files)
                if (value.anchors[i].url.length > 1 && value.anchors[i].url[value.anchors[i].url.length - 1] == '/') {
                    value.anchors[i].url = value.anchors[i].url.substring(0, value.anchors[i].url.length - 1);
                }
                pagesNotCrawled.push(value.anchors[i].url.replace(siteRoot, ''));
            }
            // Add the anchor tag info to our CSV
            anchorStream.write('"' + value.url + colDelim + value.anchors[i].url + colDelim + value.anchors[i].inner + '"\n');
        }
        pageStream.write(
            '"' + value.url + '","'
            + value.title + '","'
            + value.keywords + '","'
            + value.description + '","'
            + value.hTags.h1.join(' | ') + '","'
            + value.hTags.h2.join(' | ') + '","'
            + value.hTags.h3.join(' | ') + '","'
            + value.hTags.h4.join(' | ') + '","'
            + value.hTags.h5.join(' | ') + '"\n'
        );
        // Update our crawled and uncrawled pages:
        // dedupe the frontier, then drop anything already visited
        pagesNotCrawled = pagesNotCrawled.getUnique();
        pagesNotCrawled = pagesNotCrawled.removeArray(pagesCrawled);
    }, siteRoot).run(function(err, nightmare) {
        // Keep crawling until the frontier is empty, one page per pass
        if (pagesNotCrawled.length != 0) {
            console.log('Navigating to ' + pagesNotCrawled[0]);
            pagesCrawled.push(pagesNotCrawled[0]);
            nightmare.goto(siteRoot + pagesNotCrawled[0]).seoScrape();
        } else {
            console.log('Your Site Has Been Crawled!');
        }
    });
};
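// Note (an assumption about the 1.x API): Nightmare 1.x queues actions such as
// goto() and evaluate() and only executes them when run() is called, which is
// why the two statements below can be issued back-to-back; seoScrape() ends in
// the run() call that flushes the whole queue.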
nightmare.goto(siteRoot);
nightmare.seoScrape();
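// Example invocation (hypothetical domain):
//   node seo-scraper.js www.example.com
// The script logs the resolved root, then one "crawling ..." line per page,
// and finishes with "Your Site Has Been Crawled!". Results accumulate in
// pages.csv and anchors.csv in the working directory.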