SEO Site Crawler
/**
* @author Eric Tucker <tucker.ericm@gmail.com>
* Written for Node.js
* Requires PhantomJS and Nightmare.js
*
* Accessible through command line as:
* node seo-scraper.js www.YOUR-WEBSITE.com
*
*/
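/**
 * Setup sketch (package names and versions here are an assumption, not pinned
 * by the author; Nightmare 1.x is the PhantomJS-backed generation of the
 * library, matching the PhantomJS requirement above):
 *
 *   npm install nightmare@1 phantomjs
 */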
var Nightmare = require('nightmare');
var nightmare = new Nightmare();
var fs = require('fs');
// This is our start page; prepend http:// when no protocol was supplied
var siteRoot = process.argv[2].indexOf('://') === -1 ? 'http://' + process.argv[2] : process.argv[2];
console.log(siteRoot);
// This is going to be our pages CSV - we set the headers here
var pageStream = fs.createWriteStream('pages.csv', {flags: 'a', encoding: 'utf8'});
pageStream.write('"Page","Title","Keywords","Description","H1","H2","H3","H4","H5"\n');
// This is going to be all anchors across all pages
var anchorStream = fs.createWriteStream('anchors.csv', {flags: 'a', encoding: 'utf8'});
anchorStream.write('"Page","Link","Inner Text/HTML"\n');
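// CSV convention used throughout this script: every field is wrapped in double
// quotes, and a literal double quote inside a field is escaped by doubling it:
//   he said "hi"  ->  "he said ""hi"""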
var pagesCrawled = [];
var pagesNotCrawled = [];
// Crawl the current page and extract anchors, heading tags, and meta info.
// Nightmare serializes this function and runs it inside the page, so
// `document` here refers to the page being crawled.
function crawlPage(site) {
    var anchorTags = document.getElementsByTagName('a');
    var anchors = [];
    for (var i = 0; i < anchorTags.length; i++) {
        var innerText = anchorTags[i].innerText.replace(/\r?\n|\r/g, '').trim();
        var row = {url: anchorTags[i].href.trim()};
        // Prefer the visible text; fall back to the inner HTML with
        // whitespace collapsed. Double quotes are doubled for CSV output.
        row.inner = innerText != '' ? innerText.replace(/"/gm, '""') : anchorTags[i].innerHTML.trim().replace(/\s\s|\t|\r?\n|\r/gm, ' ').replace(/"/gm, '""');
        anchors.push(row);
    }
    var hTags = {
        h1: [],
        h2: [],
        h3: [],
        h4: [],
        h5: []
    };
    for (var tag in hTags) {
        var tagCollection = document.getElementsByTagName(tag);
        if (tagCollection.length != 0) {
            for (var t = 0; t < tagCollection.length; t++) {
                hTags[tag].push(tagCollection[t].innerHTML.trim().replace(/\s\s/gm, '').replace(/"/gm, '""'));
            }
        } else {
            hTags[tag].push('**NONE**');
        }
    }
    // Meta tags may be absent; fall back to a marker so the CSV row stays aligned
    var keywordsTag = document.getElementsByName('keywords')[0];
    var descriptionTag = document.getElementsByName('description')[0];
    return {
        url: document.URL.replace(site, ''),
        title: document.title,
        keywords: keywordsTag ? keywordsTag.getAttribute('content') : '**NONE**',
        description: descriptionTag ? descriptionTag.getAttribute('content') : '**NONE**',
        anchors: anchors,
        hTags: hTags
    };
}
Array.prototype.getUnique = function() {
    var u = [];
    for (var i = 0; i < this.length; i++) {
        if (
            // Standard check for uniqueness
            u.indexOf(this[i]) === -1
            // Get rid of blanks, file downloads, and in-page fragments
            && this[i] != ''
            && this[i].indexOf('.pdf') === -1
            && this[i].indexOf('#') === -1
        ) {
            u.push(this[i]);
        }
    }
    return u;
};
Array.prototype.removeArray = function(arrayToRemove) {
    for (var i = 0; i < arrayToRemove.length; i++) {
        var index = this.indexOf(arrayToRemove[i]);
        if (index != -1) {
            this.splice(index, 1);
        }
    }
    return this;
};
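// Illustration (hypothetical values) of how the two helpers maintain the
// crawl frontier:
//   ['/about', '/about', '', '/report.pdf', '#top'].getUnique()  -> ['/about']
//   ['/about', '/contact'].removeArray(['/about'])               -> ['/contact']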
// Set our evaluate function. Nightmare 1.x's evaluate() takes the function to
// run in the page context, a callback that receives its return value, and any
// extra arguments to pass through (here the site root).
Nightmare.prototype.seoScrape = function() {
    return this.evaluate(crawlPage, function(value) {
        var colDelim = '","';
        console.log('crawling ' + value.url);
        if (pagesCrawled.indexOf(value.url) === -1) {
            pagesCrawled.push(value.url);
        }
        for (var i = 0; i < value.anchors.length; i++) {
            // Only queue links that stay on our start domain
            if (value.anchors[i].url.indexOf(siteRoot) != -1) {
                // Remove trailing slash for our crawling purposes (won't affect output files)
                if (value.anchors[i].url.length > 1 && value.anchors[i].url[value.anchors[i].url.length - 1] == '/') {
                    value.anchors[i].url = value.anchors[i].url.substring(0, value.anchors[i].url.length - 1);
                }
                pagesNotCrawled.push(value.anchors[i].url.replace(siteRoot, ''));
            }
            // Add the anchor tag info to our CSV
            anchorStream.write('"' + value.url + colDelim + value.anchors[i].url + colDelim + value.anchors[i].inner + '"\n');
        }
        pageStream.write(
            '"' + value.url + '","'
            + value.title + '","'
            + value.keywords + '","'
            + value.description + '","'
            + value.hTags.h1.join(' | ') + '","'
            + value.hTags.h2.join(' | ') + '","'
            + value.hTags.h3.join(' | ') + '","'
            + value.hTags.h4.join(' | ') + '","'
            + value.hTags.h5.join(' | ') + '"\n'
        );
        // Update our crawled and uncrawled pages:
        // dedupe the frontier, then drop anything already visited
        pagesNotCrawled = pagesNotCrawled.getUnique();
        pagesNotCrawled = pagesNotCrawled.removeArray(pagesCrawled);
    }, siteRoot).run(function(err, nightmare) {
        // Keep crawling until the frontier is empty, one page per pass
        if (pagesNotCrawled.length != 0) {
            console.log('Navigating to ' + pagesNotCrawled[0]);
            pagesCrawled.push(pagesNotCrawled[0]);
            nightmare.goto(siteRoot + pagesNotCrawled[0]).seoScrape();
        } else {
            console.log('Your Site Has Been Crawled!');
        }
    });
};
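// Note (an assumption about the 1.x API): Nightmare 1.x queues actions such as
// goto() and evaluate() and only executes them when run() is called, which is
// why the two statements below can be issued back-to-back; seoScrape() ends in
// the run() call that flushes the whole queue.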
nightmare.goto(siteRoot);
nightmare.seoScrape();
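// Example invocation (hypothetical domain):
//   node seo-scraper.js www.example.com
// The script logs the resolved root, then one "crawling ..." line per page,
// and finishes with "Your Site Has Been Crawled!". Results accumulate in
// pages.csv and anchors.csv in the working directory.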