Node Crawler to find all domain links on a site and run a function on them

Linked to from http://lukecod.es/2012/11/18/random-problem-of-the-night/

What

This is a node.js crawler that walks an entire site (using the crawl module) to find all of its internal links. It then tests each unique internal link for the presence of an optional path string and parses each matching link's query string into an object. Values that share a key across different query strings are pushed into a single array for that key.
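
The aggregation step on its own looks roughly like this minimal sketch (a standalone illustration using only Node built-ins and made-up query strings; the actual app.js below uses the qs, ent, and underscore modules and takes its links from a real crawl):

var qs = require('querystring');

var results = {};

// Two hypothetical query strings, as they might appear on crawled links
['?tumor=CLL&drug=GA101', '?tumor=DLBCL&drug=GA101'].forEach(function (search) {
    var params = qs.parse(search.slice(1));
    Object.keys(params).forEach(function (key) {
        var value = String(params[key]).toLowerCase();
        // Create the array the first time we see a key, then push only unseen values
        results[key] = results[key] || [];
        if (results[key].indexOf(value) === -1) {
            results[key].push(value);
        }
    });
});

console.log(results);
// { tumor: [ 'cll', 'dlbcl' ], drug: [ 'ga101' ] }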

Usage

  • npm install
  • node app.js http://site-to-crawl.com /only/return/links/containing/this/path

Example Output

{
  'a': [
    'x',
    'y',
    'z'
  ],
  'b': [
    'c',
    'd'
  ],
  'z': [
    '1'
  ]
}
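
(Hypothetically, output like the above would come from links whose query strings were ?a=x&b=c, ?a=y&b=d, and ?a=z&z=1; note that every value ends up as a lowercased string.)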

How to get all possible clinical trial query parameters from biooncology.com

node app.js http://www.biooncology.com /clinical-trials

Latest Output (11/18/12)

{
    "tumor": [
        "breast cancer",
        "cll",
        "dlbcl",
        "fnhl",
        "colorectal cancer",
        "gastric cancer",
        "glioblastoma",
        "lung cancer",
        "melanoma",
        "ovarian cancer",
        "multiple myeloma",
        "pancreatic cancer",
        "other tumor types",
        "renal cell carcinoma",
        "colon cancer",
        "liver cancer"
    ],
    "drug": [
        "pi3k inhibitor (gdc-0941)",
        "pi3k/mtor inhibitor (gdc-0980)",
        "obinutuzumab (ga101)",
        "onartuzumab (metmab)",
        "mek inhibitor (gdc-0973)",
        "akt inhibitor (gdc-0068)",
        "anti-egfl7",
        "dulanermin"
    ]
}
.gitignore

node_modules/
*.log

app.js

/*global console process require */

var crawler = require('./node_modules/crawl/lib/crawler'),
    _ = require('underscore'),
    url = require('url'),
    qs = require('qs'),
    ent = require('ent'),
    startUrl = process.argv[2],
    urlPath = process.argv[3] || '',
    parsedStartUrl = url.parse(startUrl);

crawler.crawl(startUrl, { headers: false, body: false }, function (err, pages) {
    if (err) {
        console.log("An error occurred: " + err);
        process.exit(1);
    }

    var // An array of unique urls within the site, falsy values removed
        allLinks = _.uniq(_.compact(_.flatten(_.pluck(pages, 'links')))),
        // Internal links: check if a link contains our original host or is relative;
        // if it is relative, prepend 'protocol//host'
        internalLinks = _.map(allLinks, function (link) {
            link = link.split('#')[0];
            if (link.indexOf(parsedStartUrl.host) > -1) {
                return link;
            } else {
                return link.charAt(0) === '/' ? parsedStartUrl.protocol + '//' + parsedStartUrl.host + link : '';
            }
        }),
        // Remove falsy values and make unique again to account for relative links that are now absolute
        uniqueLinks = _.uniq(_.compact(internalLinks)),
        alreadyViewedQueryStrings = [],
        results = {};

    _.each(uniqueLinks, function (link) {
        // The query string (minus the ?)
        var queryString = (url.parse(link).search || '').slice(1);

        /* Continue only if:
         * - our link contains our passed-in path
         * - our link has a query string
         * - we haven't seen this query string before
         */
        if (link.indexOf(urlPath) > -1 && queryString && !_.contains(alreadyViewedQueryStrings, queryString)) {
            alreadyViewedQueryStrings.push(queryString);

            // Decode HTML entities, turn +'s into spaces, pass to decodeURIComponent, then parse to an object with qs
            queryString = qs.parse(decodeURIComponent(ent.decode(queryString).replace(/\+/g, ' ')));

            /* For each key:value pair of the query string we either create a new array with the value
             * (if we haven't seen that key before) or push to the existing array.
             * Also, always make the value lowercase.
             */
            _.each(queryString, function (value, key) {
                if (typeof results[key] === 'undefined') {
                    results[key] = [value.toLowerCase()];
                } else {
                    results[key].push(value.toLowerCase());
                }
            });

            // Make each key only contain unique values in its array
            _.each(results, function (value, key) {
                results[key] = _.uniq(results[key]);
            });
        }
    });

    console.log(JSON.stringify(results, null, 4));
});
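
For reference, the script assumes crawl's callback hands back an array of page objects that each carry a links array, roughly shaped like this (a sketch inferred from the _.pluck(pages, 'links') call above, not taken from crawl's documentation):

[
    { url: 'http://site-to-crawl.com/', links: ['/clinical-trials?tumor=CLL&drug=GA101', '#top', 'http://othersite.com/'] },
    { url: 'http://site-to-crawl.com/about', links: [] }
]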

package.json

{
    "name": "crawling-link-matcher",
    "version": "0.1.0",
    "dependencies": {
        "crawl": "0.1.0",
        "qs": "0.5.2",
        "underscore": "1.4.2",
        "ent": "0.0.4"
    }
}