daithiw44/NodeJS web scraper for Anagrams

## NodeJS web scraper for Anagrams
//Web Scraper that scrapes a web page (without permission I may add).
//This is only a simple Node.js test: no error handling, etc.
//Returns JSON OR XML
//Format : localhost:'PORT'/scrape/'WORD'/'FORMAT'
//Example CURL
//JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
//XML - curl -X GET http://localhost:3000/scrape/fundamental/xml

var express = require('express'),
	request = require('request'),
	jsdom = require('jsdom'),
	builder = require('xmlbuilder'),
	sys = require('sys');

//Create the Express application via the createServer() factory.
var app = express.createServer();

//Development environment: placeholder callback, nothing is configured.
app.configure('development', function() {});

//Production environment: register Express's errorHandler middleware,
//built from an empty options object.
app.configure('production', function() {
	var errorMiddleware = express.errorHandler({});
	app.use(errorMiddleware);
});

//Route: GET /scrape/:id/:format
//  :id     - the word to look up anagrams for
//  :format - 'xml' selects an XML response; any other value yields JSON
//Responds 200 with the scraped anagrams, or 502 (text/plain) when the
//upstream page could not be fetched.
app.get('/scrape/:id/:format', function(req, res) {
	//Get the word
	var word = req.params.id, formatType = req.params.format;

	//Collect the trimmed text nodes that follow `node` in the DOM.
	//At most count * 2 siblings are visited (presumably each word is
	//followed by one separator element - TODO confirm against the page).
	//Returns an array of strings; empty when `node` is missing or count is NaN.
	function collectAnagrams(node, count) {
		var words = [], limit = count * 2, j = 0;
		var el = node ? node.nextSibling : null;
		//Stop when the sibling list runs out instead of dereferencing null.
		while (el && j < limit) {
			if (el.nodeType === 3 && el.nodeValue.length > 1) {
				words.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
			}
			//NOTE(review): DOM tagName is normally upper-case for HTML
			//elements, so this sentinel may never match - confirm.
			if (el.tagName === 'bottomlinks') { break; }
			j++;
			el = el.nextSibling;
		}
		return words;
	}

	//Build the XML payload: <anagrams> containing one <anagram> per word.
	function getXMLResults(words) {
		var root = builder.begin('anagrams'), item, k;
		for (k = 0; k < words.length; k++) {
			item = root.ele('anagram');
			item.txt(words[k]);
		}
		root.up();
		return {'output' : builder.toString(), 'ctype': 'text/xml'};
	}

	//Build the JSON payload: {"anagrams": [...]}.
	function getJSONResults(words) {
		var anagObj = {};
		anagObj['anagrams'] = words;
		return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
	}

	//Hit the website. The word is URL-encoded so characters such as '&'
	//or '#' cannot alter the upstream query string.
	request({uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'}, function(error, response, body) {
		var b, doc, handleResults, handleResultsOutput, text, i, count = 0;

		//Always answer the client, even when the upstream fetch fails.
		if (error || response.statusCode !== 200) {
			res.writeHead(502, {'Content-Type': 'text/plain'});
			res.end('Unable to fetch anagrams');
			return;
		}

		doc = jsdom.jsdom(body, null, {
			features: {
				FetchExternalResources: ['script'],
				ProcessExternalResources: false
			}
		});

		//determine the formatType;
		handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;

		//I know where the words are in the DOM: a <b> whose text mentions
		//"found" starts with the result count, and the words follow the
		//third <b> element (b[2]). The last matching <b> wins.
		b = doc.getElementsByTagName('b');
		for (i = 0; i < b.length; i++) {
			text = b[i].innerHTML;
			if (text.search(/found/i) !== -1) {
				count = parseInt(text, 10);
			}
		}

		//With no "found" heading count stays 0, so an empty result set is
		//sent rather than crashing on an undefined output object.
		handleResultsOutput = handleResults(collectAnagrams(b[2], count));

		//Send it out.
		res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
		res.end(handleResultsOutput.output);
	});

});

//Resolve the port once so the log line reports the port actually used
//(previously it always printed 3000, even when PORT was set).
var port = process.env.PORT || 3000;
app.listen(port);
console.log('server started port: ' + port);
	//Web Scraper that scrapes a web page (without permission I may add).
	//This is only a simple Node.js test: no error handling, etc.
	//Returns JSON OR XML
	//Format : localhost:'PORT'/scrape/'WORD'/'FORMAT'
	//Example CURL
	//JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
	//XML - curl -X GET http://localhost:3000/scrape/fundamental/xml

	var express = require('express'),
	request = require('request'),
	jsdom = require('jsdom'),
	builder = require('xmlbuilder'),
	sys = require('sys');

	//Create the Express application via the createServer() factory.
	var app = express.createServer();

	//Development environment: placeholder callback. The body is a comment;
	//its asterisks had been stripped, leaving a stray regex literal.
	app.configure('development', function() {/*whatever you want*/});

	//Production environment: register Express's errorHandler middleware,
	//built from an empty options object.
	app.configure('production', function() {
		app.use(express.errorHandler({
		}));
	});

	//Route: GET /scrape/:id/:format
	//  :id     - the word to look up anagrams for
	//  :format - 'xml' selects an XML response; any other value yields JSON
	//Responds 200 with the scraped anagrams, or 502 (text/plain) when the
	//upstream page could not be fetched.
	app.get('/scrape/:id/:format', function(req, res) {
		//Get the word
		var word = req.params.id, formatType = req.params.format;

		//Collect the trimmed text nodes that follow `node` in the DOM.
		//At most count * 2 siblings are visited (presumably each word is
		//followed by one separator element - TODO confirm against the page).
		//Returns an array of strings; empty when `node` is missing or count is NaN.
		function collectAnagrams(node, count) {
			var words = [], limit = count * 2, j = 0;
			var el = node ? node.nextSibling : null;
			//Stop when the sibling list runs out instead of dereferencing null.
			while (el && j < limit) {
				if (el.nodeType === 3 && el.nodeValue.length > 1) {
					//Trim leading/trailing whitespace (the alternation had
					//been corrupted into an escaped literal pipe).
					words.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
				}
				//NOTE(review): DOM tagName is normally upper-case for HTML
				//elements, so this sentinel may never match - confirm.
				if (el.tagName === 'bottomlinks') { break; }
				j++;
				el = el.nextSibling;
			}
			return words;
		}

		//Build the XML payload: <anagrams> containing one <anagram> per word.
		function getXMLResults(words) {
			var root = builder.begin('anagrams'), item, k;
			for (k = 0; k < words.length; k++) {
				item = root.ele('anagram');
				item.txt(words[k]);
			}
			root.up();
			return {'output' : builder.toString(), 'ctype': 'text/xml'};
		}

		//Build the JSON payload: {"anagrams": [...]}.
		function getJSONResults(words) {
			var anagObj = {};
			anagObj['anagrams'] = words;
			return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
		}

		//Hit the website. The word is URL-encoded so characters such as '&'
		//or '#' cannot alter the upstream query string.
		request({uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'}, function(error, response, body) {
			var b, doc, handleResults, handleResultsOutput, text, i, count = 0;

			//Always answer the client, even when the upstream fetch fails.
			if (error || response.statusCode !== 200) {
				res.writeHead(502, {'Content-Type': 'text/plain'});
				res.end('Unable to fetch anagrams');
				return;
			}

			doc = jsdom.jsdom(body, null, {
				features: {
					FetchExternalResources: ['script'],
					ProcessExternalResources: false
				}
			});

			//determine the formatType;
			handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;

			//I know where the words are in the DOM: a <b> whose text mentions
			//"found" starts with the result count, and the words follow the
			//third <b> element (b[2]). The last matching <b> wins.
			b = doc.getElementsByTagName('b');
			for (i = 0; i < b.length; i++) {
				text = b[i].innerHTML;
				if (text.search(/found/i) !== -1) {
					count = parseInt(text, 10);
				}
			}

			//With no "found" heading count stays 0, so an empty result set is
			//sent rather than crashing on an undefined output object.
			handleResultsOutput = handleResults(collectAnagrams(b[2], count));

			//Send it out.
			res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
			res.end(handleResultsOutput.output);
		});

	});

	//Resolve the port once so the log line reports the port actually used.
	//The escaped `\|\|` (a syntax error) is restored to the `||` operator.
	var port = process.env.PORT || 3000;
	app.listen(port);
	console.log('server started port: ' + port);