daithiw44/gist:3032376

## gistfile1.js
// Updated example to utilize 'basic' Domains with express.
// Website scraper that scrapes a site (without permission, I may add).
// I had seen some screen-scraping examples that use jQuery; this example is jQuery-less.
var express = require('express'),
    request = require('request'),
    jsdom = require('jsdom'),
    builder = require('xmlbuilder').create(),
    sys = require('util'),
    createDomain = require('domain').create,
    app = express.createServer();

// Per-request error boundary.
// Runs the rest of the middleware chain inside a fresh domain so that errors
// thrown from asynchronous callbacks further down (e.g. the scrape route's
// request() callback) are reported to this client in the requested format.
app.use(function(req, res, next) {
  var domain = createDomain();
  // Attach the request/response emitters so their 'error' events land here too.
  domain.add(req);
  domain.add(res);
  domain.on('error', function(err) {
    // Would normally be a 500, but we handle it and send out a 200.
    // req.params is read defensively: it may not be set if the error
    // happened before a route matched.
    var format = req.params && req.params.format,
        message = String(err && err.message),
        contentType, formattedMsg;
    if (format === 'json') {
      contentType = 'application/json';
      // JSON.stringify escapes quotes, backslashes and control characters.
      formattedMsg = JSON.stringify({ error: message });
    } else {
      contentType = 'text/xml';
      // Escape XML metacharacters so the message cannot break the document.
      formattedMsg = '<anagrams><error>' +
        message.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;') +
        '</error></anagrams>';
    }
    // Send a real MIME type, not the raw :format path segment.
    writeOut(res, contentType, formattedMsg);
    // dispose() was deprecated and later removed from Node's domain module;
    // call it only where it still exists.
    if (typeof domain.dispose === 'function') {
      domain.dispose();
    }
  });
  // run() enters the domain for the duration of next() and exits afterwards,
  // instead of leaving the domain entered indefinitely.
  domain.run(next);
});

// Send a complete HTTP 200 response.
//   res    - response object; writeHead() and then end() are called on it
//   ctype  - value placed in the Content-Type header
//   output - body handed to res.end()
function writeOut(res, ctype, output) {
  var headers = {};
  headers['Content-Type'] = ctype;
  res.writeHead(200, headers);
  res.end(output);
}

// Build the XML response from the text nodes that follow `node`.
//   node  - DOM node whose following siblings are scanned
//   count - number of results reported by the page; at most count * 2
//           siblings are visited (presumably each anagram text node is
//           paired with a separator element -- TODO confirm against the page)
// Returns { output: <serialized XML string>, ctype: 'text/xml' }.
// Uses the module-level xmlbuilder instance `builder`.
function getXMLResults(node, count) {
  var el = node.nextSibling,
      limit = count * 2,
      j = 0,
      root, text;
  root = builder.begin('anagrams', {
    'version': '1.0',
    'encoding': 'UTF-8',
    'standalone': true
  });
  // Also stop when the siblings run out: the reported count can exceed what
  // is actually in the DOM, and dereferencing null would throw.
  while (el && j < limit) {
    // nodeType 3 is a text node; very short ones are skipped.
    if (el.nodeType === 3 && el.nodeValue.length > 1) {
      text = el.nodeValue.trim();
      // Whitespace-only text carries no anagram, so emit nothing for it.
      if (text) {
        root.ele('anagram').txt(text);
      }
    }
    if (el.tagName === 'bottomlinks') {
      break;
    }
    j++;
    el = el.nextSibling;
  }
  //Uncomment below to throw an Error
  //throw new Error('This is an XML Error');
  return {
    'output': root.doc().toString(),
    'ctype': 'text/xml'
  };
}

// Build the JSON response from the text nodes that follow `node`.
//   node  - DOM node whose following siblings are scanned
//   count - number of results reported by the page; at most count * 2
//           siblings are visited (presumably each anagram text node is
//           paired with a separator element -- TODO confirm against the page)
// Returns { output: '{"anagrams":[...]}', ctype: 'application/json' }.
function getJSONResults(node, count) {
  var el = node.nextSibling,
      limit = count * 2,
      j = 0,
      elArray = [],
      text;
  // Also stop when the siblings run out: the reported count can exceed what
  // is actually in the DOM, and dereferencing null would throw.
  while (el && j < limit) {
    // nodeType 3 is a text node; very short ones are skipped.
    if (el.nodeType === 3 && el.nodeValue.length > 1) {
      text = el.nodeValue.trim();
      // Whitespace-only text carries no anagram, so record nothing for it.
      if (text) {
        elArray.push(text);
      }
    }
    if (el.tagName === 'bottomlinks') {
      break;
    }
    j++;
    el = el.nextSibling;
  }
  //Uncomment below to throw an Error
  //throw new Error('This is an JSON Error');
  return {
    'output': JSON.stringify({ 'anagrams': elArray }),
    'ctype': 'application/json'
  };
}

// GET /scrape/:id/:format
// Fetches the anagram page for :id from wordsmith.org and replies with the
// anagrams as XML when :format is 'xml', or as JSON for any other value.
// Errors thrown inside the request callback are not caught here; they are
// expected to reach the per-request domain 'error' handler installed via app.use.
app.get('/scrape/:id/:format', function(req, res) {
  //Get the word
  var word = req.params.id,
      formatType = req.params.format;
  //Hit the website
  request({
    // Encode the word so spaces, '&', '#', etc. cannot corrupt the query string.
    uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'
  }, function(error, response, body) {
    var b, doc, handleResults, handleResultsOutput, text, i;
    if (error || response.statusCode !== 200) {
      //Real Error so throw.
      throw new Error('Server responded with error, try again later');
    }
    doc = jsdom.jsdom(body, null, {
      features: {
        //	FetchExternalResources   : ['script'],
        ProcessExternalResources: false
      }
    });
    //determine the formatType;
    handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;
    //I know where the words are in the DOM returned from call.
    b = doc.getElementsByTagName('b');
    for (i = 0; i < b.length; i++) {
      text = b[i].innerHTML;
      // The <b> whose text contains "found" supplies the result count.
      if (text.search(/found/i) !== -1) {
        // NOTE(review): the scan always starts from the third <b> (b[2]),
        // not from b[i] -- presumably tied to the page layout; confirm.
        handleResultsOutput = handleResults(b[2], parseInt(text, 10));
      }
    }
    if (!handleResultsOutput) {
      // No "found" marker in the page: fail with a clear message instead of
      // a TypeError on undefined.output.
      throw new Error('No anagram results found in the response');
    }
    // Use the MIME type chosen by the result handler, not the raw :format segment.
    writeOut(res, handleResultsOutput.ctype, handleResultsOutput.output);
  });
});

// Start the HTTP server and announce it on stdout.
var port = 3000;
app.listen(port);
console.log('server started');
// URL format:   http://localhost:3000/scrape/<word to find anagrams of>/<json|xml>
// Example call: http://localhost:3000/scrape/example/json
	// Updated example to utilize 'basic' Domains with express.
	// Website scraper that scrapes a site (without permission, I may add).
	// I had seen some screen-scraping examples that use jQuery; this example is jQuery-less.
	var express = require('express'),
	request = require('request'),
	jsdom = require('jsdom'),
	builder = require('xmlbuilder').create(),
	sys = require('util'),
	createDomain = require('domain').create,
	app = express.createServer();

	// Per-request error boundary.
	// Runs the rest of the middleware chain inside a fresh domain so that errors
	// thrown from asynchronous callbacks further down (e.g. the scrape route's
	// request() callback) are reported to this client in the requested format.
	app.use(function(req, res, next) {
		var domain = createDomain();
		// Attach the request/response emitters so their 'error' events land here too.
		domain.add(req);
		domain.add(res);
		domain.on('error', function(err) {
			// Would normally be a 500, but we handle it and send out a 200.
			// req.params is read defensively: it may not be set if the error
			// happened before a route matched.
			var format = req.params && req.params.format,
				message = String(err && err.message),
				contentType, formattedMsg;
			if (format === 'json') {
				contentType = 'application/json';
				// JSON.stringify escapes quotes, backslashes and control characters.
				formattedMsg = JSON.stringify({ error: message });
			} else {
				contentType = 'text/xml';
				// Escape XML metacharacters so the message cannot break the document.
				formattedMsg = '<anagrams><error>' +
					message.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;') +
					'</error></anagrams>';
			}
			// Send a real MIME type, not the raw :format path segment.
			writeOut(res, contentType, formattedMsg);
			// dispose() was deprecated and later removed from Node's domain module;
			// call it only where it still exists.
			if (typeof domain.dispose === 'function') {
				domain.dispose();
			}
		});
		// run() enters the domain for the duration of next() and exits afterwards,
		// instead of leaving the domain entered indefinitely.
		domain.run(next);
	});

	// Send a complete HTTP 200 response.
	//   res    - response object; writeHead() and then end() are called on it
	//   ctype  - value placed in the Content-Type header
	//   output - body handed to res.end()
	function writeOut(res, ctype, output) {
		var headers = {};
		headers['Content-Type'] = ctype;
		res.writeHead(200, headers);
		res.end(output);
	}

	// Build the XML response from the text nodes that follow `node`.
	//   node  - DOM node whose following siblings are scanned
	//   count - number of results reported by the page; at most count * 2
	//           siblings are visited (presumably each anagram text node is
	//           paired with a separator element -- TODO confirm against the page)
	// Returns { output: <serialized XML string>, ctype: 'text/xml' }.
	// Uses the module-level xmlbuilder instance `builder`.
	function getXMLResults(node, count) {
		var el = node.nextSibling,
			limit = count * 2,
			j = 0,
			root, text;
		root = builder.begin('anagrams', {
			'version': '1.0',
			'encoding': 'UTF-8',
			'standalone': true
		});
		// Also stop when the siblings run out: the reported count can exceed what
		// is actually in the DOM, and dereferencing null would throw.
		while (el && j < limit) {
			// nodeType 3 is a text node; very short ones are skipped.
			if (el.nodeType === 3 && el.nodeValue.length > 1) {
				// trim() strips the surrounding whitespace; the previous pattern
				// contained an escaped '|' and therefore never matched it.
				text = el.nodeValue.trim();
				// Whitespace-only text carries no anagram, so emit nothing for it.
				if (text) {
					root.ele('anagram').txt(text);
				}
			}
			if (el.tagName === 'bottomlinks') {
				break;
			}
			j++;
			el = el.nextSibling;
		}
		//Uncomment below to throw an Error
		//throw new Error('This is an XML Error');
		return {
			'output': root.doc().toString(),
			'ctype': 'text/xml'
		};
	}

	// Build the JSON response from the text nodes that follow `node`.
	//   node  - DOM node whose following siblings are scanned
	//   count - number of results reported by the page; at most count * 2
	//           siblings are visited (presumably each anagram text node is
	//           paired with a separator element -- TODO confirm against the page)
	// Returns { output: '{"anagrams":[...]}', ctype: 'application/json' }.
	function getJSONResults(node, count) {
		var el = node.nextSibling,
			limit = count * 2,
			j = 0,
			elArray = [],
			text;
		// Also stop when the siblings run out: the reported count can exceed what
		// is actually in the DOM, and dereferencing null would throw.
		while (el && j < limit) {
			// nodeType 3 is a text node; very short ones are skipped.
			if (el.nodeType === 3 && el.nodeValue.length > 1) {
				// trim() strips the surrounding whitespace; the previous pattern
				// contained an escaped '|' and therefore never matched it.
				text = el.nodeValue.trim();
				// Whitespace-only text carries no anagram, so record nothing for it.
				if (text) {
					elArray.push(text);
				}
			}
			if (el.tagName === 'bottomlinks') {
				break;
			}
			j++;
			el = el.nextSibling;
		}
		//Uncomment below to throw an Error
		//throw new Error('This is an JSON Error');
		return {
			'output': JSON.stringify({ 'anagrams': elArray }),
			'ctype': 'application/json'
		};
	}

	// GET /scrape/:id/:format
	// Fetches the anagram page for :id from wordsmith.org and replies with the
	// anagrams as XML when :format is 'xml', or as JSON for any other value.
	// Errors thrown inside the request callback are not caught here; they are
	// expected to reach the per-request domain 'error' handler installed via app.use.
	app.get('/scrape/:id/:format', function(req, res) {
		//Get the word
		var word = req.params.id,
			formatType = req.params.format;
		//Hit the website
		request({
			// Encode the word so spaces, '&', '#', etc. cannot corrupt the query string.
			uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'
		}, function(error, response, body) {
			var b, doc, handleResults, handleResultsOutput, text, i;
			if (error || response.statusCode !== 200) {
				//Real Error so throw.
				throw new Error('Server responded with error, try again later');
			}
			doc = jsdom.jsdom(body, null, {
				features: {
					// FetchExternalResources : ['script'],
					ProcessExternalResources: false
				}
			});
			//determine the formatType;
			handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;
			//I know where the words are in the DOM returned from call.
			b = doc.getElementsByTagName('b');
			for (i = 0; i < b.length; i++) {
				text = b[i].innerHTML;
				// The <b> whose text contains "found" supplies the result count.
				if (text.search(/found/i) !== -1) {
					// NOTE(review): the scan always starts from the third <b> (b[2]),
					// not from b[i] -- presumably tied to the page layout; confirm.
					handleResultsOutput = handleResults(b[2], parseInt(text, 10));
				}
			}
			if (!handleResultsOutput) {
				// No "found" marker in the page: fail with a clear message instead of
				// a TypeError on undefined.output.
				throw new Error('No anagram results found in the response');
			}
			// Use the MIME type chosen by the result handler, not the raw :format segment.
			writeOut(res, handleResultsOutput.ctype, handleResultsOutput.output);
		});
	});

	// Start the HTTP server and announce it on stdout.
	var port = 3000;
	app.listen(port);
	console.log('server started');
	// URL format:   http://localhost:3000/scrape/<word to find anagrams of>/<json|xml>
	// Example call: http://localhost:3000/scrape/example/json