Skip to content

Instantly share code, notes, and snippets.

@daithiw44
Created July 2, 2012 09:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save daithiw44/3032376 to your computer and use it in GitHub Desktop.
Save daithiw44/3032376 to your computer and use it in GitHub Desktop.
Nodejs Anagram Website Scraper with express and Domains, update to previous example gist: 1335009
// Updated example that uses 'basic' Domains with express.
// Website scraper that scrapes a site (without permission, I may add).
// I had seen some screen-scraping examples using jQuery; this example is jQuery-less.
var express = require('express'),
request = require('request'),
jsdom = require('jsdom'),
builder = require('xmlbuilder').create(),
sys = require('util'),
createDomain = require('domain').create,
app = express.createServer();
/**
 * Escape the characters that would break XML text content.
 *
 * @param {*} text - Value to embed; coerced to a string.
 * @returns {string} The text with &, < and > replaced by entities.
 */
function escapeXmlText(text) {
  return String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

// Per-request error domain: every request gets its own domain so that an
// error thrown while the request is being served is reported back to that
// client instead of crashing the process.
app.use(function(req, res, next) {
  var domain = createDomain();
  domain.on('error', function(err) {
    // Guard in case req.params has not been populated when the error fires.
    var format = req.params && req.params.format,
      ctype, formattedMsg;
    //server Error 500 but we'll handle it and send out a 200.
    if (format === 'json') {
      ctype = 'application/json';
      // JSON.stringify escapes quotes/backslashes/newlines in the message,
      // which plain string concatenation would leave as invalid JSON.
      formattedMsg = JSON.stringify({ error: err.message });
    } else {
      ctype = 'text/xml';
      formattedMsg = '<anagrams><error>' + escapeXmlText(err.message) + '</error></anagrams>';
    }
    // Send a real MIME type rather than the raw ':format' URL segment.
    writeOut(res, ctype, formattedMsg);
    domain.dispose();
  });
  // run() enters the domain, calls next(), and exits again, so the domain
  // does not stay active after this middleware returns.
  domain.run(next);
});
/**
 * Send a complete HTTP 200 response in one step.
 *
 * @param {Object} res - Response object; must provide writeHead() and end().
 * @param {string} ctype - Value sent as the Content-Type header.
 * @param {string} output - Body handed to res.end().
 */
function writeOut(res, ctype, output) {
  var headers = {};
  headers['Content-Type'] = ctype;
  res.writeHead(200, headers);
  res.end(output);
}
/**
 * Build the XML response from the sibling nodes that follow `node`.
 *
 * Walks up to `count * 2` siblings and adds one <anagram> element for every
 * text node (nodeType 3) whose value is longer than one character, with
 * surrounding whitespace trimmed. The walk stops early at a node whose
 * tagName is 'bottomlinks', or when the siblings run out.
 * NOTE(review): the factor of two presumably allows for a separator element
 * after each anagram in the scraped page - confirm against the live markup.
 *
 * @param {Object} node - DOM node whose following siblings hold the anagrams.
 * @param {number} count - Number of anagrams reported by the page.
 * @returns {{output: string, ctype: string}} Serialised XML and its MIME type.
 */
function getXMLResults(node, count) {
  var el = node.nextSibling,
    limit = count * 2,
    j = 0,
    root, item;
  root = builder.begin('anagrams', {
    'version': '1.0',
    'encoding': 'UTF-8',
    'standalone': true
  });
  // `el` becomes null when the sibling chain ends before `limit` is reached;
  // checking it here avoids a TypeError on a shorter-than-expected page.
  while (j < limit && el) {
    if (el.nodeType === 3 && el.nodeValue.length > 1) {
      item = root.ele('anagram');
      item.txt(el.nodeValue.replace(/^\s+|\s+$/g, ''));
    }
    if (el.tagName === 'bottomlinks') {
      break;
    }
    j++;
    el = el.nextSibling;
  }
  //Uncomment below to throw an Error
  //throw new Error('This is an XML Error');
  return {
    'output': root.doc().toString(),
    'ctype': 'text/xml'
  };
}
/**
 * Build the JSON response from the sibling nodes that follow `node`.
 *
 * Walks up to `count * 2` siblings and collects the trimmed value of every
 * text node (nodeType 3) whose value is longer than one character. The walk
 * stops early at a node whose tagName is 'bottomlinks', or when the siblings
 * run out.
 * NOTE(review): the factor of two presumably allows for a separator element
 * after each anagram in the scraped page - confirm against the live markup.
 *
 * @param {Object} node - DOM node whose following siblings hold the anagrams.
 * @param {number} count - Number of anagrams reported by the page.
 * @returns {{output: string, ctype: string}} JSON text of the form
 *   {"anagrams":[...]} and its MIME type.
 */
function getJSONResults(node, count) {
  var el = node.nextSibling,
    limit = count * 2,
    j = 0,
    elArray = [];
  // `el` becomes null when the sibling chain ends before `limit` is reached;
  // checking it here avoids a TypeError on a shorter-than-expected page.
  while (j < limit && el) {
    if (el.nodeType === 3 && el.nodeValue.length > 1) {
      elArray.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
    }
    if (el.tagName === 'bottomlinks') {
      break;
    }
    j++;
    el = el.nextSibling;
  }
  //Uncomment below to throw an Error
  //throw new Error('This is an JSON Error');
  return {
    'output': JSON.stringify({ 'anagrams': elArray }),
    'ctype': 'application/json'
  };
}
/**
 * GET /scrape/:id/:format
 *
 * Fetches the anagram page for the word in `:id`, extracts the anagrams from
 * the returned HTML and writes them back as XML (when `:format` is 'xml') or
 * JSON (any other value).
 *
 * Failures are thrown from inside the request callback rather than handled
 * here; this relies on an error handler being installed elsewhere (the
 * per-request domain middleware in this file) to turn them into a response.
 */
app.get('/scrape/:id/:format', function(req, res) {
  //Get the word
  var word = req.params.id,
    formatType = req.params.format;
  //Hit the website
  request({
    // Encode the word so spaces, '&', '#', etc. cannot corrupt the query string.
    uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'
  }, function(error, response, body) {
    var b, doc, handleResults, handleResultsOutput, text, i;
    if (error || response.statusCode !== 200) {
      //Real Error so throw.
      throw new Error('Server responded with error, try again later');
    }
    doc = jsdom.jsdom(body, null, {
      features: {
        ProcessExternalResources: false
      }
    });
    //determine the formatType;
    handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;
    //I know where the words are in the DOM returned from call.
    b = doc.getElementsByTagName('b');
    for (i = 0; i < b.length; i++) {
      text = b[i].innerHTML;
      // The <b> containing "found" starts with the result count; the
      // anagrams themselves are read from the siblings after the third <b>.
      if (text.search(/found/i) !== -1) {
        handleResultsOutput = handleResults(b[2], parseInt(text, 10));
      }
    }
    if (!handleResultsOutput) {
      // No "found" marker in the page: fail with a descriptive message
      // instead of a TypeError on an undefined result.
      throw new Error('Could not locate anagram results in the upstream page');
    }
    // Use the MIME type chosen by the formatter, not the raw ':format' segment.
    writeOut(res, handleResultsOutput.ctype, handleResultsOutput.output);
  });
});
// Start accepting HTTP connections on port 3000 and announce it on stdout.
var port = 3000;
app.listen(port);
console.log('server started');
// URL format:   http://localhost:3000/scrape/<word to find anagrams of>/<format: json or xml>
// Example call: http://localhost:3000/scrape/example/json
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment