public
Last active

Node.js web-scraper anagram example, written as a test some time ago with an early Node version; it seems to still work.

  • Download Gist
NodeJS web scraper for Anagrams
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
//Web Scraper that scrapes a web page (without permission I may add).
//Simple test using node is all this is, no error handling etc.
//Returns JSON OR XML
//Format : localhost:'PORT'/scrape/'WORD'/'FORMAT'
//Example CURL
//JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
//XML - curl -X GET http://localhost:3000/scrape/fundamental/xml
 
var express = require('express'),
request = require('request'),
jsdom = require('jsdom'),
builder = require('xmlbuilder'),
sys = require('sys');
 
//Create the Express server and register the per-environment configuration callbacks.
var app = express.createServer();

//Development: nothing extra is configured.
function setupDevelopment() {/*whatever you want*/}

//Production: install Express's error handler, passing an empty options object.
function setupProduction() {
  var errorHandlerOptions = {};
  app.use(express.errorHandler(errorHandlerOptions));
}

app.configure('development', setupDevelopment);
app.configure('production', setupProduction);
 
//Route: GET /scrape/:id/:format
//Fetches the wordsmith.org anagram page for the word in :id, pulls the anagrams out of
//the returned HTML and replies with them as XML when :format is exactly 'xml',
//otherwise as JSON. Replies 502 when the upstream page cannot be fetched, and with an
//empty result set when the page contains no "found" marker.
app.get('/scrape/:id/:format', function(req, res) {
  //Get the word
  var word = req.params.id, formatType = req.params.format;

  //Walks the siblings that follow `node` and collects the trimmed text of every text
  //node longer than one character. Visits at most count * 2 siblings and stops early
  //when the siblings run out.
  function collectAnagrams(node, count) {
    var limit = count * 2;
    //`node` may be missing when the page has fewer <b> elements than expected.
    var el = node ? node.nextSibling : null;
    var j = 0, found = [];
    //A NaN limit (unparseable count) makes the comparison false, yielding no results.
    while (el && j < limit) {
      if (el.nodeType === 3 && el.nodeValue.length > 1) {
        found.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
      }
      //NOTE(review): presumably meant to stop at the page's bottom-links section;
      //confirm that a tagName comparison can ever match here.
      if (el.tagName === 'bottomlinks') { break; }
      j++;
      el = el.nextSibling;
    }
    return found;
  }

  //Handle the XML results (if we have any)
  //Returns {output: <serialized XML>, ctype: 'text/xml'}.
  function getXMLResults(node, count) {
    var root = builder.begin('anagrams');
    var words = collectAnagrams(node, count);
    for (var k = 0; k < words.length; k++) {
      var item = root.ele('anagram');
      item.txt(words[k]);
    }
    root.up();
    return {'output' : builder.toString(), 'ctype': 'text/xml'};
  }

  //JSON Results
  //Returns {output: <JSON string of {anagrams: [...]}>, ctype: 'application/json'}.
  function getJSONResults(node, count) {
    var anagObj = {};
    anagObj['anagrams'] = collectAnagrams(node, count);
    return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
  }

  //Hit the website. The word comes straight from the URL, so encode it before
  //placing it in the query string.
  request({uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'}, function(error, response, body) {
    var b, doc, handleResults, handleResultsOutput;

    //Always answer the client; otherwise a failed upstream request leaves it hanging.
    if (error || response.statusCode !== 200) {
      res.writeHead(502, {'Content-Type': 'text/plain'});
      res.end('Unable to fetch anagrams');
      return;
    }

    doc = jsdom.jsdom(body, null, {
      features: {
        FetchExternalResources: ['script'],
        ProcessExternalResources: false
      }
    });

    //determine the formatType;
    handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;

    //I know where the words are in the DOM.
    //The <b> whose text mentions "found" supplies the result count (its leading
    //integer); the anagrams themselves are read from the siblings after b[2].
    //NOTE(review): b[2] is hard-coded rather than b[i] — confirm against the page layout.
    b = doc.getElementsByTagName('b');
    for (var i = 0; i < b.length; i++) {
      var text = b[i].innerHTML;
      if (text.search(/found/i) !== -1) {
        handleResultsOutput = handleResults(b[2], parseInt(text, 10));
      }
    }

    //No "found" marker on the page: reply with an empty result set in the requested format.
    if (!handleResultsOutput) {
      handleResultsOutput = handleResults(null, 0);
    }

    //Send it out.
    res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
    res.end(handleResultsOutput.output);
  });

});
 
//Listen on the port given by the PORT environment variable, falling back to 3000,
//and log the port that was actually chosen (not always 3000).
var port = process.env.PORT || 3000;
app.listen(port);
console.log('server started port: ' + port);

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.