Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
A Node.js web scraper anagram example, written some time ago as a test against an early Node version; it still seems to work.
//Web Scraper that scrapes a web page (without permission I may add).
//Simple test using node is all this is, no error handling etc.
//Returns JSON OR XML
//Format : localhost:'PORT'/scrape/'WORD'/'FORMAT'
//Example CURL
//JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
//XML - curl -X GET http://localhost:3000/scrape/fundamental/xml
var express = require('express'),
request = require('request'),
jsdom = require('jsdom'),
builder = require('xmlbuilder'),
sys = require('sys');
// Create the Express server instance that the route and listen call below use.
var app = express.createServer();

// Development environment: nothing extra is registered.
app.configure('development', function() {
  /* intentionally empty */
});

// Production environment: install Express's error handler, passing it an
// empty options object.
app.configure('production', function() {
  var errorHandlerOptions = {};
  app.use(express.errorHandler(errorHandlerOptions));
});
/**
 * Walks the siblings that follow `node` and collects the trimmed text of
 * every text node whose value is longer than one character.
 *
 * The walk stops when any of the following happens:
 *   - `count * 2` siblings have been visited (presumably each anagram text
 *     node is followed by one separator element -- TODO confirm against the
 *     scraped markup),
 *   - a sibling whose tagName is 'bottomlinks' is reached,
 *   - the sibling chain runs out.
 *
 * @param {Object|null} node  Anchor DOM node; the walk starts at node.nextSibling.
 *                            A missing node yields an empty list.
 * @param {number} count      Number of anagrams the page reported. NaN yields
 *                            an empty list because the loop condition is false.
 * @returns {string[]} The collected anagram strings, in document order.
 */
function collectAnagrams(node, count) {
  var limit = count * 2;
  var found = [];
  var el = node ? node.nextSibling : null;
  var visited = 0;
  // Guard on `el` so a page with fewer siblings than expected ends the walk
  // instead of dereferencing null.
  while (el && visited < limit) {
    if (el.nodeType === 3 && el.nodeValue.length > 1) {
      found.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
    }
    if (el.tagName === 'bottomlinks') { break; }
    visited++;
    el = el.nextSibling;
  }
  return found;
}

/**
 * Builds the XML response: an <anagrams> root with one <anagram> child per
 * collected word.
 *
 * @param {Object|null} node  Anchor DOM node (see collectAnagrams).
 * @param {number} count      Reported anagram count (see collectAnagrams).
 * @returns {{output: string, ctype: string}} Serialized XML and its content type.
 */
function getXMLResults(node, count) {
  var words = collectAnagrams(node, count);
  var root = builder.begin('anagrams');
  for (var i = 0; i < words.length; i++) {
    var item = root.ele('anagram');
    item.txt(words[i]);
  }
  root.up();
  return {'output' : builder.toString(), 'ctype': 'text/xml'};
}

/**
 * Builds the JSON response: {"anagrams": [...]}.
 *
 * @param {Object|null} node  Anchor DOM node (see collectAnagrams).
 * @param {number} count      Reported anagram count (see collectAnagrams).
 * @returns {{output: string, ctype: string}} Serialized JSON and its content type.
 */
function getJSONResults(node, count) {
  var anagObj = {'anagrams': collectAnagrams(node, count)};
  return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
}

// GET /scrape/:id/:format
// Looks up anagrams of :id on wordsmith.org and returns them as XML when
// :format is 'xml', otherwise as JSON.
app.get('/scrape/:id/:format', function(req, res) {
  var word = req.params.id;
  var formatType = req.params.format;
  // Encode the word so characters such as '&', '#' or spaces cannot break or
  // extend the upstream query string.
  var uri = 'http://wordsmith.org/anagram/anagram.cgi?anagram=' +
    encodeURIComponent(word) + '&t=1000&a=n';

  request({uri: uri}, function(error, response, body) {
    // Always answer the client; without this branch a failed upstream call
    // left the request hanging.
    if (error || response.statusCode !== 200) {
      res.writeHead(502, {'Content-Type': 'text/plain'});
      res.end('Upstream anagram lookup failed');
      return;
    }

    var doc = jsdom.jsdom(body, null, {
      features: {
        FetchExternalResources: ['script'],
        ProcessExternalResources: false
      }
    });

    // Anything other than 'xml' falls back to JSON.
    var handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;

    // Declared locally: the original misspelled this name in its `var` list,
    // so the assignment below created a global shared by every request.
    var handleResultsOutput = null;

    // The result count lives in a <b> whose text contains "found".
    var b = doc.getElementsByTagName('b');
    for (var i = 0; i < b.length; i++) {
      var text = b[i].innerHTML;
      if (text.search(/found/i) !== -1) {
        // NOTE(review): the anchor is always the third <b> (index 2), not the
        // element that matched -- kept from the original; confirm against the
        // page markup.
        handleResultsOutput = handleResults(b[2], parseInt(text, 10));
      }
    }

    // No "found" marker on the page: answer with an empty result set in the
    // requested format instead of throwing on an undefined value.
    if (!handleResultsOutput) {
      handleResultsOutput = handleResults(null, 0);
    }

    //Send it out.
    res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
    res.end(handleResultsOutput.output);
  });
});
// Resolve the port once so the startup log reports the port actually used;
// the original always logged 3000 even when PORT was set.
var port = process.env.PORT || 3000;
app.listen(port);
console.log('server started port: ' + port);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.