Skip to content

Instantly share code, notes, and snippets.

@AYLIEN
Created February 2, 2016 13:52
Show Gist options
  • Save AYLIEN/03edf443e756ad3edab0 to your computer and use it in GitHub Desktop.
Save AYLIEN/03edf443e756ad3edab0 to your computer and use it in GitHub Desktop.
Classify Homepages
var _ = require('underscore'),
cheerio = require('cheerio'),
request = require('request'),
AYLIENTextAPI = require("aylien_textapi");
// AYLIEN Text API client. Replace the placeholder credentials with your own
// application id and key before running the script.
var textapi = new AYLIENTextAPI({
  application_id: "YourApplicationId",
  application_key: "YourApplicationKey"
});

// Homepage to download and classify.
var url = 'http://www.bbc.com/';

// Fetch the page, reduce its markup to plain text with extract(), classify
// that text against the 'iab-qag' taxonomy and print the resulting categories.
request(url, function(err, resp, body) {
  if (err) {
    // Report the failure instead of silently doing nothing.
    console.error('Failed to fetch ' + url + ':', err);
    return;
  }
  if (resp.statusCode < 200 || resp.statusCode >= 300) {
    // An error page is not the homepage; classifying it would be misleading.
    console.error('Failed to fetch ' + url + ': HTTP status ' + resp.statusCode);
    return;
  }

  var text = extract(body);
  textapi.classifyByTaxonomy({'text': text, 'taxonomy': 'iab-qag'}, function(classifyErr, result) {
    if (classifyErr) {
      // Without this guard, reading result.categories throws when the call fails.
      console.error('Classification failed:', classifyErr);
      return;
    }
    console.log(result.categories);
  });
});
/**
 * Gather the text of every element matching a selector.
 *
 * @param {string} tagName - Selector (for example 'h1') handed to `$`.
 * @param {Function} $ - Cheerio-style query function for the loaded document.
 * @returns {string} The trimmed, non-empty element texts joined by single spaces.
 */
function getText(tagName, $) {
  var matches = $(tagName);
  var pieces = [];
  for (var i = 0; i < matches.length; i++) {
    var content = $(matches[i]).text().trim();
    // Elements holding only whitespace contribute nothing to the result.
    if (content.length > 0) {
      pieces.push(content);
    }
  }
  return pieces.join(' ');
}
/**
 * Reduce an HTML page to a single string of descriptive text.
 *
 * Collects the meta keywords, the meta description, the h1/h2 headings,
 * link text and image alt text, in that order.
 *
 * @param {string} body - HTML markup of the page.
 * @returns {string} The non-empty fragments joined by single spaces
 *   (an empty string when nothing was found).
 */
function extract(body) {
  var $ = cheerio.load(body);

  // .attr() yields undefined when the meta tag or its content attribute is
  // missing, so fall back to an empty string before trimming.
  var keywords = ($('meta[name="keywords"]').attr('content') || '').trim();
  var description = ($('meta[name="description"]').attr('content') || '').trim();

  var imgAlts = _.chain($('img[alt]')).map(function(e) {
    return ($(e).attr('alt') || '').trim();
  }).filter(function(alt) {
    // Drop empty alt attributes (alt="") so they do not add stray spaces.
    return alt.length > 0;
  }).value().join(' ');

  var h1 = getText('h1', $);
  var h2 = getText('h2', $);
  var links = getText('a', $);

  // The meta keywords and description are part of the text, not just read and
  // discarded. Empty fragments are skipped so the result has no leading or
  // doubled spaces.
  var parts = _.filter([keywords, description, h1, h2, links, imgAlts], function(part) {
    return part.length > 0;
  });
  return parts.join(' ');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment