Skip to content

Instantly share code, notes, and snippets.

@AYLIEN
Created January 26, 2015 17:52
Show Gist options
  • Save AYLIEN/7a1c60fa33f5f10fe896 to your computer and use it in GitHub Desktop.
Save AYLIEN/7a1c60fa33f5f10fe896 to your computer and use it in GitHub Desktop.
News Extraction and Analysis
//**********************************************************
// This application takes a web site address 'webSite' and
// searches it for an RSS feed. If an RSS feed is found, the
// address is displayed along with an analysis of
// 'numberOfStories' stories from that feed.
// Try it for yourself by specifying a website below.
//**********************************************************
// ---- User settings --------------------------------------
// Address of the site to inspect for an RSS feed.
var webSite = 'http://gigaom.com/';
// How many stories from the feed should be analysed.
var numberOfStories = 3;

// ---- Third-party modules --------------------------------
var AYLIENTextAPI = require('aylien_textapi');
var request = require('request');
var xml2js = require('xml2js');
var _ = require('underscore');

// ---- Shared state ---------------------------------------
// Text API client; replace the placeholder credentials with real ones.
var textapi = new AYLIENTextAPI({
  application_id: 'YourApplicationId',
  application_key: 'YourApplicationKey'
});
// XML parser used for the downloaded feed.
var parser = new xml2js.Parser();
// Feed address; assigned once the extract call reports a feed.
var rssFeed;
// Per-story analysis results, keyed by story title.
var analysisResults = {};
// Outstanding API calls: three (classify, concepts, entities) per story.
var counter = numberOfStories * 3;
//***************************************************************
// Article Extraction
// Scrapes an article and extracts main text, images, feeds, etc
//
// Here the extract call is pointed at 'webSite'. When the response
// lists at least one feed, the first one is stored in 'rssFeed',
// downloaded with 'request' and handed to the XML parser; the
// parsed stories are then analysed by callAylienAPIs and printed
// by outputResults.
//***************************************************************
textapi.extract(webSite, function(err, resp) {
  if (err) {
    console.log("Error: " + err);
    return;
  }

  // Tolerate a response that carries no 'feeds' list at all.
  var feeds = (resp && resp.feeds) || [];
  if (!feeds[0]) {
    console.log("No RSS feed was found for the specified URL");
    return;
  }

  rssFeed = feeds[0];
  console.log("\nThe RSS feed link for the site is :\n\n", rssFeed);

  // Attach the parser's 'end' listener before the download starts so it
  // is guaranteed to be in place when the feed body is parsed.
  callAylienAPIs(outputResults);

  request(rssFeed, function(error, response, body) {
    if (error) {
      console.log(error);
      return;
    }
    // Supplying a callback routes XML parse failures here instead of
    // leaving them as an unhandled 'error' event on the parser.
    parser.parseString(body, function(parseError) {
      if (parseError) {
        console.log("Error: " + parseError);
      }
    });
  });
});
//**********************************************************
// Calls the AYLIEN APIs and makes sure each call is
// completed before printing the results to screen.
//
// Registers an 'end' listener on the shared 'parser'. When the
// parsed feed arrives, every selected story is stored in
// 'analysisResults' (keyed by title) and sent to the classify,
// concepts and entities calls of 'textapi'.
//
// callback - function called with no arguments once every
//            call has reported back, or straight away when
//            the feed yields no stories to analyse.
//**********************************************************
function callAylienAPIs(callback) {
  parser.addListener('end', function(result) {
    // Feeds without an rss/channel/item structure yield an empty list
    // instead of throwing.
    var channel = result && result.rss && result.rss.channel && result.rss.channel[0];
    var items = (channel && channel.item) || [];

    // NOTE(review): the slice starts at index 1, so the first feed item
    // is skipped - confirm that this is intended.
    var stories = items.slice(1, numberOfStories + 1);

    // Three API calls per story that was actually selected. Counting the
    // real stories (rather than assuming 'numberOfStories' of them exist)
    // guarantees the callback fires even for short feeds.
    var pending = 3 * stories.length;
    if (pending === 0) {
      callback();
      return;
    }

    // Marks one API call as finished and fires the callback after the last.
    function finishOne() {
      pending--;
      if (pending === 0) {
        callback();
      }
    }

    stories.forEach(function(item) {
      var title = item.title[0];
      var link = item.link[0];
      var story = { title: title, link: link };
      analysisResults[title] = story;

      //**********************************************************
      // Article Classification
      // Classifies an article into about 500 categories
      //**********************************************************
      textapi.classify(link, function(error, result) {
        var top = !error && result && result.categories && result.categories[0];
        if (top) {
          story.category = {
            label: top.label,
            code: top.code,
            confidence: top.confidence
          };
          story.category_error = false;
        } else {
          story.category_error = true;
        }
        finishOne();
      });

      //**********************************************************
      // Concept Extraction
      // Extracts the concepts mentioned in a text
      //**********************************************************
      textapi.concepts(link, function(err, resp) {
        if (err) {
          console.log("Error: " + err);
        } else {
          story.concepts = resp.concepts;
        }
        finishOne();
      });

      //**********************************************************
      // Entity Extraction
      // Extracts the entities mentioned in a text
      //**********************************************************
      textapi.entities(link, function(err, resp) {
        if (err) {
          console.log("Error: " + err);
        } else {
          story.entities = resp.entities;
        }
        finishOne();
      });
    });
  });
}
//**********************************************************
// Prints every story held in 'analysisResults' to the
// console: a numbered header with title and link, followed
// by its classification, concepts and entities. Sections
// for which no data was stored print a "not available"
// message instead.
//**********************************************************
function outputResults() {
  // Entity types printed, in this order, when present for a story.
  var entityTypes = ['organization', 'location', 'keyword', 'date', 'person', 'money', 'percentage', 'time', 'url', 'email', 'phone'];
  var storyNumber = 1;

  Object.keys(analysisResults).forEach(function(key) {
    var story = analysisResults[key];

    if (story.title && story.link) {
      console.log("\n\n************************************************************");
      console.log("Story Number " + storyNumber);
      console.log("************************************************************\n\n");
      console.log("Story Title : ", story.title);
      console.log("Story Link : ", story.link);
      storyNumber++;
    }

    console.log("\nClassification :");
    if (story.category_error === false) {
      console.log("-----------------");
      console.log("Category Label : " + story.category.label);
      console.log("IPTC Code : " + story.category.code);
      console.log("Confidence : " + story.category.confidence);
    } else {
      console.log("No Classification available for this story\n");
    }

    console.log("\nConcepts :");
    console.log("----------");
    if (story.concepts) {
      Object.keys(story.concepts).forEach(function(uri) {
        var forms = story.concepts[uri].surfaceForms;
        var first = forms && forms[0];
        console.log("\nConcept DBPedia URI : " + uri);
        // A concept without any surface form has no details to show;
        // skip them rather than dereferencing undefined.
        if (!first) {
          return;
        }
        console.log("Surface Form : " + first.string);
        console.log("Relavance Score : " + first.score);
        console.log("Offset Index : " + first.offset);
      });
    } else {
      console.log("No Concept data available for this story\n");
    }

    console.log("\nEntities :");
    console.log("-----------");
    if (story.entities) {
      entityTypes.forEach(function(entityType) {
        if (story.entities[entityType]) {
          console.log("\n" + entityType + " : " + story.entities[entityType].join(', '));
        }
      });
    } else {
      console.log("No Entity data available for this story\n");
    }
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment