Created
January 26, 2015 17:52
-
-
Save AYLIEN/7a1c60fa33f5f10fe896 to your computer and use it in GitHub Desktop.
News Extraction and Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//**********************************************************
// This application takes a web site address 'webSite' and
// searches it for an RSS feed. If an RSS feed is found, the
// address is displayed along with the first 'numberOfStories'
// stories. Try it for yourself by specifying a website below.
//**********************************************************
// Site whose RSS feed should be discovered, and how many stories to analyse.
var webSite = 'http://gigaom.com/';
var numberOfStories = 3;

// Third-party dependencies used by this script.
var AYLIENTextAPI = require('aylien_textapi'),
    request = require('request'),
    xml2js = require('xml2js'),
    _ = require('underscore');

// AYLIEN Text API client; replace the placeholders with your own credentials.
var textapi = new AYLIENTextAPI({
  application_id: 'YourApplicationId',
  application_key: 'YourApplicationKey'
});

// URL of the RSS feed discovered for 'webSite' (set once extraction succeeds).
var rssFeed;

// XML parser used to turn the downloaded RSS feed into a JavaScript object.
var parser = new xml2js.Parser();

// Number of outstanding API calls: three analyses per story.
var counter = 3 * numberOfStories;

// Analysis results collected per story, keyed by story title.
var analysisResults = {};
//***************************************************************
// Article Extraction
// Scrapes an article and extracts main text, images, feeds, etc.
//***************************************************************
// Entry point: asks the AYLIEN extract endpoint for the feeds advertised by
// 'webSite', downloads the first feed and hands its XML to the parser. The
// parser's 'end' listener (registered by callAylienAPIs) drives the analysis.
textapi.extract(webSite, function(err, resp) {
  if (err) {
    console.log("Error: " + err);
    return;
  }

  // Guard against a response without a 'feeds' array.
  var feeds = (resp && resp.feeds) || [];
  if (!feeds[0]) {
    console.log("No RSS feed was found for the specified URL");
    return;
  }

  rssFeed = feeds[0];
  console.log("\nThe RSS feed link for the site is :\n\n", rssFeed);

  // Register every parser listener before the download starts, so no event
  // can be emitted without a handler. An unhandled 'error' event on an
  // EventEmitter would otherwise crash the process on a malformed feed.
  parser.addListener('error', function(parseError) {
    console.log("Error: " + parseError);
  });
  callAylienAPIs(outputResults);

  request(rssFeed, function(error, response, body) {
    if (error) {
      console.log(error);
      return;
    }
    parser.parseString(body);
  });
});
//**********************************************************
// Calls the AYLIEN APIs and makes sure each call is
// completed before printing the results to screen.
//**********************************************************
/**
 * Registers an 'end' listener on the shared XML parser. When the feed has
 * been parsed, the first 'numberOfStories' items are classified and have
 * their concepts and entities extracted through the AYLIEN Text API. The
 * results are stored in 'analysisResults', keyed by story title.
 *
 * @param {Function} callback - Invoked once, with no arguments, after every
 *     API call for every selected story has completed (successfully or not).
 *     It is also invoked straight away when the feed contains no stories.
 */
function callAylienAPIs(callback) {
  parser.addListener('end', function(result) {
    // Tolerate results that are not shaped like an RSS document.
    var channel = result && result.rss && result.rss.channel && result.rss.channel[0];
    var items = (channel && channel.item) || [];

    // Take the FIRST 'numberOfStories' items (starting at index 0).
    var stories = items.slice(0, numberOfStories);

    // Count the calls actually issued, so a feed with fewer items than
    // requested still reaches zero and triggers the callback.
    var pending = 3 * stories.length;
    if (pending === 0) {
      console.log("No stories were found in the RSS feed");
      callback();
      return;
    }

    // Marks one API call as finished; fires the callback after the last one.
    function finishOne() {
      pending--;
      if (pending === 0) {
        callback();
      }
    }

    stories.forEach(function(item) {
      var title = item.title[0];
      var link = item.link[0];
      var story = { title: title, link: link };
      analysisResults[title] = story;

      //**********************************************************
      // Article Classification
      // Classifies an article into about 500 categories
      //**********************************************************
      textapi.classify(link, function(error, result) {
        var top = !error && result && result.categories && result.categories[0];
        if (top) {
          story.category = {
            label: top.label,
            code: top.code,
            confidence: top.confidence
          };
          story.category_error = false;
        } else {
          story.category_error = true;
        }
        finishOne();
      });

      //**********************************************************
      // Concept Extraction
      // Extracts the concepts mentioned in a text
      //**********************************************************
      textapi.concepts(link, function(err, resp) {
        if (err) {
          console.log("Error: " + err);
        } else {
          story.concepts = resp.concepts;
        }
        finishOne();
      });

      //**********************************************************
      // Entity Extraction
      // Extracts the entities mentioned in a text
      //**********************************************************
      textapi.entities(link, function(err, resp) {
        if (err) {
          console.log("Error: " + err);
        } else {
          story.entities = resp.entities;
        }
        finishOne();
      });
    });
  });
}
/**
 * Prints the analysis collected in 'analysisResults' to the console.
 *
 * For each story it prints a numbered header with title and link (when both
 * are present), then the classification, the concepts and the entities, or a
 * "not available" message for any section that has no data.
 */
function outputResults() {
  // Entity types printed, in this order, when present on a story.
  var entityTypes = ['organization', 'location', 'keyword', 'date', 'person', 'money', 'percentage', 'time', 'url', 'email', 'phone'];
  var storyNumber = 1;

  Object.keys(analysisResults).forEach(function(key) {
    var story = analysisResults[key];

    if (story.title && story.link) {
      console.log("\n\n************************************************************");
      console.log("Story Number " + storyNumber);
      console.log("************************************************************\n\n");
      console.log("Story Title : ", story.title);
      console.log("Story Link : ", story.link);
      storyNumber++;
    }

    console.log("\nClassification :");
    // category_error is only false when a category was actually stored.
    if (story.category_error === false) {
      console.log("-----------------");
      console.log("Category Label : " + story.category.label);
      console.log("IPTC Code : " + story.category.code);
      console.log("Confidence : " + story.category.confidence);
    } else {
      console.log("No Classification available for this story\n");
    }

    console.log("\nConcepts :");
    console.log("----------");
    if (story.concepts) {
      Object.keys(story.concepts).forEach(function(uri) {
        var info = story.concepts[uri];
        // Only the first surface form is reported; skip it when absent.
        var form = info && info.surfaceForms && info.surfaceForms[0];
        console.log("\nConcept DBPedia URI : " + uri);
        if (form) {
          console.log("Surface Form : " + form.string);
          console.log("Relavance Score : " + form.score);
          console.log("Offset Index : " + form.offset);
        }
      });
    } else {
      console.log("No Concept data available for this story\n");
    }

    console.log("\nEntities :");
    console.log("-----------");
    if (story.entities) {
      entityTypes.forEach(function(entityType) {
        if (story.entities[entityType]) {
          console.log("\n" + entityType + " : " + story.entities[entityType].join(', '));
        }
      });
    } else {
      console.log("No Entity data available for this story\n");
    }
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment