Skip to content

Instantly share code, notes, and snippets.

@fb55
Created April 25, 2010 18:23
Show Gist options
  • Save fb55/378599 to your computer and use it in GitHub Desktop.
Save fb55/378599 to your computer and use it in GitHub Desktop.
A YQL script to extract articles from web pages.
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  YQL table definition for the article extractor.
  It declares a single select binding whose work is done entirely by the
  JavaScript in the execute element below.
-->
<table xmlns="http://query.yahooapis.com/v1/schema/table.xsd">
<meta>
<author>Felix Boehm</author>
<documentationURL>http://feedic.com/</documentationURL>
<!-- NOTE(review): "t" looks like a placeholder table name; confirm the name this table is actually registered under. -->
<sampleQuery>select * from t where url="feedic.com"</sampleQuery>
</meta>
<bindings>
<select itemPath="" produces="XML">
<urls>
<!-- Left empty: the page is fetched from inside the script via y.query rather than through this binding URL. -->
<url></url>
</urls>
<inputs>
<!-- url (required): address of the page to extract the article from. -->
<key id="url" type="xs:string" paramType="variable" required="true" />
<!-- html (optional): when truthy, run() wraps the article in a full HTML document instead of returning it bare. -->
<key id="html" type="xs:string" paramType="variable" required="false" />
</inputs>
<!-- Loads run.js and calls run(); run() is presumably the function defined in that file. TODO confirm how "run.js" is resolved by y.include. -->
<execute><![CDATA[
y.include("run.js");
run();
]]>
</execute>
</select>
</bindings>
</table>
function run() {
var doc = new XML(y.query("select * from html where xpath='html' and url=@url", {
url: url
}).results);
var title = doc..title. * .toString();
//clean up doc
delete doc..head;
delete doc..span;
delete doc..script;
delete doc..noscript;
delete doc..iframe;
delete doc..frameset;
delete doc..input;
delete doc..textarea;
delete doc..select;
//the array to save the scores
var score = [],
a = 0;
//process doc
for each(var doca in doc.. * ) {
//search article
score[a] = 0;
//add score-points of paragraphs
for each(var p in doca.p) {
if (p.toString().length > 10) score[a] += 5;
score[a] += p.toString().split(",").length;
}
if (doca.@ ["class"].toString().match(/(comment|meta|footer|footnote)/)) score[a] -= 50;
else if (doca.@ ["class"].toString().match(/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/)) score[a] += 25;
if (doca.@ ["id"].toString().match(/(comment|meta|footer|footnote|watch-comment-panel|cm|disqus_thread|prodReviews)/)) score[a] -= 70;
else if (doca.@ ["id"].toString().match(/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/)) score[a] += 25;
a++;
}
var top = [0],
b = 0;
for each(s in score) {
if (s > top[0]) {
top[0] = s;
top[1] = b;
}
b++;
}
score,
b;
var art = doc.. * [top[1]]. * ;
/*
//fix links
var artas = art..a;
for(var b=0;b<artas.length();b++){
var a = artas[b];
if(a.@["href"]){
var s = a.@["href"][0];
if(!s.split("/")[0]){
a.@["href"][0] = "http://" + url + s;
}
else if(s.split("")[0].match(//w/)){
try{var urlroot = url.split("/")[0];}
catch(e){ var urlroot = url; }
finally { a.@["href"][0] = "http://" + urlroot + s; }
}
}
}
var artimg = art..img;
for(var b=0;b<artimg.length();b++){
var a = artimg[b];
if(a.@["src"]){
var s = a.@["src"][0];
if(!s.split("/")[0]){
a.@["src"][0] = "http://" + url + s;
}
else if(!s.match(url)){
var urlroot = url.split("/")[0];
a.@["src"][0] = "http://" + url.split("/")[0] + "/" + s;
}
}
else {
delete art..img[b];
}
}*/
//clean document
for each(var a in art.. * ) {
if (!a. * ) delete a;
}
for each(var a in art..@ ["id"]) {
delete a;
}
for each(var a in art..@ ["class"]) {
delete a;
}
for each(var a in art..@ ["onload"]) {
delete a;
}
for each(var a in art..@ ["rel"]) {
delete a;
}
for each(var a in art..@ ["mouseover"]) {
delete a;
}
for each(var a in art..@ ["tabindex"]) {
delete a;
}
var artikel = art.toString();
if (html) {
response.object = '<html><head><title>' + title + '</title><body><div id="article" score="' + top[0] + '">' + artikel + '</div></body></html>';
}
else {
response.object = artikel;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment