Created
April 25, 2010 18:23
-
-
Save fb55/378599 to your computer and use it in GitHub Desktop.
A YQL script that extracts articles from web pages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8" ?>
<!--
    YQL table definition for the article extractor.
    The select binding makes no request of its own (the <url> entry is empty);
    all work happens in the <execute> script, which loads run.js and calls run().
-->
<table xmlns="http://query.yahooapis.com/v1/schema/table.xsd">
    <meta>
        <author>Felix Boehm</author>
        <documentationURL>http://feedic.com/</documentationURL>
        <sampleQuery>select * from t where url="feedic.com"</sampleQuery>
    </meta>
    <bindings>
        <select itemPath="" produces="XML">
            <urls>
                <url></url>
            </urls>
            <inputs>
                <!-- url: address of the page to extract the article from (required). -->
                <key id="url" type="xs:string" paramType="variable" required="true" />
                <!-- html: optional flag; when set, run() wraps the article in a complete HTML document. -->
                <key id="html" type="xs:string" paramType="variable" required="false" />
            </inputs>
            <execute><![CDATA[
                y.include("run.js");
                run();
            ]]>
            </execute>
        </select>
    </bindings>
</table>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function run() { | |
var doc = new XML(y.query("select * from html where xpath='html' and url=@url", { | |
url: url | |
}).results); | |
var title = doc..title. * .toString(); | |
//clean up doc | |
delete doc..head; | |
delete doc..span; | |
delete doc..script; | |
delete doc..noscript; | |
delete doc..iframe; | |
delete doc..frameset; | |
delete doc..input; | |
delete doc..textarea; | |
delete doc..select; | |
//the array to save the scores | |
var score = [], | |
a = 0; | |
//process doc | |
for each(var doca in doc.. * ) { | |
//search article | |
score[a] = 0; | |
//add score-points of paragraphs | |
for each(var p in doca.p) { | |
if (p.toString().length > 10) score[a] += 5; | |
score[a] += p.toString().split(",").length; | |
} | |
if (doca.@ ["class"].toString().match(/(comment|meta|footer|footnote)/)) score[a] -= 50; | |
else if (doca.@ ["class"].toString().match(/((^|\\s)(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)(\\s|$))/)) score[a] += 25; | |
if (doca.@ ["id"].toString().match(/(comment|meta|footer|footnote|watch-comment-panel|cm|disqus_thread|prodReviews)/)) score[a] -= 70; | |
else if (doca.@ ["id"].toString().match(/^(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)$/)) score[a] += 25; | |
a++; | |
} | |
var top = [0], | |
b = 0; | |
for each(s in score) { | |
if (s > top[0]) { | |
top[0] = s; | |
top[1] = b; | |
} | |
b++; | |
} | |
score, | |
b; | |
var art = doc.. * [top[1]]. * ; | |
/* | |
//fix links | |
var artas = art..a; | |
for(var b=0;b<artas.length();b++){ | |
var a = artas[b]; | |
if(a.@["href"]){ | |
var s = a.@["href"][0]; | |
if(!s.split("/")[0]){ | |
a.@["href"][0] = "http://" + url + s; | |
} | |
else if(s.split("")[0].match(//w/)){ | |
try{var urlroot = url.split("/")[0];} | |
catch(e){ var urlroot = url; } | |
finally { a.@["href"][0] = "http://" + urlroot + s; } | |
} | |
} | |
} | |
var artimg = art..img; | |
for(var b=0;b<artimg.length();b++){ | |
var a = artimg[b]; | |
if(a.@["src"]){ | |
var s = a.@["src"][0]; | |
if(!s.split("/")[0]){ | |
a.@["src"][0] = "http://" + url + s; | |
} | |
else if(!s.match(url)){ | |
var urlroot = url.split("/")[0]; | |
a.@["src"][0] = "http://" + url.split("/")[0] + "/" + s; | |
} | |
} | |
else { | |
delete art..img[b]; | |
} | |
}*/ | |
//clean document | |
for each(var a in art.. * ) { | |
if (!a. * ) delete a; | |
} | |
for each(var a in art..@ ["id"]) { | |
delete a; | |
} | |
for each(var a in art..@ ["class"]) { | |
delete a; | |
} | |
for each(var a in art..@ ["onload"]) { | |
delete a; | |
} | |
for each(var a in art..@ ["rel"]) { | |
delete a; | |
} | |
for each(var a in art..@ ["mouseover"]) { | |
delete a; | |
} | |
for each(var a in art..@ ["tabindex"]) { | |
delete a; | |
} | |
var artikel = art.toString(); | |
if (html) { | |
response.object = '<html><head><title>' + title + '</title><body><div id="article" score="' + top[0] + '">' + artikel + '</div></body></html>'; | |
} | |
else { | |
response.object = artikel; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment