Skip to content

Instantly share code, notes, and snippets.

@kpricorn
Created November 17, 2010 20:05
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save kpricorn/703967 to your computer and use it in GitHub Desktop.
Save kpricorn/703967 to your computer and use it in GitHub Desktop.
nodejs readability
var document;
var dbg = (typeof console !== 'undefined') ? function(s) {
console.log("Readability: " + s);
} : function() {};
/*
* Readability. An Arc90 Lab Experiment.
* Website: http://lab.arc90.com/experiments/readability
* Source: http://code.google.com/p/arc90labs-readability
*
* "Readability" is a trademark of Arc90 Inc and may not be used without explicit permission.
*
* Copyright (c) 2010 Arc90 Inc
* Readability is licensed under the Apache License, Version 2.0.
**/
var readability = {
version: '1.7.1',
emailSrc: 'http://lab.arc90.com/experiments/readability/email.php',
iframeLoads: 0,
convertLinksToFootnotes: false,
reversePageScroll: false, /* If they hold shift and hit space, scroll up */
frameHack: false, /**
* The frame hack is to workaround a firefox bug where if you
* pull content out of a frame and stick it into the parent element, the scrollbar won't appear.
* So we fake a scrollbar in the wrapping div.
**/
biggestFrame: false,
bodyCache: null, /* Cache the body HTML in case we need to re-use it later */
flags: 0x1 | 0x2 | 0x4, /* Start with all flags set. */
/* constants */
FLAG_STRIP_UNLIKELYS: 0x1,
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */
parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */
pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */
/**
* All of the regular expressions in use within readability.
* Defined up here so we don't instantiate them repeatedly in loops.
**/
regexps: {
unlikelyCandidates: /combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
negative: /combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single/i,
divToPElements: /<(a|blockquote|dl|div|img|ol|p|pre|table|ul)/i,
replaceBrs: /(<br[^>]*>[ \n\r\t]*){2,}/gi,
replaceFonts: /<(\/?)font[^>]*>/gi,
trim: /^\s+|\s+$/g,
normalize: /\s{2,}/g,
killBreaks: /(<br\s*\/?>(\s|&nbsp;?)*){1,}/g,
videos: /http:\/\/(www\.)?(youtube|vimeo)\.com/i,
skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, // Match: next, continue, >, >>, » but not >|, »| as those usually mean last.
prevLink: /(prev|earl|old|new|<|«)/i
},
/**
* Runs readability.
*
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
* 3. Grab the article content from the current dom tree.
* 4. Replace the current DOM tree with the new one.
* 5. Read peacefully.
*
* @return void
**/
init: function() {
readability.removeScripts(document);
if(document.body && !readability.bodyCache) {
readability.bodyCache = document.body.innerHTML;
}
/* Make sure this document is added to the list of parsed pages first, so we don't double up on the first page */
readability.parsedPages[window.location.href.replace(/\/$/, '')] = true;
/* Pull out any possible next page link first */
var nextPageLink = readability.findNextPageLink(document.body);
readability.prepDocument();
/* Build readability's DOM tree */
var overlay = document.createElement("DIV");
var innerDiv = document.createElement("DIV");
var articleTools = readability.getArticleTools();
var articleTitle = readability.getArticleTitle();
var articleContent = readability.grabArticle();
var articleFooter = readability.getArticleFooter();
if(!articleContent) {
articleContent = document.createElement("DIV");
articleContent.id = "readability-content";
articleContent.innerHTML = [
"<p>Sorry, readability was unable to parse this page for content. If you feel like it should have been able to, please <a href='http://code.google.com/p/arc90labs-readability/issues/entry'>let us know by submitting an issue.</a></p>",
(readability.frameHack ? "<p><strong>It appears this page uses frames.</strong> Unfortunately, browser security properties often cause Readability to fail on pages that include frames. You may want to try running readability itself on this source page: <a href='" + readability.biggestFrame.src + "'>" + readability.biggestFrame.src + "</a></p>" : ""),
"<p>Also, please note that Readability does not play very nicely with front pages. Readability is intended to work on articles with a sizable chunk of text that you'd like to read comfortably. If you're using Readability on a landing page (like nytimes.com for example), please click into an article first before using Readability.</p>"
].join('');
nextPageLink = null;
}
overlay.id = "readOverlay";
innerDiv.id = "readInner";
/* Apply user-selected styling */
document.body.className = readStyle;
document.dir = readability.getSuggestedDirection(articleTitle.innerHTML);
if (readStyle == "style-athelas" || readStyle == "style-apertura"){
overlay.className = readStyle + " rdbTypekit";
}
else {
overlay.className = readStyle;
}
innerDiv.className = readMargin + " " + readSize;
if(typeof(readConvertLinksToFootnotes) !== 'undefined' && readConvertLinksToFootnotes === true) {
readability.convertLinksToFootnotes = true;
}
/* Glue the structure of our document together. */
innerDiv.appendChild( articleTitle );
innerDiv.appendChild( articleContent );
innerDiv.appendChild( articleFooter );
overlay.appendChild( articleTools );
overlay.appendChild( innerDiv );
/* Clear the old HTML, insert the new content. */
document.body.innerHTML = "";
document.body.insertBefore(overlay, document.body.firstChild);
document.body.removeAttribute('style');
if(readability.frameHack)
{
var readOverlay = document.getElementById('readOverlay');
readOverlay.style.height = '100%';
readOverlay.style.overflow = 'auto';
}
/**
* If someone tries to use Readability on a site's root page, give them a warning about usage.
**/
if((window.location.protocol + "//" + window.location.host + "/") == window.location.href)
{
articleContent.style.display = "none";
var rootWarning = document.createElement('p');
rootWarning.id = "readability-warning";
rootWarning.innerHTML = "<em>Readability</em> was intended for use on individual articles and not home pages. " +
"If you'd like to try rendering this page anyway, <a onClick='javascript:document.getElementById(\"readability-warning\").style.display=\"none\";document.getElementById(\"readability-content\").style.display=\"block\";'>click here</a> to continue.";
innerDiv.insertBefore( rootWarning, articleContent );
}
readability.postProcessContent(articleContent);
window.scrollTo(0, 0);
/* If we're using the Typekit library, select the font */
if (readStyle == "style-athelas" || readStyle == "style-apertura") {
readability.useRdbTypekit();
}
if (nextPageLink) {
/**
* Append any additional pages after a small timeout so that people
* can start reading without having to wait for this to finish processing.
**/
window.setTimeout(function() {
readability.appendNextPage(nextPageLink);
}, 500);
}
/** Smooth scrolling **/
document.onkeydown = function(e) {
var code = (window.event) ? event.keyCode : e.keyCode;
if (code === 16) {
readability.reversePageScroll = true;
return;
}
if (code === 32) {
readability.curScrollStep = 0;
var windowHeight = window.innerHeight ? window.innerHeight : (document.documentElement.clientHeight ? document.documentElement.clientHeight : document.body.clientHeight);
if(readability.reversePageScroll) {
readability.scrollTo(readability.scrollTop(), readability.scrollTop() - (windowHeight - 50), 20, 10);
}
else {
readability.scrollTo(readability.scrollTop(), readability.scrollTop() + (windowHeight - 50), 20, 10);
}
return false;
}
};
document.onkeyup = function(e) {
var code = (window.event) ? event.keyCode : e.keyCode;
if (code === 16) {
readability.reversePageScroll = false;
return;
}
}
},
/**
* Run any post-process modifications to article content as necessary.
*
* @param Element
* @return void
**/
postProcessContent: function(articleContent) {
if(readability.convertLinksToFootnotes && !window.location.href.match(/wikipedia\.org/g)) {
readability.addFootnotes(articleContent);
}
readability.fixImageFloats(articleContent);
},
/**
* Some content ends up looking ugly if the image is too large to be floated.
* If the image is wider than a threshold (currently 55%), no longer float it,
* center it instead.
*
* @param Element
* @return void
**/
fixImageFloats: function (articleContent) {
var imageWidthThreshold = Math.min(articleContent.offsetWidth, 800) * 0.55,
images = articleContent.getElementsByTagName('img');
for(var i=0, il = images.length; i < il; i++) {
var image = images[i];
if(image.offsetWidth > imageWidthThreshold) {
image.className += " blockImage";
}
}
},
/**
* Get the article tools Element that has buttons like reload, print, email.
*
* @return void
**/
getArticleTools: function () {
var articleTools = document.createElement("DIV");
articleTools.id = "readTools";
articleTools.innerHTML =
"<a href='#' onclick='return window.location.reload()' title='Reload original page' id='reload-page'>Reload Original Page</a>" +
"<a href='#' onclick='javascript:window.print();' title='Print page' id='print-page'>Print Page</a>" +
"<a href='#' onclick='readability.emailBox(); return false;' title='Email page' id='email-page'>Email Page</a>";
return articleTools;
},
/**
* retuns the suggested direction of the string
*
* @return "rtl" || "ltr"
**/
getSuggestedDirection: function(text) {
function sanitizeText() {
return text.replace(/@\w+/, "");
}
function countMatches(match) {
var matches = text.match(new RegExp(match, "g"));
return matches != null ? matches.length : 0;
}
function isRTL() {
var count_heb = countMatches("[\\u05B0-\\u05F4\\uFB1D-\\uFBF4]");
var count_arb = countMatches("[\\u060C-\\u06FE\\uFB50-\\uFEFC]");
// if 20% of chars are Hebrew or Arbic then direction is rtl
return (count_heb + count_arb) * 100 / text.length > 20;
}
text = sanitizeText(text);
return isRTL() ? "rtl" : "ltr";
},
/**
* Get the article title as an H1.
*
* @return void
**/
getArticleTitle: function () {
var curTitle = "",
origTitle = "";
try {
curTitle = origTitle = document.title;
if(typeof curTitle != "string") { /* If they had an element with id "title" in their HTML */
curTitle = origTitle = readability.getInnerText(document.getElementsByTagName('title')[0]);
}
}
catch(e) {}
if(curTitle.match(/ [\|\-] /))
{
curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
if(curTitle.split(' ').length < 3) {
curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
}
}
else if(curTitle.indexOf(': ') !== -1)
{
curTitle = origTitle.replace(/.*:(.*)/gi, '$1');
if(curTitle.split(' ').length < 3) {
curTitle = origTitle.replace(/[^:]*[:](.*)/gi,'$1');
}
}
else if(curTitle.length > 150 || curTitle.length < 15)
{
var hOnes = document.getElementsByTagName('h1');
if(hOnes.length == 1)
{
curTitle = readability.getInnerText(hOnes[0]);
}
}
curTitle = curTitle.replace( readability.regexps.trim, "" );
if(curTitle.split(' ').length <= 4) {
curTitle = origTitle;
}
var articleTitle = document.createElement("H1");
articleTitle.innerHTML = curTitle;
return articleTitle;
},
/**
* Get the footer with the readability mark etc.
*
* @return void
**/
getArticleFooter: function () {
var articleFooter = document.createElement("DIV");
/**
* For research purposes, generate an img src that contains the chosen readstyle etc,
* so we can generate aggregate stats and change styles based on them in the future
**/
// var statsQueryParams = "?readStyle=" + encodeURIComponent(readStyle) + "&readMargin=" + encodeURIComponent(readMargin) + "&readSize=" + encodeURIComponent(readSize);
/* TODO: attach this to an image */
articleFooter.id = "readFooter";
articleFooter.innerHTML = [
"<div id='rdb-footer-print'>Excerpted from <cite>" + document.title + "</cite><br />" + window.location.href + "</div>",
"<div id='rdb-footer-wrapper'>",
"<div id='rdb-footer-left'>",
"<a href='http://lab.arc90.com/experiments/readability' id='readability-logo'>Readability &mdash;&nbsp;</a>",
"<a href='http://www.arc90.com/' id='arc90-logo'> An Arc90 Laboratory Experiment&nbsp;</a>",
" <span id='readability-url'> http://lab.arc90.com/experiments/readability</span>",
"</div>",
"<div id='rdb-footer-right'>",
"<a href='http://www.twitter.com/arc90' class='footer-twitterLink'>Follow us on Twitter &raquo;</a>",
"<span class='version'>Readability version " + readability.version + "</span>",
"</div>",
"</div>"].join('');
return articleFooter;
},
/**
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
*
* @return void
**/
prepDocument: function () {
/**
* In some cases a body element can't be found (if the HTML is totally hosed for example)
* so we create a new body node and append it to the document.
*/
if(document.body === null)
{
var body = document.createElement("body");
try {
document.body = body;
}
catch(e) {
document.documentElement.appendChild(body);
dbg(e);
}
}
document.body.id = "readabilityBody";
var frames = document.getElementsByTagName('frame');
if(frames.length > 0)
{
var bestFrame = null;
var bestFrameSize = 0; /* The frame to try to run readability upon. Must be on same domain. */
var biggestFrameSize = 0; /* Used for the error message. Can be on any domain. */
for(var frameIndex = 0; frameIndex < frames.length; frameIndex++)
{
var frameSize = frames[frameIndex].offsetWidth + frames[frameIndex].offsetHeight;
var canAccessFrame = false;
try {
frames[frameIndex].contentWindow.document.body;
canAccessFrame = true;
}
catch(eFrames) {
dbg(eFrames);
}
if(frameSize > biggestFrameSize) {
biggestFrameSize = frameSize;
readability.biggestFrame = frames[frameIndex];
}
if(canAccessFrame && frameSize > bestFrameSize)
{
readability.frameHack = true;
bestFrame = frames[frameIndex];
bestFrameSize = frameSize;
}
}
if(bestFrame)
{
var newBody = document.createElement('body');
newBody.innerHTML = bestFrame.contentWindow.document.body.innerHTML;
newBody.style.overflow = 'scroll';
document.body = newBody;
var frameset = document.getElementsByTagName('frameset')[0];
if(frameset) {
frameset.parentNode.removeChild(frameset); }
}
}
/* Remove all stylesheets */
for (var k=0;k < document.styleSheets.length; k++) {
if (document.styleSheets[k].href !== null && document.styleSheets[k].href.lastIndexOf("readability") == -1) {
document.styleSheets[k].disabled = true;
}
}
/* Remove all style tags in head (not doing this on IE) - TODO: Why not? */
var styleTags = document.getElementsByTagName("style");
for (var st=0;st < styleTags.length; st++) {
styleTags[st].textContent = "";
}
/* Turn all double br's into p's */
/* Note, this is pretty costly as far as processing goes. Maybe optimize later. */
document.body.innerHTML = document.body.innerHTML.replace(readability.regexps.replaceBrs, '</p><p>').replace(readability.regexps.replaceFonts, '<$1span>');
},
/**
* For easier reading, convert this document to have footnotes at the bottom rather than inline links.
* @see http://www.roughtype.com/archives/2010/05/experiments_in.php
*
* @return void
**/
addFootnotes: function(articleContent) {
var footnotesWrapper = document.getElementById('readability-footnotes'),
articleFootnotes = document.getElementById('readability-footnotes-list');
if(!footnotesWrapper) {
footnotesWrapper = document.createElement("DIV");
footnotesWrapper.id = 'readability-footnotes';
footnotesWrapper.innerHTML = '<h3>References</h3>';
footnotesWrapper.style.display = 'none'; /* Until we know we have footnotes, don't show the references block. */
articleFootnotes = document.createElement('ol');
articleFootnotes.id = 'readability-footnotes-list';
footnotesWrapper.appendChild(articleFootnotes);
var readFooter = document.getElementById('readFooter');
if(readFooter) {
readFooter.parentNode.insertBefore(footnotesWrapper, readFooter);
}
}
var articleLinks = articleContent.getElementsByTagName('a');
var linkCount = articleFootnotes.getElementsByTagName('li').length;
for (var i = 0; i < articleLinks.length; i++)
{
var articleLink = articleLinks[i],
footnoteLink = articleLink.cloneNode(true),
refLink = document.createElement('a'),
footnote = document.createElement('li'),
linkDomain = footnoteLink.host ? footnoteLink.host : document.location.host,
linkText = readability.getInnerText(articleLink);
if(articleLink.className && articleLink.className.indexOf('readability-DoNotFootnote') !== -1 || linkText.match(readability.regexps.skipFootnoteLink)) {
continue;
}
linkCount++;
/** Add a superscript reference after the article link */
refLink.href = '#readabilityFootnoteLink-' + linkCount;
refLink.innerHTML = '<small><sup>[' + linkCount + ']</sup></small>';
refLink.className = 'readability-DoNotFootnote';
try { refLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. */
if(articleLink.parentNode.lastChild == articleLink) {
articleLink.parentNode.appendChild(refLink);
} else {
articleLink.parentNode.insertBefore(refLink, articleLink.nextSibling);
}
articleLink.name = 'readabilityLink-' + linkCount;
try { articleLink.style.color = 'inherit'; } catch(e) {} /* IE7 doesn't like inherit. */
footnote.innerHTML = "<small><sup><a href='#readabilityLink-" + linkCount + "' title='Jump to Link in Article'>^</a></sup></small> ";
footnoteLink.innerHTML = (footnoteLink.title ? footnoteLink.title : linkText);
footnoteLink.name = 'readabilityFootnoteLink-' + linkCount;
footnote.appendChild(footnoteLink);
footnote.innerHTML = footnote.innerHTML + "<small> (" + linkDomain + ")</small>";
articleFootnotes.appendChild(footnote);
}
if(linkCount > 0) {
footnotesWrapper.style.display = 'block';
}
},
useRdbTypekit: function () {
var rdbHead = document.getElementsByTagName('head')[0];
var rdbTKScript = document.createElement('script');
var rdbTKCode = null;
var rdbTKLink = document.createElement('a');
rdbTKLink.setAttribute('class','rdbTK-powered');
rdbTKLink.setAttribute('title','Fonts by Typekit');
rdbTKLink.innerHTML = "Fonts by <span class='rdbTK'>Typekit</span>";
if (readStyle == "style-athelas") {
rdbTKCode = "sxt6vzy";
dbg("Using Athelas Theme");
rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=athelas');
rdbTKLink.setAttribute('id','rdb-athelas');
document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
}
if (readStyle == "style-apertura") {
rdbTKCode = "bae8ybu";
dbg("Using Inverse Theme");
rdbTKLink.setAttribute('href','http://typekit.com/?utm_source=readability&utm_medium=affiliate&utm_campaign=inverse');
rdbTKLink.setAttribute('id','rdb-inverse');
document.getElementById("rdb-footer-right").appendChild(rdbTKLink);
}
/**
* Setting new script tag attributes to pull Typekits libraries
**/
rdbTKScript.setAttribute('type','text/javascript');
rdbTKScript.setAttribute('src',"http://use.typekit.com/" + rdbTKCode + ".js");
rdbTKScript.setAttribute('charset','UTF-8');
rdbHead.appendChild(rdbTKScript);
/**
* In the future, maybe try using the following experimental Callback function?:
* http://gist.github.com/192350
* &
* http://getsatisfaction.com/typekit/topics/support_a_pre_and_post_load_callback_function
**/
var typekitLoader = function() {
dbg("Looking for Typekit.");
if(typeof Typekit != "undefined") {
try {
dbg("Caught typekit");
Typekit.load();
clearInterval(window.typekitInterval);
} catch(e) {
dbg("Typekit error: " + e);
}
}
};
window.typekitInterval = window.setInterval(typekitLoader, 100);
},
/**
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous <p> tags, etc.
*
* @param Element
* @return void
**/
prepArticle: function (articleContent) {
readability.cleanStyles(articleContent);
readability.killBreaks(articleContent);
/* Clean out junk from the article content */
readability.cleanConditionally(articleContent, "form");
readability.clean(articleContent, "object");
readability.clean(articleContent, "h1");
/**
* If there is only one h2, they are probably using it
* as a header and not a subheader, so remove it since we already have a header.
***/
if(articleContent.getElementsByTagName('h2').length == 1) {
readability.clean(articleContent, "h2");
}
readability.clean(articleContent, "iframe");
readability.cleanHeaders(articleContent);
/* Do these last as the previous stuff may have removed junk that will affect these */
readability.cleanConditionally(articleContent, "table");
readability.cleanConditionally(articleContent, "ul");
readability.cleanConditionally(articleContent, "div");
/* Remove extra paragraphs */
var articleParagraphs = articleContent.getElementsByTagName('p');
for(var i = articleParagraphs.length-1; i >= 0; i--) {
var imgCount = articleParagraphs[i].getElementsByTagName('img').length;
var embedCount = articleParagraphs[i].getElementsByTagName('embed').length;
var objectCount = articleParagraphs[i].getElementsByTagName('object').length;
if(imgCount === 0 && embedCount === 0 && objectCount === 0 && readability.getInnerText(articleParagraphs[i], false) == '') {
articleParagraphs[i].parentNode.removeChild(articleParagraphs[i]);
}
}
try {
articleContent.innerHTML = articleContent.innerHTML.replace(/<br[^>]*>\s*<p/gi, '<p');
}
catch (e) {
dbg("Cleaning innerHTML of breaks failed. This is an IE strict-block-elements bug. Ignoring.: " + e);
}
},
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*
* @param Element
* @return void
**/
initializeNode: function (node) {
node.readability = {"contentScore": 0};
switch(node.tagName) {
case 'DIV':
node.readability.contentScore += 5;
break;
case 'PRE':
case 'TD':
case 'BLOCKQUOTE':
node.readability.contentScore += 3;
break;
case 'ADDRESS':
case 'OL':
case 'UL':
case 'DL':
case 'DD':
case 'DT':
case 'LI':
case 'FORM':
node.readability.contentScore -= 3;
break;
case 'H1':
case 'H2':
case 'H3':
case 'H4':
case 'H5':
case 'H6':
case 'TH':
node.readability.contentScore -= 5;
break;
}
node.readability.contentScore += readability.getClassWeight(node);
},
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
grabArticle: function (page) {
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
isPaging = (page !== null) ? true: false;
page = page ? page : document.body;
var pageCacheHtml = page.innerHTML;
var allElements = page.getElementsByTagName('*');
/**
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
*
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
* TODO: Shouldn't this be a reverse traversal?
**/
var node = null;
var nodesToScore = [];
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex++) {
/* Remove unlikely candidates */
if (stripUnlikelyCandidates) {
var unlikelyMatchString = node.className + node.id;
if (
(
unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) == -1 &&
node.tagName !== "BODY"
)
)
{
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
nodeIndex--;
continue;
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
nodesToScore[nodesToScore.length] = node;
}
/* Turn all divs that don't have children block level elements into p's */
if (node.tagName === "DIV") {
if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
var newNode = document.createElement('p');
try {
newNode.innerHTML = node.innerHTML;
node.parentNode.replaceChild(newNode, node);
nodeIndex--;
nodesToScore[nodesToScore.length] = node;
}
catch(e) {
dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
}
}
else
{
/* EXPERIMENTAL */
for(var i = 0, il = node.childNodes.length; i < il; i++) {
var childNode = node.childNodes[i];
if(childNode.nodeType == 3) { // Node.TEXT_NODE
var p = document.createElement('p');
p.innerHTML = childNode.nodeValue;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
}
}
}
}
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt++) {
var parentNode = nodesToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = readability.getInnerText(nodesToScore[pt]);
if(!parentNode || typeof(parentNode.tagName) == 'undefined') {
continue;
}
/* If this paragraph is less than 25 characters, don't even count it. */
if(innerText.length < 25) {
continue; }
/* Initialize readability data for the parent. */
if(typeof parentNode.readability == 'undefined') {
readability.initializeNode(parentNode);
candidates.push(parentNode);
}
/* Initialize readability data for the grandparent. */
if(grandParentNode && typeof(grandParentNode.readability) == 'undefined' && typeof(grandParentNode.tagName) != 'undefined') {
readability.initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore++;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
grandParentNode.readability.contentScore += contentScore/2;
}
}
/**
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
**/
var topCandidate = null;
for(var c=0, cl=candidates.length; c < cl; c++)
{
/**
* Scale the final candidates score based on link density. Good content should have a
* relatively small link density (5% or less) and be mostly unaffected by this operation.
**/
candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
topCandidate = candidates[c]; }
}
/**
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
**/
if (topCandidate === null || topCandidate.tagName == "BODY")
{
topCandidate = document.createElement("DIV");
topCandidate.innerHTML = page.innerHTML;
page.innerHTML = "";
page.appendChild(topCandidate);
readability.initializeNode(topCandidate);
}
/**
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
**/
var articleContent = document.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for(var s=0, sl=siblingNodes.length; s < sl; s++) {
var siblingNode = siblingNodes[s];
var append = false;
/**
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
* Example of error visible here: http://www.esquire.com/features/honesty0707
**/
if(!siblingNode) {
continue;
}
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability != 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if(siblingNode === topCandidate)
{
append = true;
}
var contentBonus = 0;
/* Give a bonus if sibling nodes and top candidates have the example same classname */
if(siblingNode.className == topCandidate.className && topCandidate.className != "") {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if(typeof siblingNode.readability != 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
{
append = true;
}
if(siblingNode.nodeName == "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
}
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
{
append = true;
}
}
if(append) {
dbg("Appending node: " + siblingNode);
var nodeToAppend = null;
if(siblingNode.nodeName != "DIV" && siblingNode.nodeName != "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
nodeToAppend.id = siblingNode.id;
nodeToAppend.innerHTML = siblingNode.innerHTML;
}
catch(er) {
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
nodeToAppend = siblingNode;
s--;
sl--;
}
} else {
nodeToAppend = siblingNode;
s--;
sl--;
}
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(nodeToAppend);
}
}
/**
* So we have all of the content that we need. Now we clean it up for presentation.
**/
readability.prepArticle(articleContent);
if (readability.curPageNum === 1) {
articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
}
/**
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* finding the -right- content.
**/
if(readability.getInnerText(articleContent, false).length < 250) {
page.innerHTML = pageCacheHtml;
if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
return readability.grabArticle(page);
} else {
return null;
}
}
return articleContent;
},
/**
* Removes script tags from the document.
*
* @param Element
**/
removeScripts: function (doc) {
var scripts = doc.getElementsByTagName('script');
for(var i = scripts.length-1; i >= 0; i--)
{
if(typeof(scripts[i].src) == "undefined" || (scripts[i].src.indexOf('readability') == -1 && scripts[i].src.indexOf('typekit') == -1))
{
scripts[i].nodeValue="";
scripts[i].removeAttribute('src');
if (scripts[i].parentNode) {
scripts[i].parentNode.removeChild(scripts[i]);
}
}
}
},
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
*
* @param Element
* @return string
**/
getInnerText: function (e, normalizeSpaces) {
var textContent = "";
if(typeof(e.textContent) == "undefined" && typeof(e.innerText) == "undefined") {
return "";
}
normalizeSpaces = (typeof normalizeSpaces == 'undefined') ? true : normalizeSpaces;
textContent = e.textContent.replace( readability.regexps.trim, "" );
if(normalizeSpaces) {
return textContent.replace( readability.regexps.normalize, " "); }
else {
return textContent; }
},
/**
* Get the number of times a string s appears in the node e.
*
* @param Element
* @param string - what to split on. Default is ","
* @return number (integer)
**/
getCharCount: function (e,s) {
s = s || ",";
return readability.getInnerText(e).split(s).length-1;
},
/**
* Remove the style attribute on every e and under.
* TODO: Test if getElementsByTagName(*) is faster.
*
* @param Element
* @return void
**/
cleanStyles: function (e) {
e = e || document;
var cur = e.firstChild;
if(!e) {
return; }
// Remove any root styles, if we're able.
if(typeof e.removeAttribute == 'function' && e.className != 'readability-styled') {
e.removeAttribute('style'); }
// Go until there are no more child nodes
while ( cur !== null ) {
if ( cur.nodeType == 1 ) {
// Remove style attribute(s) :
if(cur.className != "readability-styled") {
cur.removeAttribute("style");
}
readability.cleanStyles( cur );
}
cur = cur.nextSibling;
}
},
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
*
* @param Element
* @return number (float)
**/
getLinkDensity: function (e) {
var links = e.getElementsByTagName("a");
var textLength = readability.getInnerText(e).length;
var linkLength = 0;
for(var i=0, il=links.length; i<il;i++)
{
linkLength += readability.getInnerText(links[i]).length;
}
return linkLength / textLength;
},
/**
* Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
*
* @author Dan Lacy
* @return string the base url
**/
findBaseUrl: function () {
var noUrlParams = window.location.pathname.split("?")[0],
urlSlashes = noUrlParams.split("/").reverse(),
cleanedSegments = [],
possibleType = "";
for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i++) {
var segment = urlSlashes[i];
// Split off and save anything that looks like a file type.
if (segment.indexOf(".") !== -1) {
possibleType = segment.split(".")[1];
/* If the type isn't alpha-only, it's probably not actually a file extension. */
if(!possibleType.match(/[^a-zA-Z]/)) {
segment = segment.split(".")[0];
}
}
/**
* EW-CMS specific segment replacement. Ugly.
* Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
**/
if(segment.indexOf(',00') !== -1) {
segment = segment.replace(',00', '');
}
// If our first or second segment has anything looking like a page number, remove it.
if (segment.match(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i) && ((i === 1) || (i === 0))) {
segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, "");
}
del = false;
/* If this is purely a number, and it's the first or second segment, it's probably a page number. Remove it. */
if (i < 2 && segment.match(/^\d{1,2}$/)) {
del = true;
}
/* If this is the first segment and it's just "index", remove it. */
if(i === 0 && segment.toLowerCase() == "index")
del = true;
/* If our first or second segment is smaller than 3 characters, and the first segment was purely alphas, remove it. */
if(i < 2 && segment.length < 3 && !urlSlashes[0].match(/[a-z]/i))
del = true;
/* If it's not marked for deletion, push it to cleanedSegments. */
if (!del) {
cleanedSegments.push(segment);
}
}
// This is our final, cleaned, base article URL.
return window.location.protocol + "//" + window.location.host + cleanedSegments.reverse().join("/");
},
/**
* Look for any paging links that may occur within the document.
*
* @param body
* @return object (array)
**/
findNextPageLink: function (elem) {
var possiblePages = {},
allLinks = elem.getElementsByTagName('a'),
articleBaseUrl = readability.findBaseUrl();
/**
* Loop through all links, looking for hints that they may be next-page links.
* Things like having "page" in their textContent, className or id, or being a child
* of a node with a page-y className or id.
*
* Also possible: levenshtein distance? longest common subsequence?
*
* After we do that, assign each page a score, and
**/
for(i = 0, il = allLinks.length; i < il; i++) {
var link = allLinks[i],
linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
/* If we've already seen this page, ignore it */
if(linkHref == "" || linkHref == articleBaseUrl || linkHref == window.location.href || linkHref in readability.parsedPages) {
continue;
}
/* If it's on a different domain, skip it. */
if(window.location.host != linkHref.split(/\/+/g)[1]) {
continue;
}
var linkText = readability.getInnerText(link);
/* If the linkText looks like it's not the next page, skip it. */
if(linkText.match(readability.regexps.extraneous) || linkText.length > 25) {
continue;
}
/* If the leftovers of the URL after removing the base URL don't contain any digits, it's certainly not a next page link. */
var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
if(!linkHrefLeftover.match(/\d/)) {
continue;
}
if(!(linkHref in possiblePages)) {
possiblePages[linkHref] = {"score": 0, "linkText": linkText, "href": linkHref};
} else {
possiblePages[linkHref].linkText += ' | ' + linkText;
}
linkObj = possiblePages[linkHref];
/**
* If the articleBaseUrl isn't part of this URL, penalize this link. It could still be the link, but the odds are lower.
* Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
**/
if(linkHref.indexOf(articleBaseUrl) !== 0) {
linkObj.score -= 25;
}
var linkData = linkText + ' ' + link.className + ' ' + link.id;
if(linkData.match(readability.regexps.nextLink)) {
linkObj.score += 50;
}
if(linkData.match(/pag(e|ing|inat)/i)) {
linkObj.score += 25;
}
if(linkData.match(/(first|last)/i)) { // -65 is enough to negate any bonuses gotten from a > or » in the text,
/* If we already matched on "next", last is probably fine. If we didn't, then it's bad. Penalize. */
if(!linkObj.linkText.match(readability.regexps.nextLink))
linkObj.score -= 65;
}
if(linkData.match(readability.regexps.negative) || linkData.match(readability.regexps.extraneous)) {
linkObj.score -= 50;
}
if(linkData.match(readability.regexps.prevLink)) {
linkObj.score -= 200;
}
/* If a parentNode contains page or paging or paginat */
var parentNode = link.parentNode,
positiveNodeMatch = false,
negativeNodeMatch = false;
while(parentNode) {
var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
if(!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
positiveNodeMatch = true;
linkObj.score += 25;
}
if(!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(readability.regexps.negative)) {
/* If this is just something like "footer", give it a negative. If it's something like "body-and-footer", leave it be. */
if(!parentNodeClassAndId.match(readability.regexps.positive)) {
linkObj.score -= 25;
negativeNodeMatch = true;
}
}
parentNode = parentNode.parentNode;
}
/**
* If the URL looks like it has paging in it, add to the score.
* Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
**/
if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i)) {
linkObj.score += 25;
}
/* If the URL contains negative values, give a slight decrease. */
if (linkHref.match(readability.regexps.extraneous)) {
linkObj.score -= 15;
}
/**
* Minor punishment to anything that doesn't match our current URL.
* NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
* Dan, can you show me a counterexample where this is necessary?
* if (linkHref.indexOf(window.location.href) !== 0) {
* linkObj.score -= 1;
* }
**/
/**
* If the link text can be parsed as a number, give it a minor bonus, with a slight
* bias towards lower numbered pages. This is so that pages that might not have 'next'
* in their text can still get scored, and sorted properly by score.
**/
linkTextAsNumber = parseInt(linkText, 10);
if(linkTextAsNumber) {
// Punish 1 since we're either already there, or it's probably before what we want anyways.
if (linkTextAsNumber === 1) {
linkObj.score -= 10;
}
else {
// Todo: Describe this better
linkObj.score += Math.max(0, 10 - linkTextAsNumber);
}
}
}
/**
* Loop thrugh all of our possible pages from above and find our top candidate for the next page URL.
* Require at least a score of 50, which is a relatively high confidence that this page is the next link.
**/
var topPage = null;
for(var page in possiblePages) {
if(possiblePages.hasOwnProperty(page)) {
if(possiblePages[page].score >= 50 && (!topPage || topPage.score < possiblePages[page].score)) {
topPage = possiblePages[page];
}
}
}
if(topPage) {
var nextHref = topPage.href.replace(/\/$/,'');
dbg('NEXT PAGE IS ' + nextHref);
readability.parsedPages[nextHref] = true;
return nextHref;
}
else {
return null;
}
},
/**
* Build a simple cross browser compatible XHR.
*
* TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk.
**/
xhr: function () {
if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) {
return new XMLHttpRequest();
}
else {
try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { }
try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { }
try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { }
}
return false;
},
successfulRequest: function (request) {
return (request.status >= 200 && request.status < 300) || request.status == 304 || (request.status === 0 && request.responseText);
},
ajax: function (url, options) {
var request = readability.xhr();
function respondToReadyState(readyState) {
if (request.readyState == 4) {
if (readability.successfulRequest(request)) {
if (options.success) { options.success(request); }
}
else {
if (options.error) { options.error(request); }
}
}
}
if (typeof options === 'undefined') { options = {}; }
request.onreadystatechange = respondToReadyState;
request.open('get', url, true);
request.setRequestHeader('Accept', 'text/html');
try {
request.send(options.postBody);
}
catch (e) {
if (options.error) { options.error(); }
}
return request;
},
/**
* Make an AJAX request for each page and append it to the document.
**/
curPageNum: 1,
appendNextPage: function (nextPageLink) {
readability.curPageNum++;
var articlePage = document.createElement("DIV");
articlePage.id = 'readability-page-' + readability.curPageNum;
articlePage.className = 'page';
articlePage.innerHTML = '<p class="page-separator" title="Page ' + readability.curPageNum + '">&sect;</p>';
document.getElementById("readability-content").appendChild(articlePage);
if(readability.curPageNum > readability.maxPages) {
var nextPageLink = "<div style='text-align: center'><a href='" + nextPageLink + "'>View Next Page</a></div>";
articlePage.innerHTML = articlePage.innerHTML + nextPageLink;
return;
}
/**
* Now that we've built the article page DOM element, get the page content
* asynchronously and load the cleaned content into the div we created for it.
*
* Todo: try using a self-calling function rather than with
*
* Yes, "with statement is considered harmful". But this is using with as a replacement for let which is in ecmascript 1.7, so it's okay.
* See here: http://stackoverflow.com/questions/61552/are-there-legitimate-uses-for-javascripts-with-statement#answer-185283
**/
with({pageUrl: nextPageLink, thisPage: articlePage}) {
readability.ajax(pageUrl, {
success: function(r) {
/* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */
var eTag = r.getResponseHeader('ETag');
if(eTag) {
if(eTag in readability.pageETags) {
dbg("Exact duplicate page found via ETag. Aborting.");
articlePage.style.display = 'none';
return;
} else {
readability.pageETags[eTag] = 1;
}
}
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
var page = document.createElement("DIV");
/**
* Do some preprocessing to our HTML to make it ready for appending.
* • Remove any script tags. Swap and reswap newlines with a unicode character because multiline regex doesn't work in javascript.
* • Turn any noscript tags into divs so that we can parse them. This allows us to find any next page links hidden via javascript.
* • Turn all double br's into p's - was handled by prepDocument in the original view.
* Maybe in the future abstract out prepDocument to work for both the original document and AJAX-added pages.
**/
var responseHtml = r.responseText.
replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '').
replace(/\uffff/g,'\n').
replace(/<(\/?)noscript/gi, '<$1div').
replace(readability.regexps.replaceBrs, '</p><p>').
replace(readability.regexps.replaceFonts, '<$1span>');
page.innerHTML = responseHtml;
/**
* Reset all flags for the next page, as they will search through it and disable as necessary at the end of grabArticle.
**/
readability.flags = 0x1 | 0x2 | 0x4;
var nextPageLink = readability.findNextPageLink(page),
content = readability.grabArticle(page);
if(!content) {
dbg("No content found in page to append. Aborting.")
return;
}
/**
* Anti-duplicate mechanism. Essentially, get the first paragraph of our new page.
* Compare it against all of the the previous document's we've gotten. If the previous
* document contains exactly the innerHTML of this first paragraph, it's probably a duplicate.
**/
firstP = content.getElementsByTagName("P").length ? content.getElementsByTagName("P")[0] : null;
if(firstP && firstP.innerHTML.length > 100) {
for(var i=1; i <= readability.curPageNum; i++) {
var rPage = document.getElementById('readability-page-' + i);
if(rPage && rPage.innerHTML.indexOf(firstP.innerHTML) !== -1) {
dbg('Duplicate of page ' + i + ' - skipping.');
articlePage.style.display = 'none';
readability.parsedPages[pageUrl] = true;
return;
}
}
}
readability.removeScripts(content);
thisPage.innerHTML = thisPage.innerHTML + content.innerHTML;
/**
* After the page has rendered, post process the content. This delay is necessary because,
* in webkit at least, offsetWidth is not set in time to determine image width. We have to
* wait a little bit for reflow to finish before we can fix floating images.
**/
window.setTimeout(
function() { readability.postProcessContent(thisPage); },
500
);
if(nextPageLink) {
readability.appendNextPage(nextPageLink);
}
}
});
}
},
/**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
*
* @param Element
* @return number (Integer)
**/
getClassWeight: function (e) {
if(!readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
return 0;
}
var weight = 0;
/* Look for a special classname */
if (typeof(e.className) === 'string' && e.className != '')
{
if(e.className.search(readability.regexps.negative) !== -1) {
weight -= 25; }
if(e.className.search(readability.regexps.positive) !== -1) {
weight += 25; }
}
/* Look for a special ID */
if (typeof(e.id) === 'string' && e.id != '')
{
if(e.id.search(readability.regexps.negative) !== -1) {
weight -= 25; }
if(e.id.search(readability.regexps.positive) !== -1) {
weight += 25; }
}
return weight;
},
nodeIsVisible: function (node) {
return (node.offsetWidth !== 0 || node.offsetHeight !== 0) && node.style.display.toLowerCase() !== 'none';
},
/**
* Remove extraneous break tags from a node.
*
* @param Element
* @return void
**/
killBreaks: function (e) {
try {
e.innerHTML = e.innerHTML.replace(readability.regexps.killBreaks,'<br />');
}
catch (eBreaks) {
dbg("KillBreaks failed - this is an IE bug. Ignoring.: " + eBreaks);
}
},
/**
* Clean a node of all elements of type "tag".
* (Unless it's a youtube/vimeo video. People love movies.)
*
* @param Element
* @param string tag to clean
* @return void
**/
clean: function (e, tag) {
var targetList = e.getElementsByTagName( tag );
var isEmbed = (tag == 'object' || tag == 'embed');
for (var y=targetList.length-1; y >= 0; y--) {
/* Allow youtube and vimeo videos through as people usually want to see those. */
if(isEmbed) {
var attributeValues = "";
for (var i=0, il=targetList[y].attributes.length; i < il; i++) {
attributeValues += targetList[y].attributes[i].value + '|';
}
/* First, check the elements attributes to see if any of them contain youtube or vimeo */
if (attributeValues.search(readability.regexps.videos) !== -1) {
continue;
}
/* Then check the elements inside this element for the same. */
if (targetList[y].innerHTML.search(readability.regexps.videos) !== -1) {
continue;
}
}
targetList[y].parentNode.removeChild(targetList[y]);
}
},
/**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
*
* @return void
**/
cleanConditionally: function (e, tag) {
if(!readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
return;
}
var tagsList = e.getElementsByTagName(tag);
var curTagsLength = tagsList.length;
/**
* Gather counts for other typical elements embedded within.
* Traverse backwards so we can remove nodes at the same time without effecting the traversal.
*
* TODO: Consider taking into account original contentScore here.
**/
for (var i=curTagsLength-1; i >= 0; i--) {
var weight = readability.getClassWeight(tagsList[i]);
var contentScore = (typeof tagsList[i].readability != 'undefined') ? tagsList[i].readability.contentScore : 0;
dbg("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")" + ((typeof tagsList[i].readability != 'undefined') ? (" with score " + tagsList[i].readability.contentScore) : ''));
if(weight+contentScore < 0)
{
tagsList[i].parentNode.removeChild(tagsList[i]);
}
else if ( readability.getCharCount(tagsList[i],',') < 10) {
/**
* If there are not very many commas, and the number of
* non-paragraph elements is more than paragraphs or other ominous signs, remove the element.
**/
var p = tagsList[i].getElementsByTagName("p").length;
var img = tagsList[i].getElementsByTagName("img").length;
var li = tagsList[i].getElementsByTagName("li").length-100;
var input = tagsList[i].getElementsByTagName("input").length;
var embedCount = 0;
var embeds = tagsList[i].getElementsByTagName("embed");
for(var ei=0,il=embeds.length; ei < il; ei++) {
if (embeds[ei].src.search(readability.regexps.videos) == -1) {
embedCount++;
}
}
var linkDensity = readability.getLinkDensity(tagsList[i]);
var contentLength = readability.getInnerText(tagsList[i]).length;
var toRemove = false;
if ( img > p ) {
toRemove = true;
} else if(li > p && tag != "ul" && tag != "ol") {
toRemove = true;
} else if( input > Math.floor(p/3) ) {
toRemove = true;
} else if(contentLength < 25 && (img === 0 || img > 2) ) {
toRemove = true;
} else if(weight < 25 && linkDensity > 0.2) {
toRemove = true;
} else if(weight >= 25 && linkDensity > 0.5) {
toRemove = true;
} else if((embedCount == 1 && contentLength < 75) || embedCount > 1) {
toRemove = true;
}
if(toRemove) {
tagsList[i].parentNode.removeChild(tagsList[i]);
}
}
}
},
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
* @param Element
* @return void
**/
cleanHeaders: function (e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex++) {
var headers = e.getElementsByTagName('h' + headerIndex);
for (var i=headers.length-1; i >=0; i--) {
if (readability.getClassWeight(headers[i]) < 0 || readability.getLinkDensity(headers[i]) > 0.33) {
headers[i].parentNode.removeChild(headers[i]);
}
}
}
},
/*** Smooth scrolling logic ***/
/**
* easeInOut animation algorithm - returns an integer that says how far to move at this point in the animation.
* Borrowed from jQuery's easing library.
* @return integer
**/
easeInOut: function(start,end,totalSteps,actualStep) {
var delta = end - start;
if ((actualStep/=totalSteps/2) < 1) { return delta/2*actualStep*actualStep + start; }
return -delta/2 * ((--actualStep)*(actualStep-2) - 1) + start;
},
/**
* Helper function to, in a cross compatible way, get or set the current scroll offset of the document.
* @return mixed integer on get, the result of window.scrollTo on set
**/
scrollTop: function(scroll){
var setScroll = typeof scroll != 'undefined';
if(setScroll) {
return window.scrollTo(0, scroll);
}
if(typeof window.pageYOffset != 'undefined') {
return window.pageYOffset;
}
else if(document.documentElement.clientHeight) {
return document.documentElement.scrollTop;
}
else {
return document.body.scrollTop;
}
},
/**
* scrollTo - Smooth scroll to the point of scrollEnd in the document.
* @return void
**/
curScrollStep: 0,
scrollTo: function (scrollStart, scrollEnd, steps, interval) {
if(
(scrollStart < scrollEnd && readability.scrollTop() < scrollEnd) ||
(scrollStart > scrollEnd && readability.scrollTop() > scrollEnd)
) {
readability.curScrollStep++;
if(readability.curScrollStep > steps) {
return;
}
var oldScrollTop = readability.scrollTop();
readability.scrollTop(readability.easeInOut(scrollStart, scrollEnd, steps, readability.curScrollStep));
// We're at the end of the window.
if(oldScrollTop == readability.scrollTop()) {
return;
}
window.setTimeout(function() {
readability.scrollTo(scrollStart, scrollEnd, steps, interval);
}, interval);
}
},
/**
* Show the email popup.
*
* @return void
**/
emailBox: function () {
var emailContainerExists = document.getElementById('email-container');
if(null !== emailContainerExists)
{
return;
}
var emailContainer = document.createElement("DIV");
emailContainer.setAttribute('id', 'email-container');
emailContainer.innerHTML = '<iframe src="'+readability.emailSrc + '?pageUrl='+escape(window.location)+'&pageTitle='+escape(document.title)+'" scrolling="no" onload="readability.removeFrame()" style="width:500px; height: 490px; border: 0;"></iframe>';
document.body.appendChild(emailContainer);
},
/**
* Close the email popup. This is a hacktackular way to check if we're in a "close loop".
* Since we don't have crossdomain access to the frame, we can only know when it has
* loaded again. If it's loaded over 3 times, we know to close the frame.
*
* @return void
**/
removeFrame: function () {
readability.iframeLoads++;
if (readability.iframeLoads > 3)
{
var emailContainer = document.getElementById('email-container');
if (null !== emailContainer) {
emailContainer.parentNode.removeChild(emailContainer);
}
readability.iframeLoads = 0;
}
},
htmlspecialchars: function (s) {
if (typeof(s) == "string") {
s = s.replace(/&/g, "&amp;");
s = s.replace(/"/g, "&quot;");
s = s.replace(/'/g, "&#039;");
s = s.replace(/</g, "&lt;");
s = s.replace(/>/g, "&gt;");
}
return s;
},
flagIsActive: function(flag) {
return (readability.flags & flag) > 0;
},
addFlag: function(flag) {
readability.flags = readability.flags | flag;
},
removeFlag: function(flag) {
readability.flags = readability.flags & ~flag;
}
};
var http = require('http');
http.createServer(function(req, res) {
res.writeHead(200, {
'Content-Type': 'text/html'
});
var sys = require('sys');
var jq = require('jquery');
var jsdom = require('jsdom').jsdom,
sys = require('sys'),
window = jsdom().createWindow();
window.document.innerHTML = '<html><body><h1>HTML Ipsum Presents</h1> \
<p><strong>Pellentesque habitant morbi tristique</strong> senectus et netus et malesuada fames ac turpis egestas. Vestibulum tortor quam, feugiat vitae, ultricies eget, tempor sit amet, ante. Donec eu libero sit amet quam egestas semper. <em>Aenean ultricies mi vitae est.</em> Mauris placerat eleifend leo. Quisque sit amet est et sapien ullamcorper pharetra. Vestibulum erat wisi, condimentum sed, <code>commodo vitae</code>, ornare sit amet, wisi. Aenean fermentum, elit eget tincidunt condimentum, eros ipsum rutrum orci, sagittis tempus lacus enim ac dui. <a href="#">Donec non enim</a> in turpis pulvinar facilisis. Ut felis.</p> \
<h2>Header Level 2</h2> \
<ol> \
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li> \
<li>Aliquam tincidunt mauris eu risus.</li> \
</ol> \
<blockquote><p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus magna. Cras in mi at felis aliquet congue. Ut a est eget ligula molestie gravida. Curabitur massa. Donec eleifend, libero at sagittis mollis, tellus est malesuada tellus, at luctus turpis elit sit amet quam. Vivamus pretium ornare est.</p></blockquote> \
<h3>Header Level 3</h3> \
<ul> \
<li>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</li> \
<li>Aliquam tincidunt mauris eu risus.</li> \
</ul> \
<pre><code> \
#header h1 a { \
display: block; \
width: 300px; \
height: 80px; \
} \
</code></pre></body></html>';
document = window.document;
var content = readability.grabArticle();
res.end('<html><body>' + content.innerHTML + '</body></html>');
}).listen(8124, "127.0.0.1");
console.log('Server running at http://127.0.0.1:8124/');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment