Skip to content

Instantly share code, notes, and snippets.

@Dither
Created August 8, 2016 10:25
Show Gist options
  • Save Dither/94a1dc6d9120970c6398816b575980c3 to your computer and use it in GitHub Desktop.
Save Dither/94a1dc6d9120970c6398816b575980c3 to your computer and use it in GitHub Desktop.
Readability research version
// ==UserScript==
// @name Readability
// @include http://*
// @include https://*
// ==/UserScript==
/*jshint curly: false, es5: true, strict: false, loopfunc: true, scripturl: true, browser: true, devel: true, nonstandard: true*/
/*
* Copyright (c) 2010 Arc90 Inc
*
* Licensed under the Apache License, Version 2.0 (the 'License');
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an 'AS IS' BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * CssSelectorGenerator: builds CSS selectors that identify a DOM element,
 * combining id, class, tag, attribute and :nth-child fragments.
 *
 * The constructor is published as `root.CssSelectorGenerator`, where `root`
 * is the top-level `this` (the global object in a classic script). When `this`
 * is undefined (strict function scope, ES module) the global object is used
 * instead so loading the file does not throw.
 */
(function(root) {
  // Native Array#indexOf when available, otherwise a minimal fallback.
  var indexOf = [].indexOf || function(item) {
    for (var i = 0, l = this.length; i < l; i++) {
      if (i in this && this[i] === item) return i;
    }
    return -1;
  };

  /**
   * Climbs from `element` to the node that selectors are tested against:
   * the first <html>/<body> ancestor (or `element` itself if it is one), or
   * the topmost ancestor when the node is not attached to a document.
   */
  function findRootNode(element) {
    var root_node = element;
    while (root_node.parentNode) {
      if (/^(html|body)$/i.test(root_node.localName)) break;
      root_node = root_node.parentNode;
    }
    return root_node;
  }

  /**
   * Serializes an attribute value as a double-quoted CSS string: quotes and
   * backslashes are backslash-escaped, line breaks become hex escapes.
   */
  function quoteAttributeValue(value) {
    var escaped = String(value)
      .replace(/["\\]/g, '\\$&')
      .replace(/[\n\r\f]/g, function(c) {
        return '\\' + c.charCodeAt(0).toString(16).toUpperCase() + ' ';
      });
    return '"' + escaped + '"';
  }

  /**
   * @param {Object} [options] Overrides for `default_options`; unknown keys
   *                           are ignored.
   */
  function CssSelectorGenerator(options) {
    if (options == null) { options = {}; }
    this.options = {};
    this.setOptions(this.default_options);
    this.setOptions(options);
  }

  // Selector kinds to try, in priority order.
  CssSelectorGenerator.prototype.default_options = {
    selectors: ['id', 'class', 'tag', 'nthchild']
  };

  /**
   * Copies every key of `options` that is also a key of `default_options`
   * onto `this.options`.
   *
   * @return {Array} One entry per key of `options`: the stored value, or
   *                 undefined for keys that were ignored.
   */
  CssSelectorGenerator.prototype.setOptions = function(options) {
    var results = [], val;
    if (options == null) { options = {}; }
    for (var key in options) {
      val = options[key];
      if (this.default_options.hasOwnProperty(key)) {
        results.push(this.options[key] = val);
      } else {
        results.push(void 0);
      }
    }
    return results;
  };

  /** @return {Boolean} true when `element` is a node with nodeType 1. */
  CssSelectorGenerator.prototype.isElement = function(element) {
    return (element != null ? element.nodeType : void 0) === 1;
  };

  /**
   * @return {Array} `element` followed by its element ancestors, nearest
   *                 first; empty when `element` is not an element.
   */
  CssSelectorGenerator.prototype.getParents = function(element) {
    var result = [], current_element = element;
    while (this.isElement(current_element)) {
      result.push(current_element);
      current_element = current_element.parentNode;
    }
    return result;
  };

  /** @return {String} The sanitized, lower-cased tag name. */
  CssSelectorGenerator.prototype.getTagSelector = function(element) {
    return this.sanitizeItem(element.tagName.toLowerCase());
  };

  /**
   * Escapes a raw id/class/tag/attribute name so it can be used as a CSS
   * identifier.
   *
   * - ':' and control characters become hex escapes terminated by a space,
   *   so a following hex digit cannot be absorbed into the escape;
   * - a leading digit (or a digit right after a leading '-') is hex-escaped,
   *   because identifiers may not start that way;
   * - other ASCII punctuation is backslash-escaped;
   * - letters, digits, '_', '-' and non-ASCII characters are kept verbatim.
   *
   * @param {String} item
   * @return {String}
   */
  CssSelectorGenerator.prototype.sanitizeItem = function(item) {
    var chars = item.split('');
    var first = chars[0];
    return chars.map(function(character, index) {
      var code = character.charCodeAt(0);
      var isDigit = code >= 0x30 && code <= 0x39;
      // NUL cannot be represented in CSS; use the replacement character.
      if (code === 0) return '\uFFFD';
      if (character === ':' || code < 0x20 || code === 0x7F ||
          (isDigit && (index === 0 || (index === 1 && first === '-')))) {
        return '\\' + code.toString(16).toUpperCase() + ' ';
      }
      // A lone '-' is not a valid identifier.
      if (character === '-' && chars.length === 1) return '\\-';
      if (code >= 0x80 || /[\w-]/.test(character)) return character;
      return '\\' + character;
    }).join('');
  };

  /**
   * @return {String|null} '#id' when the element has a non-empty id without
   *   whitespace or a leading digit and exactly one node under the root node
   *   matches it; null otherwise.
   */
  CssSelectorGenerator.prototype.getIdSelector = function(element) {
    var id = element.getAttribute('id'), sanitized_id;
    if (id != null && id !== '' && !/\s/.test(id) && !/^\d/.test(id)) {
      sanitized_id = '#' + this.sanitizeItem(id);
      // Searching from the computed root works even for detached subtrees.
      if (findRootNode(element).querySelectorAll(sanitized_id).length === 1) {
        return sanitized_id;
      }
    }
    return null;
  };

  /** @return {Array} One '.class' selector per class name of the element. */
  CssSelectorGenerator.prototype.getClassSelectors = function(element) {
    var class_string = element.getAttribute('class');
    if (typeof class_string !== 'string') return [];
    class_string = class_string.replace(/\s+/g, ' ').replace(/^\s|\s$/g, '');
    if (class_string === '') return [];
    return class_string.split(/\s+/).map(function(item) {
      return '.' + this.sanitizeItem(item);
    }, this);
  };

  /**
   * @return {Array} One '[name="value"]' selector per attribute other than
   *                 id and class; names are escaped, values quoted.
   */
  CssSelectorGenerator.prototype.getAttributeSelectors = function(element) {
    var blacklist = ['id', 'class'],
        attributes = element.attributes,
        result = [];
    for (var attribute, k = 0, len = attributes.length; k < len; k++) {
      attribute = attributes[k];
      if (indexOf.call(blacklist, attribute.nodeName) < 0) {
        result.push('[' + this.sanitizeItem(attribute.nodeName) + '=' +
                    quoteAttributeValue(attribute.nodeValue) + ']');
      }
    }
    return result;
  };

  /**
   * @return {String|null} ':nth-child(n)' with the 1-based position of the
   *   element among its parent's element children; null without a parent.
   */
  CssSelectorGenerator.prototype.getNthChildSelector = function(element) {
    var parent_element = element.parentNode;
    if (parent_element) {
      var counter = 0, siblings = parent_element.childNodes;
      for (var sibling, k = 0, len = siblings.length; k < len; k++) {
        sibling = siblings[k];
        if (this.isElement(sibling)) {
          counter++;
          if (sibling === element) {
            return ':nth-child(' + counter + ')';
          }
        }
      }
    }
    return null;
  };

  /**
   * @return {Boolean} true when `selector` is a non-empty string that matches
   *                   exactly `element` below `root_node`.
   */
  CssSelectorGenerator.prototype.testSelector = function(root_node, element, selector) {
    if (typeof selector === 'string' && selector.length) {
      var result = root_node.querySelectorAll(selector);
      if (result.length === 1 && result[0] === element) return true;
    }
    return false;
  };

  /**
   * Collects the selector fragments enabled in `options.selectors`.
   *
   * @return {Object} { t: tag, i: id, c: classes, a: attributes, n: nth-child };
   *                  fragments of disabled kinds stay null.
   */
  CssSelectorGenerator.prototype.getAllSelectors = function(element) {
    var enabled = this.options.selectors;
    var result = { t: null, i: null, c: null, a: null, n: null };
    if (indexOf.call(enabled, 'tag') >= 0) {
      result.t = this.getTagSelector(element);
    }
    if (indexOf.call(enabled, 'id') >= 0) {
      result.i = this.getIdSelector(element);
    }
    if (indexOf.call(enabled, 'class') >= 0) {
      result.c = this.getClassSelectors(element);
    }
    if (indexOf.call(enabled, 'attribute') >= 0) {
      result.a = this.getAttributeSelectors(element);
    }
    if (indexOf.call(enabled, 'nthchild') >= 0) {
      result.n = this.getNthChildSelector(element);
    }
    return result;
  };

  /**
   * @return {Boolean} true when `selector`, queried from the element's parent,
   *                   matches exactly `element`; false without a parent.
   */
  CssSelectorGenerator.prototype.testUniqueness = function(element, selector) {
    var parent = element.parentNode;
    if (!parent) return false;
    var found_elements = parent.querySelectorAll(selector);
    return found_elements.length === 1 && found_elements[0] === element;
  };

  /**
   * Looks for a selector that is unique among the element's siblings' subtree.
   * First every combination of `items` is tried; when `tag` is given, each
   * single item prefixed with the tag is tried afterwards.
   *
   * @param {Element} element
   * @param {Array|String} items Selector fragment(s) of one kind.
   * @param {String} [tag] Tag selector to prefix.
   * @return {String|null} The first unique selector found, or null.
   */
  CssSelectorGenerator.prototype.testCombinations = function(element, items, tag) {
    var candidates = this.getCombinations(items), k, len;
    for (k = 0, len = candidates.length; k < len; k++) {
      if (this.testUniqueness(element, candidates[k])) return candidates[k];
    }
    if (tag != null) {
      var list = items == null ? [] : (typeof items === 'string' ? [items] : items);
      var tagged = list.map(function(item) { return tag + item; });
      for (k = 0, len = tagged.length; k < len; k++) {
        if (this.testUniqueness(element, tagged[k])) return tagged[k];
      }
    }
    return null;
  };

  /**
   * Picks a selector identifying `element` relative to its parent, trying the
   * selector kinds in the order given by `options.selectors`.
   *
   * @param {Element} element
   * @param {*} [named] Unused; kept for interface compatibility.
   * @return {String} The selector, or '*' when nothing unique was found.
   */
  CssSelectorGenerator.prototype.getUniqueSelector = function(element, named) {
    var found_selector, selector_type,
        selectors = this.getAllSelectors(element),
        order = this.options.selectors;
    for (var k = 0, len = order.length; k < len; k++) {
      selector_type = order[k];
      switch (selector_type) {
        case 'id':
          if (selectors.i != null) {
            if (this.testUniqueness(element, selectors.i)) return selectors.i;
            if (found_selector = this.testCombinations(element, selectors.i, selectors.t)) return found_selector;
          }
          break;
        case 'class':
          if (selectors.c != null && selectors.c.length !== 0) {
            if (found_selector = this.testCombinations(element, selectors.c, selectors.t)) return found_selector;
          }
          break;
        case 'attribute':
          if (selectors.a != null && selectors.a.length !== 0) {
            if (found_selector = this.testCombinations(element, selectors.a, selectors.t)) return found_selector;
          }
          break;
        case 'tag':
          if (selectors.t != null) {
            if (this.testUniqueness(element, selectors.t)) return selectors.t;
          }
          break;
        case 'nthchild':
          if (selectors.n != null) {
            if (this.testUniqueness(element, selectors.n)) return selectors.n;
            if (found_selector = this.testCombinations(element, selectors.n, selectors.t)) return found_selector;
          }
          break;
      }
    }
    return '*';
  };

  /**
   * Builds a child-combinator path ('a>b>c') for `element`, adding one
   * ancestor at a time until the path matches only `element` below the root.
   *
   * @return {String|null} The path, or null when no unique path was found.
   */
  CssSelectorGenerator.prototype.getSelector = function(element) {
    var parents = this.getParents(element), all_selectors = [], selector, k, len;
    for (k = 0, len = parents.length; k < len; k++) {
      selector = this.getUniqueSelector(parents[k]);
      if (selector != null) {
        all_selectors.push(selector);
      }
    }
    var root_node = findRootNode(element), selectors = [], result;
    for (k = 0, len = all_selectors.length; k < len; k++) {
      selectors.unshift(all_selectors[k]);
      result = selectors.join('>');
      if (this.testSelector(root_node, element, result)) {
        return result;
      }
    }
    return null;
  };

  /**
   * Returns every non-empty combination of `items` (order preserved), each
   * joined into one compound selector, shortest combinations first.
   * NOTE: the number of combinations is 2^n - 1 for n items.
   *
   * @param {Array|String} [items] Fragments; a single string yields [items].
   * @return {Array} Array of selector strings; empty for no items.
   */
  CssSelectorGenerator.prototype.getCombinations = function(items) {
    if (items == null) items = [];
    if (typeof items === 'string') return [items];
    var result = [[]], i, j, len;
    for (i = 0; i < items.length; i++) {
      // Extend every combination known so far with the current item.
      for (j = 0, len = result.length; j < len; j++) {
        result.push(result[j].concat(items[i]));
      }
    }
    result.shift(); // drop the empty combination
    result.sort(function(a, b) { return a.length - b.length; });
    return result.map(function(item) { return item.join(''); });
  };

  root.CssSelectorGenerator = CssSelectorGenerator;
})(this || (typeof globalThis !== 'undefined' ? globalThis : window));
/*
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*
* Modified by DitherSky for research purposes.
*/
/*
For testing: new Readability().init(uri, document).parse(function(result) {
document.body.innerHTML = result.content;
});
*/
/**
 * Public constructor.
 *
 * @param {Object} [options] The options object. Recognised keys:
 *   - debug {Boolean}: enable `this.log` output (default false);
 *   - maxElemsToParse, nbTopCandidates, maxPages {Number}: a falsy value
 *     selects the corresponding DEFAULT_* constant;
 *   - excludeExternal, excludeAImg, replaceImgs {Boolean}: an omitted
 *     (undefined/null) value selects the prototype default; an explicit
 *     `false` is honoured.
 *
 * Requires `window.CssSelectorGenerator` to be defined.
 */
function Readability(options) {
  options = options || {};

  // `value || default` cannot express "false" when the default is true, so
  // boolean options fall back only when they were not supplied at all.
  function optionOr(value, fallback) {
    return value == null ? fallback : value;
  }

  this._version = '1.7.2-no-multi-page';
  this._uri = null;
  this._doc = null;
  this._biggestFrame = false;
  this._articleAuthor = null;
  this._articleDir = null;
  this._requestedUrls = [];

  // Configurable options
  this._debug = !!options.debug;
  this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
  this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
  this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
  this._excludeExternal = optionOr(options.excludeExternal, this.DEFAULT_EXCLUDE_EXTERNAL);
  this._excludeAImg = optionOr(options.excludeAImg, this.DEFAULT_EXCLUDE_A_IMG);
  this._replaceImgs = optionOr(options.replaceImgs, this.REPLACE_IMAGES_TO_FULL);

  // Start with all flags set
  this._flags = this.FLAG_STRIP_UNLIKELYS |
                this.FLAG_WEIGHT_ATTRIBUTES |
                this.FLAG_CLEAN_CONDITIONALLY |
                this.FLAG_DISABLE_PREFILTER |
                this.FLAG_DISABLE_POSTFILTER;

  // The list of pages we've parsed in this call of readability,
  // for autopaging. As a key store for easier searching.
  this._parsedPages = {};
  // Key store of page URLs.
  // NOTE(review): the original comment described ETag headers; confirm usage.
  this._pageURLs = {};
  // Number of the page currently being processed (1-based).
  this._curPageNum = 1;

  // Selector generator used to print DOM nodes as CSS paths in log output.
  this._selGen = new window.CssSelectorGenerator();
  this._pathTo = this._selGen.getSelector.bind(this._selGen);

  // Control whether log messages are sent to the console
  if (this._debug) {
    /**
     * Writes its arguments, space-separated and prefixed with
     * '[Readability] ', to `dump` when that function exists, otherwise to
     * `console.log`. Arguments with a `nodeName` are printed as CSS paths.
     */
    this.log = function() {
      var sink;
      if (typeof dump !== 'undefined') {
        sink = function(text) { dump(text); };
      } else if (typeof console !== 'undefined') {
        sink = function(text) { console.log(text); };
      } else {
        return; // nowhere to log to
      }
      var self = this;
      var msg = Array.prototype.map.call(arguments, function(x) {
        return (x && x.nodeName) ? self._pathTo(x) : x;
      }).join(' ');
      sink('[Readability] ' + msg + '\n');
    };
  } else {
    this.log = function() {};
  }
}
Readability.prototype = {
// Bit flags. Each has a distinct single bit so they can be OR-ed together;
// the constructor combines all five into `this._flags`.
FLAG_STRIP_UNLIKELYS: 0x1,
FLAG_WEIGHT_ATTRIBUTES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
FLAG_DISABLE_PREFILTER: 0x8,
FLAG_DISABLE_POSTFILTER: 0x10,
// Scoring and threshold constants.
// NOTE(review): most of these are consumed outside the part of the file
// reviewed here; their meaning is presumed from their names — confirm
// against the scoring code.
SCORE_CHARS_IN_PARAGRAPH: 100,
SCORE_WORDS_IN_PARAGRAPH: 20,
GRANDPARENT_SCORE_DIVISOR: 2,
GRANDPARENT_UP_SCORE_DIVISOR: 3,
MIN_PARAGRAPH_LENGTH: 20,
MIN_COMMAS_IN_PARAGRAPH: 6,
MIN_ARTICLE_LENGTH: 200,
// Compared against the length of a lone <h2>/<h3> in _prepArticle.
MIN_NODE_LENGTH: 80,
MAX_LINK_DENSITY: 0.25,
// _getWordCount compares each token's length against this with `>`.
MIN_WORD_LENGTH: 2,
MAX_ANCESTORS: 4,
// Default for the `replaceImgs` option:
// replace image src to full URL from parent's link
REPLACE_IMAGES_TO_FULL: true,
// Default for the `excludeExternal` option:
// ignore external links when checking for link weights
DEFAULT_EXCLUDE_EXTERNAL: true,
// Default for the `excludeAImg` option:
// ignore a>img type of nodes when considering link density
DEFAULT_EXCLUDE_A_IMG: true,
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
// The number of top candidates to consider when analyzing how
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
// The maximum number of pages to loop through before we call
// it quits and just show a link.
DEFAULT_MAX_PAGES: 10,
// Element tags to score by default (upper-case, compared against tagName;
// _grabArticle collects matching nodes into its elementsToScore list).
DEFAULT_TAGS_TO_SCORE: 'IMG,SECTION,P,TD,PRE,CODE,H2,H3,H4,H5,H6'.split(','),
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
// Most are tested against a node's "className + ' ' + id" string.
// NOTE: `normalize` and `flatten` carry the /g flag, so calling .test()/.exec()
// on them is stateful (lastIndex); use them with String#replace only.
REGEXPS: {
safe: /hentry|(?:instapaper|article).body|markdown|\bfulltext/i,
// Class/id fragments marking a node as an unlikely article candidate
// (checked in _grabArticle together with `extraneous`).
unlikelyCandidates: /auth?or|similar|ignore|\binfo|annoy|clock|\bdate|\btime|footer|com(?:bx|ment|munity)|banner|intro|log.{2}n|edcolinks|hidd?e|about|bookmark|\bcat|search|social|robot|published|mast(?:head)|subscri|category|disqus|extra|head(?:er|note)|floor|agegate|menu|function|remark|rss|tool|header|teaserlist|widget|meta|adsense|inner-?ad|ad-|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twit|like/i,
// Overrides `unlikelyCandidates`/`extraneous`: a match keeps the node.
okMaybeItsACandidate: /and|out(?:er|side)|wrap|post|article\b|body|entry|\bmain|page|contain|\bcontent|column|general|detail|shadow|lightbox|blog/i,
positive: /read|full|article|source|content|body|\bcontent|contain|\bentry|main|page|attach|post|text|blog|story/i,
negative: /pag(?:er|ination)|\bdate|\btime|nav|tag|extra|keyword|foot(?:note)?|^hid$|hid$|\bhid\b|^hid|all|bottom|stat|info|modal|outbrain|masthead|com-|contact|_nav|link|media|\bout|skyscraper|promo|\bad-|related|scroll|shoutbox|sponsor|shopping|teaser/i,
extraneous: /\bprint|archive|comment|discuss|e?[\-]?mail|share|reply|sign|single|utility/i,
// Used by _checkAuthorLine to recognise byline-like class/id strings.
authorline: /byline|author|dateline|writtenby/i,
// Matches inline style text that hides an element.
styleFilter: /display\s*:\s*none|visibility\s*:\s*hidden/i,
normalize: /\s{2,}/g,
flatten: /(?:[\r\n](?:\s|&nbsp;)*)+/g,
videos: /\/\/(?:[^.?\/]+\.)?(?:youtu(?:be)?|soundcloud|vimeo|imgur|gfycat|dailymotion|cliphunter|twitch|vid|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|me|be|org|net|tv|ru)/i,
nextLink: /(next|newer|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
pages: /pag(?:e|ing|inat)/i,
pageNumber: /p[ag]{0,2}(?:e|ing|ination)?[=\/]\d{1,2}/i,
// Matches strings that are empty or whitespace only (used by _nextElement).
whitespace: /^\s*$/,
// Matches strings whose last character is not whitespace.
hasContent: /\S$/,
// Matches URLs ending in a known image file extension.
imgExt: /\.(?:gif|svg|jpe?g|a?png|webp)$/i
},
DIV_TO_P_ELEMS: ['A', 'BLOCKQUOTE', 'DL', 'DIV', 'IMG', 'OL', 'P', 'PRE', 'TABLE', 'UL', 'SELECT', 'CODE', 'FOOTER', 'ASIDE'],
ALTER_TO_DIV_EXCEPTIONS: ['DIV', 'ARTICLE', 'SECTION', 'P'],
NODES_TO_IGNORE: ['HTML', 'HEAD', 'BODY', 'ARTICLE'],
ATTRIBUTE_WHITELIST: ['lang', 'src', 'href', 'type', 'alt', 'title', 'data', 'height', 'width', 'name', 'value', 'type', 'border', 'frameborder', 'colspan', 'rowspan', 'span', 'cite'],
MEDIA_NODES: ['object', 'embed', 'iframe', 'audio', 'video'],
NODES_TO_CLEAN: ['iframe', 'audio', 'video', 'object', 'embed', 'applet', 'h1', 'footer', 'input', 'button', 'nav', 'canvas', 'time'],
// Raw HTML filters, applied in order by _replaceAll (via _preProcessContent).
// Each entry is { r: pattern, s: replacement } as used by String#replace.
_preFilters: [
{ r: /<html[^>]+>/gi, s: '<html>' }, // HTML5 namespaced
{ r: /^\s+|\s+$/g, s: '' }, // trim()
{ r: /[\r\n]+(?=\n{2})/g, s: '' },
{ r: /(?:<br\/>(?:\s|&nbsp;?)*)+(?=<\/?p)/gi, s: '' }, // replace excessive br's
{ r: /(?:\s|&nbsp;?)+(?=<br\/>)/g, s: '' }, // remove spaces in front of <br>s
{ r: /(?:<br\/>){2,}/gi, s: '</p><p>' }, // all double+ <br>s into <p>s
// Newlines are swapped for \uffff so that `.` in the script pattern below
// can span what used to be several lines; they are restored afterwards.
{ r: /\n/g, s: '\uffff' }, // filter scripts...
{ r: /<script.*?>.*?<\/script>/gi, s: '' },
// NOTE(review): the next two entries repeat the two above; possibly a
// deliberate second pass, possibly a copy/paste leftover — confirm.
{ r: /\n/g, s: '\uffff' },
{ r: /<script.*?>.*?<\/script>/gi, s: '' },
{ r: /\uffff/g, s: '\n' }, // ...filter scripts
/* { r: /<(\/?)noscript/gi, s: '<$1div' }, // expand noscript*/
{ r: /<(\/?)font[^>]*>/gi, s: '<$1span>' }, // fonts to spans
{ r: /<\/?span[^>]*>/gi, s: '' } // remove spans as we redefine styles and they're probably special-styled
],
// Output HTML filters, applied in order by _replaceAll (via
// _postProcessContent). Same { r: pattern, s: replacement } shape as
// _preFilters.
_postFilters: [
// Removes self-closed and empty (whitespace-only) a/div/span/p/i/strong tags.
{ r: /<(?:a|div|span|p|i|strong)[^>]+\/>|<(?:a|div|span|p|i|strong)[^>]*>\s*<\/(?:a|div|span|p|i|strong)[^>]*>/gi, s: '' } // replace empty tags that break layouts
],
/**
* Replace innerHTML of a node based on array of custom regexp filters.
*
* @param Element
* @param Array
* @return void
**/
_replaceAll: function(content, filters) {
if (!content || !content.innerHTML) return;
for (var i = 0, l = filters.length; i < l; i++) {
content.innerHTML = String.prototype.replace.apply(content.innerHTML, [filters[i].r, filters[i].s]);
}
},
/**
* Run any post-process modifications to article content if necessary.
*
* @param Element
* @return void
**/
_postProcessContent: function(articleContent) {
if (!articleContent) return;
this._replaceAll(articleContent, this._postFilters);
if (this._debug) return;
this._filterAttributes(articleContent);
},
/**
* Run any pre-process modifications to article content if necessary.
*
* @param Element
* @return void
**/
_preProcessContent: function(articleContent) {
if (!articleContent || !articleContent.documentElement) return;
this._replaceAll(articleContent, this._preFilters);
this._fixRelativeUris(articleContent);
},
/**
* Iterates over a NodeList, calls `filterFn` for each node and removes node
* if function returned `true`.
*
* If function is not passed, removes all the nodes in node list.
*
* @param NodeList nodeList The no
* @param Function filterFn
* @return void
*/
_removeNodes: function(nodeList, filterFn) {
for (var parentNode, node, i = nodeList.length; i--;) {
node = nodeList[i];
parentNode = node.parentNode;
if (parentNode && (!filterFn || filterFn.call(this, node, i, nodeList)))
parentNode.removeChild(node);
}
},
/**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
*
* For convenience, the current object context is applied to the provided
* iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @param Boolean backward Whether to use backward iteration.
* @return void
*/
_forEachNode: function(nodeList, fn, backward) {
var i, l;
if (backward) {
for (i = nodeList.length; i--;)
fn(nodeList[i], i, this);
} else {
for (i = 0, l = nodeList.length; i < l; i++)
fn(nodeList[i], i, this);
}
},
/**
* Iterate over a NodeList, return true if any of the provided iterate
* function calls returns true, false otherwise.
*
* For convenience, the current object context is applied to the
* provided iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
* @return Boolean
*/
_someNode: function(nodeList, fn) {
return Array.prototype.some.call(nodeList, fn, this);
},
_getAllNodesWithTag: function(node, tagNames) {
if (node.querySelectorAll) {
return node.querySelectorAll(tagNames.join(','));
}
return [].concat.apply([], tagNames.map(function(tag) {
var collection = node.getElementsByTagName(tag);
return Array.isArray(collection) ? collection : Array.from(collection);
}));
},
/**
* Converts each <a> and <img> uri in the given element to an absolute URI,
* ignoring #ref URIs.
*
* @param Element
* @return void
*/
_fixRelativeUris: function(articleContent) {
var scheme = this._uri.scheme;
var prePath = this._uri.prePath;
var pathBase = this._uri.pathBase;
var toAbsoluteURI = function(uri) {
// If this is already an absolute URI, return it.
if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
return uri;
// Scheme-rooted relative URI.
if (uri.substr(0, 2) == '//')
return scheme + '://' + uri.substr(2);
// Prepath-rooted relative URI.
if (uri[0] == '/')
return prePath + uri;
// Dotslash relative URI.
if (uri.indexOf('./') === 0)
return pathBase + uri.slice(2);
// Ignore hash URIs:
if (uri[0] == '#')
return uri;
// Standard relative URI; add entire path. pathBase already includes a
// trailing '/'.
return pathBase + uri;
};
var fixHref = function(link) {
var href = link.getAttribute('href');
if (href) {
// Replace links with javascript: URIs with text content, since
// they won't work after scripts have been removed from the page.
if (/javascript\s*:/i.test(href)) {
link.parentNode.replaceChild(this._doc.createTextNode(link.textContent), link);
} else {
link.setAttribute('href', toAbsoluteURI(href));
}
}
};
var links = this._getAllNodesWithTag(articleContent, ['a', 'link']);
this._forEachNode(links, fixHref.bind(this), true);
var media = this._getAllNodesWithTag(articleContent, ['img', 'source']);
this._forEachNode(media, function(node) {
var src = node.getAttribute('src');
if (src)
node.setAttribute('src', toAbsoluteURI(src));
}, true);
},
/**
* Get the article title as an H1.
*
* @return void
**/
_getArticleTitle: function() {
var doc = this._doc;
var curTitle = '';
var origTitle = '';
try {
curTitle = origTitle = doc.title;
// If they had an element with id/class 'title' in their HTML
if (typeof curTitle !== 'string')
curTitle = origTitle =
this._getInnerText(this._getAllNodesWithTag(doc, ['title', '[id*="title"]', '[class*="title"]']));
} catch (e) {}
if (curTitle.match(/ [\|\-|\xbb] /)) {
curTitle = origTitle.replace(/(.*)[\|\-|\xbb] .*/gi, '$1');
if (curTitle.split(' ').length < 3)
curTitle = origTitle.replace(/[^\|\-|\xbb]*[\|\-|\xbb](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
var match = this._someNode(this._getAllNodesWithTag(doc, ['h1', 'h2', 'h3']), function(heading) {
return heading.textContent === curTitle;
});
// If we don't, let's extract the title out of the original title string.
if (!match) {
curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
// If the title is now too short, try the first colon instead:
if (curTitle.split(' ').length < 3)
curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
}
} else if (curTitle.length > 150 || curTitle.length < 15) {
var hOnes = doc.getElementsByTagName('h1');
if (hOnes.length === 1)
curTitle = this._getInnerText(hOnes[0]);
}
curTitle = curTitle.trim();
if (curTitle.split(' ').length <= 5)
curTitle = origTitle;
return curTitle;
},
/**
* Prepare the HTML document for readability to scrape it.
* This includes things like stripping javascript, CSS, and handling terrible markup.
*
* @return void
**/
_prepDocument: function() {
var doc = this._doc;
// Remove all style tags in head
this._removeNodes(this._getAllNodesWithTag(doc, ['style', 'link']));
if (doc.body)
this._replaceBrs(doc.body);
},
/**
* Finds the next element, starting from the given node, and ignoring
* whitespace in between. If the given node is an element, the same node is
* returned.
*/
_nextElement: function(node) {
var next = node;
while (next && (next.nodeType != Node.ELEMENT_NODE) && this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
return next;
},
/**
* Replaces 2 or more successive <br> elements with a single <p>.
* Whitespace between <br> elements are ignored. For example:
* <div>foo<br>bar<br> <br><br>abc</div>
* will become:
* <div>foo<br>bar<p>abc</p></div>
*/
_replaceBrs: function(elem) {
var replBr = function(br) {
var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a
// <p> block.
var replaced = false;
// If we find a <br> chain, remove the <br>s until we hit another element
// or non-whitespace. This leaves behind the first <br> in the chain
// (which will be replaced with a <p> later).
while ((next = this._nextElement(next)) && (next.tagName == 'BR')) {
replaced = true;
var brSibling = next.nextSibling;
next.parentNode.removeChild(next);
next = brSibling;
}
// If we removed a <br> chain, replace the remaining <br> with a <p>. Add
// all sibling nodes as children of the <p> until we hit another <br>
// chain.
if (replaced) {
var p = this._doc.createElement('p');
br.parentNode.replaceChild(p, br);
next = p.nextSibling;
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == 'BR') {
var nextElem = this._nextElement(next);
if (nextElem && nextElem.tagName == 'BR')
break;
}
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
}
};
this._forEachNode(this._getAllNodesWithTag(elem, ['br']), replBr.bind(this));
},
_setNodeTag: function(node, tag) {
//this.log('_setNodeTag', node, tag);
if (node.__JSDOMParser__) {
node.localName = tag.toLowerCase();
node.tagName = tag.toUpperCase();
return node;
}
var replacement = node.ownerDocument.createElement(tag);
while (node.firstChild)
replacement.appendChild(node.firstChild);
node.parentNode.replaceChild(replacement, node);
if (node.readability)
replacement.readability = node.readability;
for (var i = 0; i < node.attributes.length; i++)
replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);
return replacement;
},
/**
* Prepare the article node for display. Clean out any inline styles,
* iframes, forms, strip extraneous <p> tags, etc.
*
* @param Element
* @return void
**/
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);
// Clean out junk from the article content
this._cleanConditionally(articleContent, ['form']);
this._clean(articleContent, this.NODES_TO_CLEAN);
// If there is only one h2, they are probably using it as a header
// and not a sub-header, so remove it since we already have a header.
var h2 = articleContent.getElementsByTagName('h2');
if (h2.length === 1 && h2[0].length < this.MIN_NODE_LENGTH)
this._clean(articleContent, 'h2');
var h3 = articleContent.getElementsByTagName('h3');
if (h3.length === 1 && h3[0].length < this.MIN_NODE_LENGTH)
this._clean(articleContent, 'h3');
this._cleanHeaders(articleContent);
// Do these last as the previous stuff may have removed junk
// that will affect these
this._cleanConditionally(articleContent, ['table', 'ul', 'div']);
// Remove extra paragraphs
this._removeNodes(articleContent.getElementsByTagName('p'), function(paragraph) {
var imgCount = paragraph.getElementsByTagName('img').length;
var embedCount = paragraph.getElementsByTagName('embed').length;
var objectCount = paragraph.getElementsByTagName('object').length;
var videoCount = paragraph.getElementsByTagName('video').length;
var audioCount = paragraph.getElementsByTagName('audio').length;
var iframeCount = paragraph.getElementsByTagName('iframe').length;
var totalCount = imgCount + embedCount + objectCount + videoCount + audioCount + iframeCount;
return totalCount === 0 && !this._getInnerText(paragraph, false, true);
});
var self = this;
this._forEachNode(this._getAllNodesWithTag(articleContent, ['br']), function(br) {
var next = self._nextElement(br.nextSibling);
if (next && next.tagName == 'P')
br.parentNode.removeChild(br);
});
},
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*
* @param Element
* @return void
**/
_initializeNode: function(node) {
node.readability = { 'contentScore': 0 };
switch (node.tagName) {
case 'ARTICLE':
node.readability.contentScore += 50;
break;
case 'DIV':
node.readability.contentScore += 5;
break;
case 'PRE':
case 'CODE':
case 'TD':
case 'BLOCKQUOTE':
case 'FIGURE':
node.readability.contentScore += 3;
break;
//case 'SECTION': // often misused
case 'ADDRESS':
case 'OL':
case 'UL':
case 'DL':
node.readability.contentScore -= 2 * Math.ceil(this._getLinkDensity(node));
break;
//case 'ASIDE':
case 'FOOTER':
case 'HEADER':
case 'ADDRESS':
case 'FORM':
case 'BUTTON':
case 'TEXTAREA':
case 'INPUT':
case 'NAV':
node.readability.contentScore -= 3;
break;
case 'H1':
case 'H2':
case 'H3':
case 'H4':
case 'H5':
case 'H6':
case 'TH':
case 'HGROUP':
node.readability.contentScore -= 8;
}
node.readability.contentScore += this._getAttributesWeight(node);
},
_removeAndGetNext: function(node) {
var nextNode = this._getNextNode(node, true);
node.parentNode.removeChild(node);
return nextNode;
},
/**
* Traverse the DOM from node to node, starting at the node passed in.
* Pass true for the second parameter to indicate this node itself
* (and its kids) are going away, and we want the next node over.
*
* Calling this in a loop will traverse the DOM depth-first.
*/
_getNextNode: function(node, ignoreSelfAndKids) {
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.firstElementChild)
return node.firstElementChild;
// Then for siblings...
if (node.nextElementSibling)
return node.nextElementSibling;
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
} while (node && !node.nextElementSibling);
return node && node.nextElementSibling;
},
/**
* Like _getNextNode, but for DOM implementations with no
* firstElementChild/nextElementSibling functionality...
*/
_getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
function nextSiblingEl(n) {
do {
n = n.nextSibling;
} while (n && n.nodeType !== n.ELEMENT_NODE);
return n;
}
// First check for kids if those aren't being ignored
if (!ignoreSelfAndKids && node.children[0]) {
return node.children[0];
}
// Then for siblings...
var next = nextSiblingEl(node);
if (next)
return next;
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
if (node)
next = nextSiblingEl(node);
} while (node && !next);
return node && next;
},
_checkAuthorLine: function(node, matchString) {
if (this._articleAuthor)
return false;
var rel;
if (typeof node.getAttribute !== 'undefined')
rel = node.getAttribute('rel');
if ((rel === 'author' || this.REGEXPS.authorline.test(matchString)) && this._isValidAuthorLine(node.textContent)) {
this._articleAuthor = node.textContent.trim();
return true;
}
return false;
},
_getWordCount: function(text) {
var length = 0;
text.split(/[\s.,;]/).every(function(v) {
if (v.length > this.MIN_WORD_LENGTH) length++;
return true; });
return length;
},
_getNodeAncestors: function(node, maxDepth) {
maxDepth = maxDepth || 0;
var i = 0,
ancestors = [];
while (node.parentNode) {
ancestors.push(node.parentNode);
if (maxDepth && ++i === maxDepth)
break;
node = node.parentNode;
}
return ancestors;
},
_filterAttributes: function(doc) {
var nodes = doc.querySelectorAll('*');
for (var i = 0, l = nodes.length; i < l; i++) {
var attributes = nodes[i].attributes,
j = attributes.length;
while (j--) {
var attr = attributes[j];
if (this.ATTRIBUTE_WHITELIST.indexOf(attr.name.toLowerCase()) === -1)
nodes[i].removeAttributeNode(attr);
}
}
},
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
_grabArticle: function(page, grab_callback) {
this.log('\n**** grabArticle ****\n');
var isPaging = false;
// We can't grab an article if we don't have a page!
if (!page) {
this.log('No body found in document. Abort.');
return grab_callback(null);
}
var doc = page.documentElement;
var pageCacheHtml = page.innerHTML;
// Check if any 'dir' is set on the toplevel document element
this._articleDir = page.documentElement.getAttribute('dir');
var stripUnlikelyCandidates = false;//this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)
while (true) {
// First, node prepping. Trash nodes that look cruddy (like ones with the
// class name 'comment', etc), and turn divs into P tags where they have been
// used inappropriately (as in, where they contain no other block level elements.)
var elementsToScore = [],
tname = '',
node = doc;
while (node) {
tname = node.tagName;
if (~this.NODES_TO_IGNORE.indexOf(tname)) {
node = this._getNextNode(node);
continue;
}
// Check to see if this node is an author line, and remove it if it is.
if (this._checkAuthorLine(node, matchString)) {
node = this._removeAndGetNext(node);
continue;
}
// Remove unlikely candidates
var matchString = node.className + ' ' + node.id;
if (stripUnlikelyCandidates) {
if ((node.style && node.style.cssText && REGEXPS.styleFilter.test(node.style.cssText)) ||
((this.REGEXPS.unlikelyCandidates.test(matchString) ||
this.REGEXPS.extraneous.test(matchString)) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString)) &&
tname !== 'IMG' &&
tname !== 'A')
{
this.log('Removing unlikely candidate by "', matchString, '" /', ((matchString.match(this.REGEXPS.unlikelyCandidates) || [])[0] || ''), '/')
node = this._removeAndGetNext(node);
continue;
}
}
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(tname) !== -1) {
elementsToScore.push(node);
}
if (this._replaceImgs && tname === "A" && node.children.length === 1 &&
node.children[0].tagName === "IMG" &&
this.REGEXPS.imgExt.test(node.href)) {
node.children[0].src = node.parentNode.href;
}
// Turn all divs that don't have children block level elements into p's
if (tname === 'DIV') {
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if (this._hasSinglePInsideElement(node)) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, 'P');
elementsToScore.push(node);
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
if (childNode.nodeType === Node.TEXT_NODE) {
var p = page.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
p.className = 'readability-styled';
node.replaceChild(p, childNode);
}
});
}
}
node = this._getNextNode(node);
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [],
scoreFn = function(elementToScore) {
if (!elementToScore.parentNode || typeof elementToScore.parentNode.tagName === 'undefined')
return;
// Add a point for the paragraph itself as a base.
var isImage = elementToScore.tagName === "IMG", contentScore = 1;
if (isImage) {
if (elementToScore.getAttribute('alt'))
contentScore += 5;
var value = parseInt(elementToScore.getAttribute('width'), 10);
if (isNaN(value)); // NaN (skip)
else if (value <= 32)
this._setNodeTag(elementToScore, "noscript"); // remove from scan
else if (value >= 350)
contentScore += 20;
else if (value >= 128)
contentScore += 5;
}
// If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
var innerText = this._getInnerText(elementToScore);
if (!isImage && innerText.length < this.MIN_PARAGRAPH_LENGTH)
return;
// Exclude nodes with no ancestor.
var ancestors = this._getNodeAncestors(elementToScore, this.MAX_ANCESTORS);
if (ancestors.length === 0)
return;
// Add points for any words/commas within this paragraph.
//contentScore += Math.min(Math.floor(this._getWordCount(innerText) / Math.max((innerText.match(/\b[,.]\s/g) || []).length, 1)), 3);
contentScore += this._getCharCount(innerText, ',');
// For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / this.SCORE_CHARS_IN_PARAGRAPH), 3);
// For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(this._getWordCount(innerText) / this.SCORE_WORDS_IN_PARAGRAPH), 3);
// Initialize and score ancestors.
var scoreAn = function(ancestor, level) {
if (!ancestor.tagName)
return;
if (typeof(ancestor.readability) === 'undefined') {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
// Node score divider:
// - parent: no division
// - grandparent+: ancestor level * GRANDPARENT_UP_SCORE_DIVISOR
var scoreDivider = 1;
if (level !== 0)
scoreDivider = level * this.GRANDPARENT_UP_SCORE_DIVISOR;
ancestor.readability.contentScore += Math.floor(contentScore / scoreDivider);
};
this._forEachNode(ancestors, scoreAn.bind(this));
};
this._forEachNode(elementsToScore, scoreFn.bind(this));
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.
var topCandidates = [];
for (var c = 0, cl = candidates.length; c < cl; c += 1) {
var candidate = candidates[c];
// Scale the final candidates score based on link density. Good content
// should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation.
var candidateScore = Math.floor(candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)));
candidate.readability.contentScore = candidateScore;
this.log('Candidate:', candidate, 'with score', (candidateScore || 0))
for (var t = 0; t < this._nbTopCandidates; t++) {
var aTopCandidate = topCandidates[t];
if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
topCandidates.splice(t, 0, candidate);
if (topCandidates.length > this._nbTopCandidates)
topCandidates.pop();
break;
}
}
}
var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
if (!topCandidate || topCandidate.tagName === 'BODY') {
// Move all of the page's children into topCandidate
topCandidate = page.createElement('DIV');
neededToCreateTopCandidate = true;
// Move everything (not just elements, also text nodes etc.) into the container
// so we even include text directly in the body:
var kids = page.childNodes;
while (kids.length) {
this.log('Moving child out:', kids[0])
topCandidate.appendChild(kids[0]);
}
page.appendChild(topCandidate);
this._initializeNode(topCandidate);
} else if (topCandidate) {
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
// few steps up the tree, that's a decent sign that there might be more content
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
var parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
while (parentOfTopCandidate && parentOfTopCandidate.readability) {
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold)
break;
if (parentScore > lastScore) {
// Alright! We found a better parent to use.
topCandidate = parentOfTopCandidate;
break;
}
lastScore = parentOfTopCandidate.readability.contentScore;
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
// Check if data is inside a table end set topCandidate to it if found
var parentNode = topCandidate,
tagn;
while (parentNode) {
tagn = parentNode.tagName
if (tagn === 'TABLE' || tagn === 'UL' || tagn === 'OL') {
topCandidate = parentNode;
break;
}
parentNode = parentNode.parentNode;
}
}
this.log('\n\nThe primary content:', topCandidate, '\n')
// Now that we have the top candidate, look through its siblings for content
// that might also be related. Things like preambles, content split by ads
// that we removed, etc.
var articleContent = page.createElement('div');
if (isPaging)
articleContent.id = 'readability-content';
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblings = topCandidate.parentNode.children;
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
var append = false;
this.log('Looking at sibling node:', sibling.tagName, sibling, 'with score', (sibling.readability ? sibling.readability.contentScore : '-'))
this.log('Sibling has score', (sibling.readability ? (sibling.readability.contentScore || 0) : 0));
if (sibling === topCandidate) {
append = true;
} else {
var contentBonus = 0;
var produceAtrArray = function (node) {
return ((node.className || '') + ' ' + (node.id || ''))
.replace(/[_-]/g, ' ')
.split(' ')
.filter(function(v) {
return v === '' });
}
var intersectArrays = function (arrays) {
return arrays.shift().filter(function(v) {
return arrays.every(function(a) {
return a.indexOf(v) !== -1;
});
})
}
// Give a bonus if sibling nodes and top candidates have same classname group
if (intersectArrays([produceAtrArray(sibling), produceAtrArray(topCandidate)]).length)
contentBonus += topCandidate.readability.contentScore * 0.2;
if (sibling.readability &&
((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
append = true;
} else if (sibling.nodeName === 'P') {
var linkDensity = this._getLinkDensity(sibling),
nodeContent = this._getInnerText(sibling),
nodeLength = nodeContent.length;
if (nodeLength > this.MIN_NODE_LENGTH &&
linkDensity < this.MAX_LINK_DENSITY)
append = true;
else if (nodeLength > 0 &&
nodeLength < this.MIN_NODE_LENGTH &&
linkDensity === 0 &&
nodeContent.search(/\.( |$)/) !== -1)
append = true;
}
}
if (append) {
this.log('Appending node:', sibling);
if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
this.log('Altering sibling:', sibling, 'to div.');
sibling = this._setNodeTag(sibling, 'DIV');
}
articleContent.appendChild(sibling);
// siblings is a reference to the children array, and
// sibling is removed from the array when we call appendChild().
// As a result, we must revisit this index since the nodes
// have been shifted.
s -= 1;
sl -= 1;
}
}
//if (this._debug)
//this.log('Article content pre-prep: ' + articleContent.innerHTML);
// So we have all of the content that we need. Now we clean it up for presentation.
this._prepArticle(articleContent);
//if (this._debug)
//this.log('Article content post-prep: ' + articleContent.innerHTML);
if (this._curPageNum === 1) {
if (neededToCreateTopCandidate) {
// We already created a fake div thing, and there wouldn't have been any siblings left
// for the previous loop, so there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class names here. No need to append
// because that already happened anyway.
topCandidate.id = 'readability-page-1';
topCandidate.className = 'page';
} else {
var div = page.createElement('DIV');
div.id = 'readability-page-1';
div.className = 'page';
var children = articleContent.childNodes;
while (children.length) {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
}
}
//if (this._debug)
//this.log('Article content after paging: ' + articleContent.innerHTML);
this._removeScripts(articleContent);
this._postProcessContent(articleContent);
// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
if (this._getInnerText(articleContent, true).length < 500) {
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this.log('!!!!! Failed to detect content. Removing FLAG_STRIP_UNLIKELYS...');
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
} else if (this._flagIsActive(this.FLAG_WEIGHT_ATTRIBUTES)) {
this.log('!!!!! Failed to detect content. Removing FLAG_WEIGHT_ATTRIBUTES...');
this._removeFlag(this.FLAG_WEIGHT_ATTRIBUTES);
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this.log('!!!!! Failed to detect content. Removing FLAG_CLEAN_CONDITIONALLY...');
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
} else {
this.log('!!!!! Failed to detect any content.');
return grab_callback(null);
}
} else
return grab_callback(articleContent);
}
},
/**
* Check whether the input string could be a authorline.
* This verifies that the input is a string, and that the length
* is less than 100 chars.
*
* @param possibleauthorline {string} - a string to check whether its a authorline.
* @return Boolean - whether the input string is a authorline.
*/
_isValidAuthorLine: function(authorline) {
if (typeof authorline == 'string' || authorline instanceof String) {
authorline = authorline.trim();
return (authorline.length > 0) && (authorline.length < 100);
}
return false;
},
/**
* Attempts to get excerpt and author metadata for the article.
*
* @return Object with optional 'excerpt' and 'author' properties
*/
_getArticleMetadata: function() {
var metadata = {};
var values = {};
var metaElements = this._doc.getElementsByTagName('meta');
// Match 'description', or Twitter's 'twitter:description' (Cards)
// in name attribute.
var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;
// Match Facebook's Open Graph title & description properties.
var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;
// Find description tags.
this._forEachNode(metaElements, function(element) {
var elementName = element.getAttribute('name');
var elementProperty = element.getAttribute('property');
if ([elementName, elementProperty].indexOf('author') !== -1) {
metadata.author = element.getAttribute('content');
return;
}
if ([elementName, elementProperty] !== -1) {
metadata.author = element.getAttribute('content');
return;
}
var name = null;
if (namePattern.test(elementName)) {
name = elementName;
} else if (propertyPattern.test(elementProperty)) {
name = elementProperty;
}
if (name) {
var content = element.getAttribute('content');
if (content) {
if (/^(?:\w+\s*:\s*)?description/.test(name) !== -1) {
metadata.excerpt = content.trim();
return;
} else if (/^(?:\w+\s*:\s*)?title/.test(name) !== -1) {
metadata.title = content.trim();
return;
}
}
}
});
return metadata;
},
/**
* Removes script tags from the document.
*
* @param Element
**/
_removeScripts: function(doc) {
this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
scriptNode.nodeValue = '';
scriptNode.removeAttribute('src');
return true;
});
this._removeNodes(doc.getElementsByTagName('noscript'));
},
/**
* Check if this node has only whitespace and a single P element
* Returns false if the DIV node contains non-empty text nodes
* or if it contains no P or more than 1 element.
*
* @param Element
**/
_hasSinglePInsideElement: function(element) {
// There should be exactly 1 element child which is a P:
if (element.children.length != 1 || element.children[0].tagName !== 'P') {
return false;
}
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
return node.nodeType === Node.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
/**
* Determine whether element has any children block level elements.
*
* @param Element
*/
_hasChildBlockElement: function(element) {
return this._someNode(element.childNodes, function(node) {
return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
this._hasChildBlockElement(node);
});
},
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
*
* @param Element
* @param Boolean normalizeSpaces (default: true)
* @param boolean flattenLines (default: false)
* @return string
**/
_getInnerText: function(e, normalizeSpaces, flattenLines) {
flattenLines = (typeof flattenLines === 'undefined') ? false : flattenLines;
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
var textContent = e.textContent.trim();
if (flattenLines) {
return textContent.replace(this.REGEXPS.flatten, '');
} else if (normalizeSpaces) {
return textContent.replace(this.REGEXPS.normalize, ' ');
}
return textContent;
},
/**
* Get the number of times a string s appears in the node e.
*
* @param Element
* @param string - what to split on. Default is ','
* @return number (integer)
**/
_getCharCount: function(e, s) {
return (e.match(new RegExp(s || ',', 'g')) || []).length;
},
/**
* Remove the style attribute on every e and under.
* TODO: Test if getElementsByTagName(*) is faster.
*
* @param Element
* @return void
**/
_cleanStyles: function(e) {
e = e || this._doc;
if (!e)
return;
var cur = e.firstChild;
// Remove any root styles, if we're able.
if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
e.removeAttribute('style');
// Go until there are no more child nodes
while (cur !== null) {
if (cur.nodeType === cur.ELEMENT_NODE) {
// Remove style attribute(s) :
if (cur.className !== 'readability-styled')
cur.removeAttribute('style');
this._cleanStyles(cur);
}
cur = cur.nextSibling;
}
},
/**
* Get the density of links as a percentage of the content
* This is the amount of text that is inside a link divided by the total text in the node.
*
* @param Element
* @return number (float)
**/
_getLinkDensity: function(element) {
var textLength = this._getInnerText(element, false, true).length;
if (textLength === 0)
return 0;
var linkLength = 0,
dRe = this._uri.domainRe,
excExt = this._excludeExternal,
excAImg = this._excludeAImg,
self = this;
this._forEachNode(element.getElementsByTagName('a'), function(linkNode) {
if (excExt && !dRe.test(linkNode.getAttribute('href'))) return;
if (excAImg && linkNode.getElementsByTagName('img').length === 1) return;
linkLength += self._getInnerText(linkNode, false, true).length;
});
return linkLength / textLength;
},
/**
* Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
*
* @author Dan Lacy
* @return string the base url
**/
_findBaseUrl: function() {
var uri = this._uri;
var noUrlParams = uri.path.split('?')[0];
var urlSlashes = noUrlParams.split('/').reverse();
var cleanedSegments = [];
var possibleType = '';
for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
var segment = urlSlashes[i];
// Split off and save anything that looks like a file type.
if (segment.indexOf('.') !== -1) {
possibleType = segment.split('.')[1];
// If the type isn't alpha-only, it's probably not actually a file extension.
if (!/[^a-zA-Z]/.test(possibleType))
segment = segment.split('.')[0];
}
// EW-CMS specific segment replacement. Ugly.
// Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
if (segment.indexOf(',00') !== -1)
segment = segment.replace(',00', '');
// If our first or second segment has anything looking like a page number, remove it.
if (/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i.test(segment) && ((i === 1) || (i === 0)))
segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, '');
var del = false;
// If this is purely a number, and it's the first or second segment,
// it's probably a page number. Remove it.
if (i < 2 && /^\d{1,2}$/.test(segment))
del = true;
// If this is the first segment and it's just 'index', remove it.
if (i === 0 && segment.toLowerCase() === 'index')
del = true;
// If our first or second segment is smaller than 3 characters,
// and the first segment was purely alphas, remove it.
if (i < 2 && segment.length < 3 && !/[a-z]/i.test(urlSlashes[0]))
del = true;
// If it's not marked for deletion, push it to cleanedSegments.
if (!del)
cleanedSegments.push(segment);
}
// This is our final, cleaned, base article URL.
return uri.scheme + '://' + uri.host + cleanedSegments.reverse().join('/');
},
/**
* Look for any paging links that may occur within the document.
*
* Every <a> below elem is scored on its text, class, id, href and the
* class/id of its ancestors; scores are accumulated per distinct href.
* The best href scoring at least 50 is recorded in this._parsedPages
* and returned.
*
* @param Element elem - root whose links are examined
* @return string the href of the most likely next page, or null if none qualifies
**/
_findNextPageLink: function(elem) {
var uri = this._uri;
// href -> { score, linkText, href } for every link still in the running.
var possiblePages = {};
var allLinks = elem.getElementsByTagName('a');
var articleBaseUrl = this._findBaseUrl();
// Loop through all links, looking for hints that they may be next-page links.
// Things like having 'page' in their textContent, className or id, or being a child
// of a node with a page-y className or id.
//
// Also possible: levenshtein distance? longest common subsequence?
//
// After we do that, assign each page a score, and pick the best one below.
for (var i = 0, il = allLinks.length; i < il; i += 1) {
var link = allLinks[i];
// Normalize the href: drop the fragment and a single trailing slash.
var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');
// If we've already seen this page, ignore it.
if (linkHref === '' ||
linkHref === articleBaseUrl ||
linkHref === uri.spec ||
linkHref in this._parsedPages) {
continue;
}
// If it's on a different domain, skip it.
// (Splitting on runs of '/' puts the host at index 1 for 'scheme://host/...'.)
if (uri.host !== linkHref.split(/\/+/g)[1])
continue;
var linkText = this._getInnerText(link);
// If the linkText looks like it's not the next page, skip it.
if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
continue;
// If the leftovers of the URL after removing the base URL don't contain
// any digits, it's certainly not a next page link.
var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
if (!linkHrefLeftover.match(/\d/))
continue;
// Several links may share one href: keep a single record and append the texts.
if (!(linkHref in possiblePages)) {
possiblePages[linkHref] = { 'score': 0, 'linkText': linkText, 'href': linkHref };
} else {
possiblePages[linkHref].linkText += ' | ' + linkText;
}
var linkObj = possiblePages[linkHref];
// If the articleBaseUrl isn't part of this URL, penalize this link. It could
// still be the link, but the odds are lower.
// Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
if (linkHref.indexOf(articleBaseUrl) !== 0)
linkObj.score -= 25;
// Text, class and id of this particular link, checked together.
var linkData = linkText + ' ' + link.className + ' ' + link.id;
if (linkData.match(this.REGEXPS.nextLink))
linkObj.score += 50;
if (linkData.match(/pag(e|ing|inat)/i))
linkObj.score += 25;
if (linkData.match(/(first|last)/i)) {
// -65 is enough to negate any bonuses gotten from a > or » in the text,
// If we already matched on 'next', last is probably fine.
// If we didn't, then it's bad. Penalize.
// Note: this checks the accumulated text of all links sharing the href.
if (!linkObj.linkText.match(this.REGEXPS.nextLink))
linkObj.score -= 65;
}
if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
linkObj.score -= 50;
if (linkData.match(this.REGEXPS.prevLink))
linkObj.score -= 200;
// If a parentNode contains page or paging or paginat
// Each of the ancestor bonus and the ancestor penalty is applied at most once per link.
var parentNode = link.parentNode;
var positiveNodeMatch = false;
var negativeNodeMatch = false;
while (parentNode) {
var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;
if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
positiveNodeMatch = true;
linkObj.score += 25;
}
if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
// If this is just something like 'footer', give it a negative.
// If it's something like 'body-and-footer', leave it be.
if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
linkObj.score -= 25;
negativeNodeMatch = true;
}
}
parentNode = parentNode.parentNode;
}
// If the URL looks like it has paging in it, add to the score.
// Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
linkObj.score += 25;
// If the URL contains negative values, give a slight decrease.
if (linkHref.match(this.REGEXPS.extraneous))
linkObj.score -= 15;
/**
* Minor punishment to anything that doesn't match our current URL.
* NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
* Dan, can you show me a counterexample where this is necessary?
* if (linkHref.indexOf(window.location.href) !== 0) {
* linkObj.score -= 1;
* }
**/
// If the link text can be parsed as a number, give it a minor bonus, with a slight
// bias towards lower numbered pages. This is so that pages that might not have 'next'
// in their text can still get scored, and sorted properly by score.
var linkTextAsNumber = parseInt(linkText, 10);
if (linkTextAsNumber) {
// Punish 1 since we're either already there, or it's probably
// before what we want anyways.
if (linkTextAsNumber === 1) {
linkObj.score -= 10;
} else {
linkObj.score += Math.max(0, 10 - linkTextAsNumber);
}
}
}
// Loop through all of our possible pages from above and find our top
// candidate for the next page URL. Require at least a score of 50, which
// is a relatively high confidence that this page is the next link.
var topPage = null;
for (var page in possiblePages) {
if (possiblePages.hasOwnProperty(page)) {
if (possiblePages[page].score >= 50 &&
(!topPage || topPage.score < possiblePages[page].score))
topPage = possiblePages[page];
}
}
var nextHref = null;
if (topPage) {
nextHref = topPage.href.replace(/\/$/, '');
this.log('NEXT PAGE IS ' + nextHref);
// Remember the page so later calls skip links pointing at it.
this._parsedPages[nextHref] = true;
}
return nextHref;
},
/**
* Add pre filter for raw input HTML processing
* @param string RegExp for replace
* @param string (optional) Replacer
*/
_addPreFilter: function(filter, replacer) {
if (typeof filter !== 'object') return;
this._preFilters.push({ r: filter, s: (replacer || '') });
},
/**
* Add post filter for raw output HTML processing
* @param string RegExp for replace
* @param string (optional) Replacer
*/
_addPostFilter: function(filter, replacer) {
if (typeof filter !== 'object') return;
this._postFilters.push({ r: filter, s: (replacer || '') });
},
_createHTML: function(source, url) {
var doc = this._doc.implementation.createHTMLDocument('HTMLParser');
doc.documentElement.innerHTML = source;
return doc;
},
/**
* Get an elements class/id weight. Uses regular expressions to tell
* if this element looks good or bad.
*
* @param Element
* @return number (Integer)
**/
_getAttributesWeight: function(e) {
if (!this._flagIsActive(this.FLAG_WEIGHT_ATTRIBUTES))
return 0;
var weight = 0,
self = this,
getWeight = function(attr) {
// Look for a special string in atribute text
if (typeof attr === 'string' && attr.trim().length > 2) {
if (self.REGEXPS.safe.test(attr))
return 100;
if (self.REGEXPS.negative.test(attr))
return -25;
if (self.REGEXPS.positive.test(attr))
return 25;
if (self.REGEXPS.unlikelyCandidates.test(attr))
return -5;
if (self.REGEXPS.okMaybeItsACandidate.test(attr))
return 5;
return 0;
}
return 0;
};
weight += getWeight(e.className);
weight += getWeight(e.id);
return weight;
},
/**
* Check if a given node has one of its ancestor tag name matching the
* provided one.
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
* @return Boolean
*/
_hasAncestorTag: function(node, tagName, maxDepth) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
if (depth > maxDepth)
return false;
if (node.parentNode.tagName === tagName)
return true;
node = node.parentNode;
depth++;
}
return false;
},
/**
* Clean a node of all elements of type 'tags'.
* (Unless it's a youtube/vimeo video. People love movies.)
*
* @param Element
* @param string tags to clean
* @return void
**/
_clean: function(e, tags) {
var isEmbed = this.MEDIA_NODES.indexOf(tags) !== -1;
this._removeNodes(this._getAllNodesWithTag(e, tags), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) {
var attributeValues = [].map.call(element.attributes, function(attr) {
return attr.value;
}).join('|');
// Check the elements attributes and elements inside to see if any of them contain youtube or vimeo
if (this.REGEXPS.videos.test(attributeValues) || this.REGEXPS.videos.test(element.innerHTML))
return false;
}
return true;
});
},
/**
* Clean an element of all tags of type 'tag' if they look fishy.
* 'Fishy' is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
*
* @return void
**/
_cleanConditionally: function(e, tags) {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return;
// Gather counts for other typical elements embedded within.
// Traverse backwards so we can remove nodes at the same time
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(this._getAllNodesWithTag(e, tags), function(node) {
var weight, contentScore;
if (node.readability) {
weight = 0;
contentScore = node.readability.contentScore || 0;
} else {
weight = this._getAttributesWeight(node);
contentScore = 0;
}
var haveToRemove = false,
isList = (node.tagName === 'UL' || node.tagName === 'OL');
if (weight + contentScore < 0) return true;
var contentText = this._getInnerText(node, false, true)
if (this._getCharCount(contentText, ',') < this.MIN_COMMAS_IN_PARAGRAPH) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
var p = node.getElementsByTagName('p').length;
var img = node.getElementsByTagName('img').length;
var li = node.getElementsByTagName('li').length - 100;
var input = node.getElementsByTagName('input').length;
var embedCount = 0;
var embeds = this._getAllNodesWithTag(node, this.MEDIA_NODES);
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (!this.REGEXPS.videos.test(embeds[ei].src) && !this.REGEXPS.videos.test(embeds[ei].innerHTML))
embedCount += 1;
}
var linkDensity = this._getLinkDensity(node),
contentLength = contentText.length;
haveToRemove =
// Make an exception for elements with no p's and exactly 1 img.
(img > p && !this._hasAncestorTag(node, 'figure')) ||
(!isList && li > p) ||
(input > Math.floor(p / 3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2)) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
}
if (haveToRemove) this.log('Cleaning Conditionally:', node, 'list:', isList, ';img:', img,';linkDensity:',linkDensity);
return haveToRemove;
});
},
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
* @param Element
* @return void
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function(header) {
return this._getAttributesWeight(header) < 0;
});
}
},
_flagIsActive: function(flag) {
return (this._flags & flag) > 0;
},
_addFlag: function(flag) {
this._flags = this._flags | flag;
},
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
*/
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ['p', 'pre']);
// FIXME we should have a fallback for helperIsVisible, but this is
// problematic because of jsdom's elem.style handling - see
// https://github.com/mozilla/readability/pull/186 for context.
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return this._someNode(nodes, function(node) {
if (helperIsVisible && !helperIsVisible(node))
return false;
var matchString = node.className + ' ' + node.id;
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString))
return false;
if (node.matches && node.matches('li p'))
return false;
var textContentLength = node.textContent.trim().length;
if (textContentLength < this.MIN_ARTICLE_LENGTH)
return false;
score += Math.round(100 / (1 + Math.exp(-1 * textContentLength / this.MIN_ARTICLE_LENGTH))) - 50;
//score += Math.sqrt(textContentLength - this.MIN_ARTICLE_LENGTH);
if (score > 20)
return true;
return false;
});
},
_successfulRequest: function(request) {
return (request.status >= 200 && request.status < 300) ||
request.status === 304 ||
(request.status === 0 && request.responseText);
},
_ajax: function(url, options) {
var self = this,
request = new XMLHttpRequest(),
respondToReadyState = function(readyState) {
if (request.readyState === 4) {
if (self._successfulRequest(request)) {
if (options.success)
options.success(self._createHTML(request.responseText));
} else if (options.error) {
options.error(request);
}
}
};
if (typeof options === 'undefined')
options = {};
request.onreadystatechange = respondToReadyState;
request.open('get', url, true);
request.setRequestHeader('Accept', 'text/html');
try {
request.send(options.postBody);
} catch (e) {
if (options.error)
options.error();
}
return request;
},
/*
* Initializes instance.
*
* @param {String} uri The URL string.
* @param {HTMLDocument} doc Parent document.
*/
init: function(uri, doc) {
if (!uri) throw new Error('No URL specified. Aborting.');
if (!doc) throw new Error('No document specified. Aborting.');
var a = doc.createElement('a'),
base = doc.getElementsByTagName('base');
a.href = uri;
if (base[0]) base = base[0];
else base = null;
this._doc = doc;
this._uri = {
spec: a.href,
host: a.host,
prePath: a.protocol + '//' + a.host,
scheme: a.protocol.substr(0, a.protocol.indexOf(':')),
pathBase: base || a.protocol + '//' + a.host + a.pathname.substr(0, a.pathname.lastIndexOf('/') + 1),
domainRe: new RegExp('/' + a.host.replace(/^\w+:\/\/(?:www|m|i)\d*\./i, '').replace(/\./, '.') + '/')
}
return this;
},
/**
* Runs readability.
*
* Workflow:
* 1. Prep the document by removing script tags, css, etc.
* 2. Build readability's DOM tree.
* 3. Grab the article content from the current dom tree.
* 4. Replace the current DOM tree with the new one.
* 5. Read peacefully.
*
* @return this instance (for chaining); the article object is handed to the callback
**/
parse: function(parse_callback) {
if (!parse_callback) throw new Error('Aborting parsing document; no callback found');
var self = this;
if (!self._doc) {
throw new Error('Aborting parsing document; no data to process');
}
// Avoid parsing too large documents, as per configuration option
if (self._maxElemsToParse) {
var numTags = self._doc.getElementsByTagName('*').length;
if (numTags > self._maxElemsToParse) {
throw new Error('Aborting parsing document; ' + numTags + ' > max = ' + self._maxElemsToParse + ' elements found');
}
}
if (typeof self._doc.documentElement.firstElementChild === 'undefined') {
self._getNextNode = self._getNextNodeNoElementProperties;
}
self._preProcessContent(self._doc);
// Remove script tags from the document.
self._removeScripts(self._doc);
// FIXME: Disabled multi-page article support for now as it
// needs more work on infrastructure.
// Make sure self document is added to the list of parsed pages first,
// so we don't double up on the first page.
// self._parsedPages[uri.spec.replace(/\/$/, '')] = true;
// Pull out any possible next page link first.
// var nextPageLink = self._findNextPageLink(doc.body);
var metadata = self._getArticleMetadata();
self._prepDocument();
var articleTitle = metadata.title || self._getArticleTitle();
self._grabArticle(self._doc, function(articleContent) {
//self.log('Grabbed: ' + articleContent.innerHTML);
if (!articleContent)
parse_callback({
uri: null,
title: null,
author: null,
dir: null,
content: null,
textContent: null,
length: null,
excerpt: null
});
// if (nextPageLink) {
// // Append any additional pages after a small timeout so that people
// // can start reading without having to wait for self to finish processing.
// setTimeout((function() {
// self._appendNextPage(nextPageLink);
// }).bind(self), 500);
// }
// If we haven't found an excerpt in the article's metadata, use the article's
// first paragraph as the excerpt. This is used for displaying a preview of
// the article's content.
if (!metadata.excerpt) {
var paragraphs = articleContent.getElementsByTagName('p');
if (paragraphs.length > 0) {
metadata.excerpt = paragraphs[0].textContent.trim();
}
}
parse_callback({
uri: self._uri,
title: articleTitle,
author: metadata.author || self._articleAuthor,
dir: self._articleDir,
content: articleContent.innerHTML,
textContent: articleContent.textContent,
length: articleContent.textContent.length,
excerpt: metadata.excerpt
});
});
return this;
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment