Dither/readability.js

## readability.js
// ==UserScript==
// @name       Readability
// @include    http://*
// @include    https://*

/*jshint curly: false, es5: true, strict: false, loopfunc: true, scripturl: true, browser: true, devel: true, nonstandard: true*/
/*
 * Copyright (c) 2010 Arc90 Inc
 *
 * Licensed under the Apache License, Version 2.0 (the 'License');
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an 'AS IS' BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

(function(root) {
    // `CssSelectorGenerator` is assigned further down. `indexOf` is the native
    // Array.prototype.indexOf when present, otherwise a linear-search fallback
    // meant to be invoked with `.call(array, item)`.
    var CssSelectorGenerator;
    var indexOf = Array.prototype.indexOf || function(needle) {
        var position = 0, total = this.length;
        while (position < total) {
            if (position in this && this[position] === needle) return position;
            position += 1;
        }
        return -1;
    };

    CssSelectorGenerator = (function() {
        // Baseline configuration. `selectors` lists the selector kinds the
        // generator tries, in order. setOptions() only accepts keys that
        // exist on this object.
        CssSelectorGenerator.prototype.default_options = {
            selectors: [
                'id',
                'class',
                'tag',
                'nthchild'
            ]
        };

        /**
         * Builds a generator: starts with an empty option bag, applies
         * default_options, then overlays the caller's options.
         *
         * @param {Object} [options] Overrides; null/undefined means none.
         */
        function CssSelectorGenerator(options) {
            var overrides = (options == null) ? {} : options;
            this.options = {};
            this.setOptions(this.default_options);
            this.setOptions(overrides);
        }

        /**
         * Copies recognised settings from `options` onto this.options. A key
         * is recognised when default_options owns a property of that name.
         *
         * @param {Object} [options] Settings to apply; null/undefined is a no-op.
         * @return {Array} One entry per enumerated key: the stored value for
         *                 recognised keys, undefined for ignored ones.
         */
        CssSelectorGenerator.prototype.setOptions = function(options) {
            var source = (options == null) ? {} : options,
                outcome = [],
                name, value, recognised;

            for (name in source) {
                value = source[name];
                recognised = this.default_options.hasOwnProperty(name);
                if (recognised) {
                    this.options[name] = value;
                }
                outcome.push(recognised ? value : void 0);
            }
            return outcome;
        };

        // True only for objects whose nodeType is exactly 1 (a DOM element);
        // null and undefined yield false.
        CssSelectorGenerator.prototype.isElement = function(element) {
            if (element == null) return false;
            return element.nodeType === 1;
        };

        /**
         * Collects `element` and its ancestors, innermost first, stopping at
         * the first node that is not an element.
         *
         * @param {Node} element
         * @return {Array} Empty when `element` itself is not an element.
         */
        CssSelectorGenerator.prototype.getParents = function(element) {
            var chain = [];
            for (var node = element; this.isElement(node); node = node.parentNode) {
                chain.push(node);
            }
            return chain;
        };

        // Tag selector for an element: its tagName, lower-cased and passed
        // through sanitizeItem().
        CssSelectorGenerator.prototype.getTagSelector = function(element) {
            var lowered = element.tagName.toLowerCase();
            return this.sanitizeItem(lowered);
        };

        /**
         * Escapes a raw id, class or tag name for use inside a CSS selector.
         *
         * - ASCII punctuation listed in the character class below is
         *   backslash-escaped (`.` becomes `\.`).
         * - ASCII letters, digits, `_` and `-` pass through unchanged.
         * - Everything else (`:`, control characters, non-ASCII) becomes an
         *   upper-case hexadecimal code-point escape followed by a space,
         *   e.g. `:` becomes `\3A `. The trailing space terminates the
         *   escape so a following hex digit is not absorbed into it.
         *
         * @param {String} item Raw name.
         * @return {String} Escaped name.
         */
        CssSelectorGenerator.prototype.sanitizeItem = function(item) {
            // Array.from walks the string by code point, keeping surrogate
            // pairs together so astral characters get one escape, not two.
            return Array.from(item).map(function(character) {
                if (/[ !"#$%&'()*+,.\/;<=>?@\[\\\]^`{|}~]/.test(character)) {
                    return "\\" + character;
                }
                if (/^[A-Za-z0-9_-]$/.test(character)) {
                    return character;
                }
                return "\\" + character.codePointAt(0).toString(16).toUpperCase() + " ";
            }).join('');
        };

        /**
         * Returns `#id` for the element when the id is usable and matches
         * exactly one node, otherwise null.
         *
         * An id is rejected when it is missing, empty, contains whitespace or
         * starts with a digit. Uniqueness is checked from the topmost
         * reachable ancestor (stopping at <html>/<body>), so it also works
         * for subtrees that are not attached to a document.
         *
         * @param {Element} element
         * @return {String|null}
         */
        CssSelectorGenerator.prototype.getIdSelector = function(element) {
            var raw_id = element.getAttribute('id');

            if (raw_id == null || raw_id === '') return null;
            if (/\s/.test(raw_id) || /^\d/.test(raw_id)) return null;

            // Climb to the search scope: the first html/body ancestor, or the
            // root of a detached subtree.
            var scope = element;
            while (scope.parentNode && !/^(html|body)$/i.test(scope.localName)) {
                scope = scope.parentNode;
            }

            var candidate = "#" + this.sanitizeItem(raw_id);
            return scope.querySelectorAll(candidate).length === 1 ? candidate : null;
        };

        /**
         * Produces one `.name` selector per class on the element, in source
         * order, each name passed through sanitizeItem().
         *
         * @param {Element} element
         * @return {Array} Empty when there is no class attribute or it holds
         *                 only whitespace.
         */
        CssSelectorGenerator.prototype.getClassSelectors = function(element) {
            var raw = element.getAttribute('class'),
                generator = this;

            if (typeof raw !== 'string') return [];

            // Splitting on whitespace runs and discarding empty pieces is the
            // same as collapsing and trimming first.
            return raw.split(/\s+/)
                .filter(function(name) { return name !== ''; })
                .map(function(name) { return "." + generator.sanitizeItem(name); });
        };

        /**
         * Builds `[name="value"]` selectors for every attribute of the element
         * except `id` and `class`, which have dedicated selector kinds.
         *
         * The value is always emitted as a double-quoted CSS string with
         * quotes, backslashes and line breaks escaped, so empty values and
         * values containing spaces or punctuation still give a valid selector.
         * The attribute name is passed through sanitizeItem().
         *
         * @param {Element} element
         * @return {Array} Attribute selectors in attribute order.
         */
        CssSelectorGenerator.prototype.getAttributeSelectors = function(element) {
            var blacklist = ['id', 'class'],
                attributes = element.attributes,
                result = [],
                quoteValue = function(value) {
                    var escaped = String(value)
                        .replace(/["\\]/g, '\\$&')
                        // Raw line breaks are not allowed inside a CSS
                        // string; write them as hex escapes instead.
                        .replace(/[\n\r\f]/g, function(ch) {
                            return '\\' + ch.charCodeAt(0).toString(16).toUpperCase() + ' ';
                        });
                    return '"' + escaped + '"';
                };

            for (var k = 0, len = attributes.length; k < len; k++) {
                var attribute = attributes[k];
                if (indexOf.call(blacklist, attribute.nodeName) < 0) {
                    result.push("[" + this.sanitizeItem(attribute.nodeName) + "=" + quoteValue(attribute.nodeValue) + "]");
                }
            }
            return result;
        };

        /**
         * Returns `:nth-child(N)` where N is the element's 1-based position
         * among its parent's element children (non-element nodes are not
         * counted).
         *
         * @param {Element} element
         * @return {String|null} null when the element has no parent or is not
         *                       found among the parent's child nodes.
         */
        CssSelectorGenerator.prototype.getNthChildSelector = function(element) {
            var container = element.parentNode;
            if (!container) return null;

            var children = container.childNodes,
                total = children.length,
                position = 0;

            for (var idx = 0; idx < total; idx++) {
                if (!this.isElement(children[idx])) continue;
                position += 1;
                if (children[idx] === element) {
                    return ":nth-child(" + position + ")";
                }
            }
            return null;
        };

        /**
         * Checks that `selector`, evaluated from `root_node`, matches exactly
         * one node and that node is `element`.
         *
         * @param {Node}    root_node Scope for querySelectorAll.
         * @param {Element} element   Expected sole match.
         * @param {String}  selector  Non-string or empty values give false.
         * @return {Boolean}
         */
        CssSelectorGenerator.prototype.testSelector = function(root_node, element, selector) {
            if (typeof selector !== 'string' || !selector.length) return false;
            var matches = root_node.querySelectorAll(selector);
            return matches.length === 1 && matches[0] === element;
        };

        /**
         * Computes every enabled selector kind for the element.
         *
         * @param {Element} element
         * @return {Object} `{t, i, c, a, n}` holding the tag, id, class list,
         *                  attribute list and nth-child selectors. Kinds not
         *                  listed in options.selectors stay null.
         */
        CssSelectorGenerator.prototype.getAllSelectors = function(element) {
            // [option name, result key, method]; evaluated in this order.
            var plan = [
                    ['tag', 't', 'getTagSelector'],
                    ['id', 'i', 'getIdSelector'],
                    ['class', 'c', 'getClassSelectors'],
                    ['attribute', 'a', 'getAttributeSelectors'],
                    ['nthchild', 'n', 'getNthChildSelector']
                ],
                result = { t: null, i: null, c: null, a: null, n: null };

            for (var idx = 0; idx < plan.length; idx++) {
                var step = plan[idx];
                if (indexOf.call(this.options.selectors, step[0]) >= 0) {
                    result[step[1]] = this[step[2]](element);
                }
            }
            return result;
        };

        /**
         * Checks that `selector`, evaluated from the element's parent, matches
         * the element and nothing else.
         *
         * @param {Element} element
         * @param {String}  selector
         * @return {Boolean} false when the element has no parent.
         */
        CssSelectorGenerator.prototype.testUniqueness = function(element, selector) {
            var scope = element.parentNode;
            if (!scope) return false;

            var hits = scope.querySelectorAll(selector);
            if (hits.length !== 1) return false;
            return hits[0] === element;
        };

        /**
         * Looks for a selector that is unique to the element.
         *
         * First tries every combination from getCombinations(items). If none
         * is unique and a tag selector was supplied, tries each individual
         * item prefixed with the tag.
         *
         * @param {Element}      element
         * @param {Array|String} items Selector fragments, or a single selector.
         * @param {String}       [tag] Tag selector to prepend in the second pass.
         * @return {*} The first candidate accepted by testUniqueness(), or
         *             null when there is none.
         */
        CssSelectorGenerator.prototype.testCombinations = function(element, items, tag) {
            var candidates = this.getCombinations(items), idx;

            for (idx = 0; idx < candidates.length; idx++) {
                if (this.testUniqueness(element, candidates[idx])) {
                    return candidates[idx];
                }
            }

            if (tag == null) return null;

            // A lone selector string is wrapped so it can be mapped like a list.
            var base = (typeof items === 'string') ? [[items]] : items;
            var prefixed = base.map(function(piece) {
                return tag + piece;
            });
            for (idx = 0; idx < prefixed.length; idx++) {
                if (this.testUniqueness(element, prefixed[idx])) {
                    return prefixed[idx];
                }
            }
            return null;
        };

        /**
         * Picks the first selector that identifies the element uniquely among
         * its siblings' subtrees, trying the kinds in options.selectors order:
         *
         * - id / nthchild: the bare selector, then testCombinations() with the
         *   tag selector;
         * - class / attribute: testCombinations() over the list (skipped when
         *   the list is empty);
         * - tag: the bare tag selector.
         *
         * @param {Element} element
         * @param {*}       named   Not used by this method.
         * @return {*} The winning selector, or '*' when nothing is unique.
         */
        CssSelectorGenerator.prototype.getUniqueSelector = function(element, named) {
            var found = this.getAllSelectors(element),
                order = this.options.selectors,
                match;

            for (var idx = 0, count = order.length; idx < count; idx++) {
                var kind = order[idx];

                if (kind === 'id' || kind === 'nthchild') {
                    var single = (kind === 'id') ? found.i : found.n;
                    if (single != null) {
                        if (this.testUniqueness(element, single)) return single;
                        match = this.testCombinations(element, single, found.t);
                        if (match) return match;
                    }
                } else if (kind === 'class' || kind === 'attribute') {
                    var list = (kind === 'class') ? found.c : found.a;
                    if (list != null && list.length !== 0) {
                        match = this.testCombinations(element, list, found.t);
                        if (match) return match;
                    }
                } else if (kind === 'tag') {
                    if (found.t != null && this.testUniqueness(element, found.t)) {
                        return found.t;
                    }
                }
            }
            return '*';
        };

        /**
         * Builds the shortest child-combinator path (`a>b>c`) that selects
         * only `element`.
         *
         * Each ancestor contributes its getUniqueSelector() result. The path
         * starts with the element alone and grows one ancestor at a time
         * until testSelector() accepts it. The test is scoped to the nearest
         * html/body ancestor, or to the root of a detached subtree.
         *
         * @param {Element} element
         * @return {String|null} null when no prefix of the chain is unique.
         */
        CssSelectorGenerator.prototype.getSelector = function(element) {
            var lineage = this.getParents(element),
                pieces = [],
                idx;

            for (idx = 0; idx < lineage.length; idx++) {
                var piece = this.getUniqueSelector(lineage[idx]);
                if (piece != null) pieces.push(piece);
            }

            // Works even if the node has not been inserted into a document.
            var scope = element;
            while (scope.parentNode && !/^(html|body)$/i.test(scope.localName)) {
                scope = scope.parentNode;
            }

            for (idx = 1; idx <= pieces.length; idx++) {
                // pieces is innermost-first; the path reads outermost-first.
                var path = pieces.slice(0, idx).reverse().join('>');
                if (this.testSelector(scope, element, path)) {
                    return path;
                }
            }
            return null;
        };

        /**
         * Lists every non-empty combination of the given selector fragments,
         * each joined into one compound selector, shortest combinations first.
         * Within a combination the fragments keep their input order, e.g.
         * ['.a', '.b'] gives ['.a', '.b', '.a.b'].
         *
         * An empty or missing list gives an empty array. A single string is
         * returned wrapped as [[string]]; that shape is kept for existing
         * callers.
         *
         * @param {Array|String} [items] Selector fragments.
         * @return {Array}
         */
        CssSelectorGenerator.prototype.getCombinations = function(items) {
            if (items == null) items = [];
            if (typeof items === 'string') return [[items]];

            // Seed with the empty combination; each fragment doubles the set
            // by being appended to every combination known so far.
            var result = [[]];
            for (var i = 0; i < items.length; i++) {
                // Snapshot the size: the inner loop grows `result`.
                for (var j = 0, known = result.length; j < known; j++) {
                    result.push(result[j].concat(items[i]));
                }
            }
            result.shift(); // drop the empty seed
            result.sort(function(a, b) { return a.length - b.length; });
            return result.map(function(item) { return item.join(''); });
        };

        return CssSelectorGenerator;
    })();

    root.CssSelectorGenerator = CssSelectorGenerator;
})(this);

/*
 * This code is heavily based on Arc90's readability.js (1.7.1) script
 * available at: http://code.google.com/p/arc90labs-readability
 *
 * Modified by DitherSky for research purposes.
 */

/*
    For testing: new Readability().init(uri, document).parse(function(result) {
        document.body.innerHTML = result.content;
    });
*/

/**
 * Public constructor.
 *
 * Boolean options (`excludeExternal`, `excludeAImg`, `replaceImgs`) fall back
 * to their defaults only when omitted (null/undefined), so an explicit `false`
 * is honoured. Numeric options fall back to their defaults when falsy.
 *
 * Requires `window.CssSelectorGenerator`; it is used to render DOM nodes as
 * selector paths in debug log output.
 *
 * @param {Object}  [options]                 The options object.
 * @param {Boolean} [options.debug]           Enable logging via `dump` or `console.log`.
 * @param {Number}  [options.maxElemsToParse] Default: DEFAULT_MAX_ELEMS_TO_PARSE.
 * @param {Number}  [options.nbTopCandidates] Default: DEFAULT_N_TOP_CANDIDATES.
 * @param {Number}  [options.maxPages]        Default: DEFAULT_MAX_PAGES.
 * @param {Boolean} [options.excludeExternal] Default: DEFAULT_EXCLUDE_EXTERNAL.
 * @param {Boolean} [options.excludeAImg]     Default: DEFAULT_EXCLUDE_A_IMG.
 * @param {Boolean} [options.replaceImgs]     Default: REPLACE_IMAGES_TO_FULL.
 */
function Readability(options) {
    options = options || {};

    // `value || fallback` would turn an explicit `false` back into a `true`
    // default; only substitute the fallback when the option is absent.
    var pick = function(value, fallback) {
        return (value === undefined || value === null) ? fallback : value;
    };

    this._version = '1.7.2-no-multi-page';

    this._uri = null;
    this._doc = null;
    this._biggestFrame = false;
    this._articleAuthor = null;
    this._articleDir = null;
    this._requestedUrls = [];

    // Configurable options
    this._debug = !!options.debug;
    this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
    this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
    this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
    this._excludeExternal = pick(options.excludeExternal, this.DEFAULT_EXCLUDE_EXTERNAL);
    this._excludeAImg = pick(options.excludeAImg, this.DEFAULT_EXCLUDE_A_IMG);
    this._replaceImgs = pick(options.replaceImgs, this.REPLACE_IMAGES_TO_FULL);

    // Start with all flags set
    this._flags = this.FLAG_STRIP_UNLIKELYS |
                  this.FLAG_WEIGHT_ATTRIBUTES |
                  this.FLAG_CLEAN_CONDITIONALLY |
                  this.FLAG_DISABLE_PREFILTER |
                  this.FLAG_DISABLE_POSTFILTER;

    // The list of pages we've parsed in this call of readability,
    // for autopaging. As a key store for easier searching.
    this._parsedPages = {};

    // Key store of page URLs.
    // NOTE(review): presumably used to detect duplicate pages; confirm
    // against the code that fills it.
    this._pageURLs = {};

    // Page counter, starting at 1.
    this._curPageNum = 1;

    // Selector generator used to print DOM nodes as CSS paths in log output.
    this._selGen = new window.CssSelectorGenerator();
    this._pathTo = this._selGen.getSelector.bind(this._selGen);

    // Control whether log messages are sent to the console
    if (this._debug) {
        this.log = function() {
            var emit;
            if (typeof dump !== 'undefined') {
                emit = dump;
            } else if (typeof console !== 'undefined') {
                emit = function(text) { console.log(text); };
            } else {
                // No output channel available: drop the message.
                return;
            }

            // Arguments that look like DOM nodes are printed as selector paths.
            var self = this;
            var msg = Array.prototype.map.call(arguments, function(x) {
                return (x && x.nodeName) ? self._pathTo(x) : x;
            }).join(' ');
            emit('[Readability] ' + msg + '\n');
        };
    } else {
        this.log = function() {};
    }
}

Readability.prototype = {
    // Bit flags. The constructor ORs all five of them into `this._flags`.
    FLAG_STRIP_UNLIKELYS: 0x1,
    FLAG_WEIGHT_ATTRIBUTES: 0x2,
    FLAG_CLEAN_CONDITIONALLY: 0x4,
    FLAG_DISABLE_PREFILTER: 0x8,
    FLAG_DISABLE_POSTFILTER: 0x10,

    // Scoring and threshold tunables; see the methods that reference them
    // for the exact meaning of each value.
    SCORE_CHARS_IN_PARAGRAPH: 100,
    SCORE_WORDS_IN_PARAGRAPH: 20,
    GRANDPARENT_SCORE_DIVISOR: 2,
    GRANDPARENT_UP_SCORE_DIVISOR: 3,
    MIN_PARAGRAPH_LENGTH: 20,
    MIN_COMMAS_IN_PARAGRAPH: 6,
    MIN_ARTICLE_LENGTH: 200,
    MIN_NODE_LENGTH: 80,
    MAX_LINK_DENSITY: 0.25,
    MIN_WORD_LENGTH: 2,
    MAX_ANCESTORS: 4,

    // Replace image src to full URL from parent's link.
    // Default for the `replaceImgs` constructor option.
    REPLACE_IMAGES_TO_FULL: true,

    // Ignore external links when checking for link weights.
    // Default for the `excludeExternal` constructor option.
    DEFAULT_EXCLUDE_EXTERNAL: true,

    // Ignore a>img type of nodes when considering link density.
    // Default for the `excludeAImg` constructor option.
    DEFAULT_EXCLUDE_A_IMG: true,

    // Max number of nodes supported by this parser. Default: 0 (no limit)
    DEFAULT_MAX_ELEMS_TO_PARSE: 0,

    // The number of top candidates to consider when analyzing how
    // tight the competition is among candidates.
    DEFAULT_N_TOP_CANDIDATES: 5,

    // The maximum number of pages to loop through before we call
    // it quits and just show a link.
    DEFAULT_MAX_PAGES: 10,

    // Element tags to score by default (upper-case tag names).
    DEFAULT_TAGS_TO_SCORE: 'IMG,SECTION,P,TD,PRE,CODE,H2,H3,H4,H5,H6'.split(','),

    // All of the regular expressions in use within readability.
    // Defined up here so we don't instantiate them repeatedly in loops.
    //
    // NOTE(review): `normalize` and `flatten` carry the `g` flag, so these
    // shared instances keep `lastIndex` state across test()/exec() calls.
    // Use them with replace(), or reset `lastIndex` before testing.
    REGEXPS: {
        safe: /hentry|(?:instapaper|article).body|markdown|\bfulltext/i,
        unlikelyCandidates: /auth?or|similar|ignore|\binfo|annoy|clock|\bdate|\btime|footer|com(?:bx|ment|munity)|banner|intro|log.{2}n|edcolinks|hidd?e|about|bookmark|\bcat|search|social|robot|published|mast(?:head)|subscri|category|disqus|extra|head(?:er|note)|floor|agegate|menu|function|remark|rss|tool|header|teaserlist|widget|meta|adsense|inner-?ad|ad-|\badv\b|\bads\b|agr?egate?|pager|sidebar|popup|tweet|twit|like/i,
        okMaybeItsACandidate: /and|out(?:er|side)|wrap|post|article\b|body|entry|\bmain|page|contain|\bcontent|column|general|detail|shadow|lightbox|blog/i,
        positive: /read|full|article|source|content|body|\bcontent|contain|\bentry|main|page|attach|post|text|blog|story/i,
        negative: /pag(?:er|ination)|\bdate|\btime|nav|tag|extra|keyword|foot(?:note)?|^hid$|hid$|\bhid\b|^hid|all|bottom|stat|info|modal|outbrain|masthead|com-|contact|_nav|link|media|\bout|skyscraper|promo|\bad-|related|scroll|shoutbox|sponsor|shopping|teaser/i,
        extraneous: /\bprint|archive|comment|discuss|e?[\-]?mail|share|reply|sign|single|utility/i,
        authorline: /byline|author|dateline|writtenby/i,
        // Matches inline style text that hides an element.
        styleFilter: /display\s*:\s*none|visibility\s*:\s*hidden/i,
        // Runs of two or more whitespace characters.
        normalize: /\s{2,}/g,
        // Line breaks together with any whitespace / &nbsp; that follows.
        flatten: /(?:[\r\n](?:\s|&nbsp;)*)+/g,
        // Protocol-relative or absolute URLs on known media hosts.
        videos: /\/\/(?:[^.?\/]+\.)?(?:youtu(?:be)?|soundcloud|vimeo|imgur|gfycat|dailymotion|cliphunter|twitch|vid|pornhub|xvideos|twitvid|rutube|viddler)\.(?:com|me|be|org|net|tv|ru)/i,
        nextLink: /(next|newer|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
        prevLink: /(prev|earl|old|new|<|«)/i,
        pages: /pag(?:e|ing|inat)/i,
        pageNumber: /p[ag]{0,2}(?:e|ing|ination)?[=\/]\d{1,2}/i,
        // Empty or whitespace-only text; used by _nextElement().
        whitespace: /^\s*$/,
        // Text whose last character is not whitespace.
        hasContent: /\S$/,
        // File names ending in a common image extension.
        imgExt: /\.(?:gif|svg|jpe?g|a?png|webp)$/i
    },

    // Upper-case tag names.
    // NOTE(review): presumably a <div> containing none of these is turned
    // into a <p>; confirm against the code that reads this list.
    DIV_TO_P_ELEMS: ['A', 'BLOCKQUOTE', 'DL', 'DIV', 'IMG', 'OL', 'P', 'PRE', 'TABLE', 'UL', 'SELECT', 'CODE', 'FOOTER', 'ASIDE'],

    // Upper-case tag names.
    // NOTE(review): presumably tags that are left alone when other nodes are
    // converted to <div>; confirm.
    ALTER_TO_DIV_EXCEPTIONS: ['DIV', 'ARTICLE', 'SECTION', 'P'],

    // Upper-case tag names.
    // NOTE(review): presumably skipped during candidate scoring; confirm.
    NODES_TO_IGNORE: ['HTML', 'HEAD', 'BODY', 'ARTICLE'],

    // Attribute names. NOTE(review): presumably the attributes kept by
    // _filterAttributes(); confirm. 'type' is listed twice, which is harmless
    // for membership checks.
    ATTRIBUTE_WHITELIST: ['lang', 'src', 'href', 'type', 'alt', 'title', 'data', 'height', 'width', 'name', 'value', 'type', 'border', 'frameborder', 'colspan', 'rowspan', 'span', 'cite'],

    // Lower-case tag names of embedded media elements.
    MEDIA_NODES: ['object', 'embed', 'iframe', 'audio', 'video'],

    // Lower-case tag names passed to _clean() by _prepArticle().
    NODES_TO_CLEAN: ['iframe', 'audio', 'video', 'object', 'embed', 'applet', 'h1', 'footer', 'input', 'button', 'nav', 'canvas', 'time'],

    // raw HTML filters
    // Each entry is { r: RegExp, s: replacement }. _replaceAll() applies them
    // in order to a node's innerHTML. Order matters: newlines are swapped for
    // \uffff so the script-stripping pattern can span lines, then restored.
    // NOTE(review): the newline swap and script filter appear twice in a row;
    // the second pair looks redundant. Confirm before removing.
    _preFilters: [
        { r: /<html[^>]+>/gi, s: '<html>' }, // HTML5 namespaced
        { r: /^\s+|\s+$/g, s: '' }, // trim()
        { r: /[\r\n]+(?=\n{2})/g, s: '' },
        { r: /(?:<br\/>(?:\s|&nbsp;?)*)+(?=<\/?p)/gi, s: '' }, // replace excessive br's
        { r: /(?:\s|&nbsp;?)+(?=<br\/>)/g, s: '' }, // remove spaces in front of <br>s
        { r: /(?:<br\/>){2,}/gi, s: '</p><p>' }, // all double+ <br>s into <p>s
        { r: /\n/g, s: '\uffff' }, // filter scripts...
        { r: /<script.*?>.*?<\/script>/gi, s: '' },
        { r: /\n/g, s: '\uffff' },
        { r: /<script.*?>.*?<\/script>/gi, s: '' },
        { r: /\uffff/g, s: '\n' }, // ...filter scripts
        /* { r: /<(\/?)noscript/gi, s: '<$1div' }, // expand noscript*/
        { r: /<(\/?)font[^>]*>/gi, s: '<$1span>' }, // fonts to spans
        { r: /<\/?span[^>]*>/gi, s: '' } // remove spans as we redefine styles and they're probably special-styled
    ],

    // output HTML filters
    // Same { r, s } shape as _preFilters; applied by _postProcessContent().
    _postFilters: [
        { r: /<(?:a|div|span|p|i|strong)[^>]+\/>|<(?:a|div|span|p|i|strong)[^>]*>\s*<\/(?:a|div|span|p|i|strong)[^>]*>/gi, s: '' } // replace empty tags that break layouts
    ],

    /**
     * Replace innerHTML of a node based on array of custom regexp filters.
     *
     * @param Element
     * @param Array
     * @return void
     **/
    _replaceAll: function(content, filters) {
        if (!content || !content.innerHTML) return;
        for (var i = 0, l = filters.length; i < l; i++) {
            content.innerHTML = String.prototype.replace.apply(content.innerHTML, [filters[i].r, filters[i].s]);
        }
    },

    /**
     * Run any post-process modifications to article content if necessary.
     *
     * @param Element
     * @return void
     **/
    _postProcessContent: function(articleContent) {
        if (!articleContent) return;
        this._replaceAll(articleContent, this._postFilters);
        if (this._debug) return;
        this._filterAttributes(articleContent);
    },

    /**
     * Run any pre-process modifications to article content if necessary.
     *
     * @param Element
     * @return void
     **/
    _preProcessContent: function(articleContent) {
        if (!articleContent || !articleContent.documentElement) return;
        this._replaceAll(articleContent, this._preFilters);
        this._fixRelativeUris(articleContent);
    },

    /**
     * Iterates over a NodeList, calls `filterFn` for each node and removes node
     * if function returned `true`.
     *
     * If function is not passed, removes all the nodes in node list.
     *
     * @param NodeList nodeList The no
     * @param Function filterFn
     * @return void
     */
    _removeNodes: function(nodeList, filterFn) {
        for (var parentNode, node, i = nodeList.length; i--;) {
            node = nodeList[i];
            parentNode = node.parentNode;
            if (parentNode && (!filterFn || filterFn.call(this, node, i, nodeList)))
                parentNode.removeChild(node);
        }
    },

    /**
     * Iterate over a NodeList, which doesn't natively fully implement the Array
     * interface.
     *
     * For convenience, the current object context is applied to the provided
     * iterate function.
     *
     * @param  NodeList nodeList The NodeList.
     * @param  Function fn       The iterate function.
     * @param  Boolean  backward Whether to use backward iteration.
     * @return void
     */
    _forEachNode: function(nodeList, fn, backward) {
        var i, l;
        if (backward) {
            for (i = nodeList.length; i--;)
                fn(nodeList[i], i, this);
        } else {
            for (i = 0, l = nodeList.length; i < l; i++)
                fn(nodeList[i], i, this);
        }
    },

    /**
     * Iterate over a NodeList, return true if any of the provided iterate
     * function calls returns true, false otherwise.
     *
     * For convenience, the current object context is applied to the
     * provided iterate function.
     *
     * @param  NodeList nodeList The NodeList.
     * @param  Function fn       The iterate function.
     * @return Boolean
     */
    _someNode: function(nodeList, fn) {
        return Array.prototype.some.call(nodeList, fn, this);
    },

    _getAllNodesWithTag: function(node, tagNames) {
        if (node.querySelectorAll) {
            return node.querySelectorAll(tagNames.join(','));
        }
        return [].concat.apply([], tagNames.map(function(tag) {
            var collection = node.getElementsByTagName(tag);
            return Array.isArray(collection) ? collection : Array.from(collection);
        }));
    },

    /**
     * Converts each <a> and <img> uri in the given element to an absolute URI,
     * ignoring #ref URIs.
     *
     * @param Element
     * @return void
     */
    _fixRelativeUris: function(articleContent) {
        var scheme = this._uri.scheme;
        var prePath = this._uri.prePath;
        var pathBase = this._uri.pathBase;

        var toAbsoluteURI = function(uri) {
            // If this is already an absolute URI, return it.
            if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
                return uri;

            // Scheme-rooted relative URI.
            if (uri.substr(0, 2) == '//')
                return scheme + '://' + uri.substr(2);

            // Prepath-rooted relative URI.
            if (uri[0] == '/')
                return prePath + uri;

            // Dotslash relative URI.
            if (uri.indexOf('./') === 0)
                return pathBase + uri.slice(2);

            // Ignore hash URIs:
            if (uri[0] == '#')
                return uri;

            // Standard relative URI; add entire path. pathBase already includes a
            // trailing '/'.
            return pathBase + uri;
        };

        var fixHref = function(link) {
            var href = link.getAttribute('href');
            if (href) {
                // Replace links with javascript: URIs with text content, since
                // they won't work after scripts have been removed from the page.
                if (/javascript\s*:/i.test(href)) {
                    link.parentNode.replaceChild(this._doc.createTextNode(link.textContent), link);
                } else {
                    link.setAttribute('href', toAbsoluteURI(href));
                }
            }
        };

        var links = this._getAllNodesWithTag(articleContent, ['a', 'link']);
        this._forEachNode(links, fixHref.bind(this), true);

        var media = this._getAllNodesWithTag(articleContent, ['img', 'source']);
        this._forEachNode(media, function(node) {
            var src = node.getAttribute('src');
            if (src)
                node.setAttribute('src', toAbsoluteURI(src));
        }, true);
    },

    /**
     * Get the article title as an H1.
     *
     * @return void
     **/
    _getArticleTitle: function() {
        var doc = this._doc;
        var curTitle = '';
        var origTitle = '';

        try {
            curTitle = origTitle = doc.title;

            // If they had an element with id/class 'title' in their HTML
            if (typeof curTitle !== 'string')
                curTitle = origTitle =
                this._getInnerText(this._getAllNodesWithTag(doc, ['title', '[id*="title"]', '[class*="title"]']));
        } catch (e) {}

        if (curTitle.match(/ [\|\-|\xbb] /)) {
            curTitle = origTitle.replace(/(.*)[\|\-|\xbb] .*/gi, '$1');

            if (curTitle.split(' ').length < 3)
                curTitle = origTitle.replace(/[^\|\-|\xbb]*[\|\-|\xbb](.*)/gi, '$1');
        } else if (curTitle.indexOf(': ') !== -1) {
            var match = this._someNode(this._getAllNodesWithTag(doc, ['h1', 'h2', 'h3']), function(heading) {
                return heading.textContent === curTitle;
            });

            // If we don't, let's extract the title out of the original title string.
            if (!match) {
                curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);

                // If the title is now too short, try the first colon instead:
                if (curTitle.split(' ').length < 3)
                    curTitle = origTitle.substring(origTitle.indexOf(':') + 1);
            }
        } else if (curTitle.length > 150 || curTitle.length < 15) {
            var hOnes = doc.getElementsByTagName('h1');

            if (hOnes.length === 1)
                curTitle = this._getInnerText(hOnes[0]);
        }

        curTitle = curTitle.trim();

        if (curTitle.split(' ').length <= 5)
            curTitle = origTitle;

        return curTitle;
    },

    /**
     * Prepare the HTML document for readability to scrape it.
     * This includes things like stripping javascript, CSS, and handling terrible markup.
     *
     * @return void
     **/
    _prepDocument: function() {
        var doc = this._doc;

        // Remove all style tags in head
        this._removeNodes(this._getAllNodesWithTag(doc, ['style', 'link']));

        if (doc.body)
            this._replaceBrs(doc.body);
    },

    /**
     * Finds the next element, starting from the given node, and ignoring
     * whitespace in between. If the given node is an element, the same node is
     * returned.
     */
    _nextElement: function(node) {
        var next = node;
        while (next && (next.nodeType != Node.ELEMENT_NODE) && this.REGEXPS.whitespace.test(next.textContent)) {
            next = next.nextSibling;
        }
        return next;
    },

    /**
     * Replaces 2 or more successive <br> elements with a single <p>.
     * Whitespace between <br> elements are ignored. For example:
     *   <div>foo<br>bar<br> <br><br>abc</div>
     * will become:
     *   <div>foo<br>bar<p>abc</p></div>
     */
    _replaceBrs: function(elem) {
        var replBr = function(br) {
            var next = br.nextSibling;

            // Whether 2 or more <br> elements have been found and replaced with a
            // <p> block.
            var replaced = false;

            // If we find a <br> chain, remove the <br>s until we hit another element
            // or non-whitespace. This leaves behind the first <br> in the chain
            // (which will be replaced with a <p> later).
            while ((next = this._nextElement(next)) && (next.tagName == 'BR')) {
                replaced = true;
                var brSibling = next.nextSibling;
                next.parentNode.removeChild(next);
                next = brSibling;
            }

            // If we removed a <br> chain, replace the remaining <br> with a <p>. Add
            // all sibling nodes as children of the <p> until we hit another <br>
            // chain.
            if (replaced) {
                var p = this._doc.createElement('p');
                br.parentNode.replaceChild(p, br);

                next = p.nextSibling;
                while (next) {
                    // If we've hit another <br><br>, we're done adding children to this <p>.
                    if (next.tagName == 'BR') {
                        var nextElem = this._nextElement(next);
                        if (nextElem && nextElem.tagName == 'BR')
                            break;
                    }

                    // Otherwise, make this node a child of the new <p>.
                    var sibling = next.nextSibling;
                    p.appendChild(next);
                    next = sibling;
                }
            }
        };
        this._forEachNode(this._getAllNodesWithTag(elem, ['br']), replBr.bind(this));
    },

    _setNodeTag: function(node, tag) {
        //this.log('_setNodeTag', node, tag);
        if (node.__JSDOMParser__) {
            node.localName = tag.toLowerCase();
            node.tagName = tag.toUpperCase();
            return node;
        }

        var replacement = node.ownerDocument.createElement(tag);
        while (node.firstChild)
            replacement.appendChild(node.firstChild);

        node.parentNode.replaceChild(replacement, node);
        if (node.readability)
            replacement.readability = node.readability;

        for (var i = 0; i < node.attributes.length; i++)
            replacement.setAttribute(node.attributes[i].name, node.attributes[i].value);

        return replacement;
    },

    /**
     * Prepare the article node for display. Clean out any inline styles,
     * iframes, forms, strip extraneous <p> tags, etc.
     *
     * @param Element
     * @return void
     **/
    _prepArticle: function(articleContent) {
        this._cleanStyles(articleContent);

        // Clean out junk from the article content
        this._cleanConditionally(articleContent, ['form']);
        this._clean(articleContent, this.NODES_TO_CLEAN);

        // If there is only one h2, they are probably using it as a header
        // and not a sub-header, so remove it since we already have a header.
        var h2 = articleContent.getElementsByTagName('h2');
        if (h2.length === 1 && h2[0].length < this.MIN_NODE_LENGTH)
            this._clean(articleContent, 'h2');

        var h3 = articleContent.getElementsByTagName('h3');
        if (h3.length === 1 && h3[0].length < this.MIN_NODE_LENGTH)
            this._clean(articleContent, 'h3');

        this._cleanHeaders(articleContent);

        // Do these last as the previous stuff may have removed junk
        // that will affect these
        this._cleanConditionally(articleContent, ['table', 'ul', 'div']);

        // Remove extra paragraphs
        this._removeNodes(articleContent.getElementsByTagName('p'), function(paragraph) {
            var imgCount = paragraph.getElementsByTagName('img').length;
            var embedCount = paragraph.getElementsByTagName('embed').length;
            var objectCount = paragraph.getElementsByTagName('object').length;
            var videoCount = paragraph.getElementsByTagName('video').length;
            var audioCount = paragraph.getElementsByTagName('audio').length;
            var iframeCount = paragraph.getElementsByTagName('iframe').length;
            var totalCount = imgCount + embedCount + objectCount + videoCount + audioCount + iframeCount;

            return totalCount === 0 && !this._getInnerText(paragraph, false, true);
        });

        var self = this;
        this._forEachNode(this._getAllNodesWithTag(articleContent, ['br']), function(br) {
            var next = self._nextElement(br.nextSibling);
            if (next && next.tagName == 'P')
                br.parentNode.removeChild(br);
        });
    },

    /**
     * Initialize a node with the readability object. Also checks the
     * className/id for special names to add to its score.
     *
     * @param Element
     * @return void
     **/
    _initializeNode: function(node) {
        node.readability = { 'contentScore': 0 };

        switch (node.tagName) {
            case 'ARTICLE':
                node.readability.contentScore += 50;
                break;
            case 'DIV':
                node.readability.contentScore += 5;
                break;
            case 'PRE':
            case 'CODE':
            case 'TD':
            case 'BLOCKQUOTE':
            case 'FIGURE':
                node.readability.contentScore += 3;
                break;
                //case 'SECTION': // often misused
            case 'ADDRESS':
            case 'OL':
            case 'UL':
            case 'DL':
                node.readability.contentScore -= 2 * Math.ceil(this._getLinkDensity(node));
                break;
                //case 'ASIDE':
            case 'FOOTER':
            case 'HEADER':
            case 'ADDRESS':
            case 'FORM':
            case 'BUTTON':
            case 'TEXTAREA':
            case 'INPUT':
            case 'NAV':
                node.readability.contentScore -= 3;
                break;
            case 'H1':
            case 'H2':
            case 'H3':
            case 'H4':
            case 'H5':
            case 'H6':
            case 'TH':
            case 'HGROUP':
                node.readability.contentScore -= 8;
        }

        node.readability.contentScore += this._getAttributesWeight(node);
    },

    _removeAndGetNext: function(node) {
        var nextNode = this._getNextNode(node, true);
        node.parentNode.removeChild(node);
        return nextNode;
    },

    /**
     * Traverse the DOM from node to node, starting at the node passed in.
     * Pass true for the second parameter to indicate this node itself
     * (and its kids) are going away, and we want the next node over.
     *
     * Calling this in a loop will traverse the DOM depth-first.
     */
    _getNextNode: function(node, ignoreSelfAndKids) {
        // First check for kids if those aren't being ignored
        if (!ignoreSelfAndKids && node.firstElementChild)
            return node.firstElementChild;

        // Then for siblings...
        if (node.nextElementSibling)
            return node.nextElementSibling;

        // And finally, move up the parent chain *and* find a sibling
        // (because this is depth-first traversal, we will have already
        // seen the parent nodes themselves).
        do {
            node = node.parentNode;
        } while (node && !node.nextElementSibling);
        return node && node.nextElementSibling;
    },

    /**
     * Like _getNextNode, but for DOM implementations with no
     * firstElementChild/nextElementSibling functionality...
     */
    _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
        function nextSiblingEl(n) {
            do {
                n = n.nextSibling;
            } while (n && n.nodeType !== n.ELEMENT_NODE);
            return n;
        }
        // First check for kids if those aren't being ignored
        if (!ignoreSelfAndKids && node.children[0]) {
            return node.children[0];
        }
        // Then for siblings...
        var next = nextSiblingEl(node);
        if (next)
            return next;

        // And finally, move up the parent chain *and* find a sibling
        // (because this is depth-first traversal, we will have already
        // seen the parent nodes themselves).
        do {
            node = node.parentNode;
            if (node)
                next = nextSiblingEl(node);
        } while (node && !next);
        return node && next;
    },

    _checkAuthorLine: function(node, matchString) {
        if (this._articleAuthor)
            return false;

        var rel;
        if (typeof node.getAttribute !== 'undefined')
            rel = node.getAttribute('rel');

        if ((rel === 'author' || this.REGEXPS.authorline.test(matchString)) && this._isValidAuthorLine(node.textContent)) {
            this._articleAuthor = node.textContent.trim();
            return true;
        }

        return false;
    },

    _getWordCount: function(text) {
        var length = 0;
        text.split(/[\s.,;]/).every(function(v) {
            if (v.length > this.MIN_WORD_LENGTH) length++;
            return true; });
        return length;
    },

    _getNodeAncestors: function(node, maxDepth) {
        maxDepth = maxDepth || 0;
        var i = 0,
            ancestors = [];
        while (node.parentNode) {
            ancestors.push(node.parentNode);
            if (maxDepth && ++i === maxDepth)
                break;
            node = node.parentNode;
        }
        return ancestors;
    },

    _filterAttributes: function(doc) {
        var nodes = doc.querySelectorAll('*');

        for (var i = 0, l = nodes.length; i < l; i++) {
            var attributes = nodes[i].attributes,
                j = attributes.length;
            while (j--) {
                var attr = attributes[j];
                if (this.ATTRIBUTE_WHITELIST.indexOf(attr.name.toLowerCase()) === -1)
                    nodes[i].removeAttributeNode(attr);
            }
        }
    },

    /***
     * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
     *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
     *
     * @param page a document to run upon. Needs to be a full document, complete with body.
     * @return Element
     **/
    _grabArticle: function(page, grab_callback) {
        this.log('\n**** grabArticle ****\n');
        var isPaging = false;

        // We can't grab an article if we don't have a page!
        if (!page) {
            this.log('No body found in document. Abort.');
            return grab_callback(null);
        }

        var doc = page.documentElement;
        var pageCacheHtml = page.innerHTML;

        // Check if any 'dir' is set on the toplevel document element
        this._articleDir = page.documentElement.getAttribute('dir');

        var stripUnlikelyCandidates = false;//this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)

        while (true) {
            // First, node prepping. Trash nodes that look cruddy (like ones with the
            // class name 'comment', etc), and turn divs into P tags where they have been
            // used inappropriately (as in, where they contain no other block level elements.)
            var elementsToScore = [],
                tname = '',
                node = doc;

            while (node) {
                tname = node.tagName;

                if (~this.NODES_TO_IGNORE.indexOf(tname)) {
                    node = this._getNextNode(node);
                    continue;
                }

                // Check to see if this node is an author line, and remove it if it is.
                if (this._checkAuthorLine(node, matchString)) {
                    node = this._removeAndGetNext(node);
                    continue;
                }

                // Remove unlikely candidates
                var matchString = node.className + ' ' + node.id;
                if (stripUnlikelyCandidates) {
                    if ((node.style && node.style.cssText && REGEXPS.styleFilter.test(node.style.cssText)) ||
                        ((this.REGEXPS.unlikelyCandidates.test(matchString) ||
                        this.REGEXPS.extraneous.test(matchString)) &&
                        !this.REGEXPS.okMaybeItsACandidate.test(matchString)) &&
                        tname !== 'IMG' &&
                        tname !== 'A')
                    {
                        this.log('Removing unlikely candidate by "', matchString, '" /', ((matchString.match(this.REGEXPS.unlikelyCandidates) || [])[0] || ''), '/')
                        node = this._removeAndGetNext(node);
                        continue;
                    }
                }

                if (this.DEFAULT_TAGS_TO_SCORE.indexOf(tname) !== -1) {
                    elementsToScore.push(node);
                }

                if (this._replaceImgs && tname === "A" && node.children.length === 1 &&
                    node.children[0].tagName === "IMG" &&
                    this.REGEXPS.imgExt.test(node.href)) {
                    node.children[0].src = node.parentNode.href;
                }

                // Turn all divs that don't have children block level elements into p's
                if (tname === 'DIV') {
                    // Sites like http://mobile.slate.com encloses each paragraph with a DIV
                    // element. DIVs with only a P element inside and no text content can be
                    // safely converted into plain P elements to avoid confusing the scoring
                    // algorithm with DIVs with are, in practice, paragraphs.
                    if (this._hasSinglePInsideElement(node)) {
                        var newNode = node.children[0];
                        node.parentNode.replaceChild(newNode, node);
                        node = newNode;
                    } else if (!this._hasChildBlockElement(node)) {
                        node = this._setNodeTag(node, 'P');
                        elementsToScore.push(node);
                    } else {
                        // EXPERIMENTAL
                        this._forEachNode(node.childNodes, function(childNode) {
                            if (childNode.nodeType === Node.TEXT_NODE) {
                                var p = page.createElement('p');
                                p.textContent = childNode.textContent;
                                p.style.display = 'inline';
                                p.className = 'readability-styled';
                                node.replaceChild(p, childNode);
                            }
                        });
                    }
                }
                node = this._getNextNode(node);
            }

            /**
             * Loop through all paragraphs, and assign a score to them based on how content-y they look.
             * Then add their score to their parent node.
             *
             * A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
             **/
            var candidates = [],
                scoreFn = function(elementToScore) {
                    if (!elementToScore.parentNode || typeof elementToScore.parentNode.tagName === 'undefined')
                        return;

                    // Add a point for the paragraph itself as a base.
                    var isImage = elementToScore.tagName === "IMG", contentScore = 1;

                    if (isImage) {
                        if (elementToScore.getAttribute('alt'))
                            contentScore += 5;
                        var value = parseInt(elementToScore.getAttribute('width'), 10);
                        if (isNaN(value)); // NaN (skip)
                        else if (value <= 32)
                            this._setNodeTag(elementToScore, "noscript"); // remove from scan
                        else if (value >= 350)
                            contentScore += 20;
                        else if (value >= 128)
                            contentScore += 5;
                    }

                    // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it.
                    var innerText = this._getInnerText(elementToScore);
                    if (!isImage && innerText.length < this.MIN_PARAGRAPH_LENGTH)
                        return;

                    // Exclude nodes with no ancestor.
                    var ancestors = this._getNodeAncestors(elementToScore, this.MAX_ANCESTORS);
                    if (ancestors.length === 0)
                        return;

                    // Add points for any words/commas within this paragraph.
                    //contentScore += Math.min(Math.floor(this._getWordCount(innerText) / Math.max((innerText.match(/\b[,.]\s/g) || []).length, 1)), 3);
                    contentScore += this._getCharCount(innerText, ',');

                    // For every SCORE_CHARS_IN_PARAGRAPH (default:100) characters in this paragraph, add another point. Up to 3 points.
                    contentScore += Math.min(Math.floor(innerText.length / this.SCORE_CHARS_IN_PARAGRAPH), 3);

                    // For every SCORE_WORDS_IN_PARAGRAPH (default:20) words in this paragraph, add another point. Up to 3 points.
                    contentScore += Math.min(Math.floor(this._getWordCount(innerText) / this.SCORE_WORDS_IN_PARAGRAPH), 3);

                    // Initialize and score ancestors.
                    var scoreAn = function(ancestor, level) {
                        if (!ancestor.tagName)
                            return;

                        if (typeof(ancestor.readability) === 'undefined') {
                            this._initializeNode(ancestor);
                            candidates.push(ancestor);
                        }

                        // Node score divider:
                        // - parent:       no division
                        // - grandparent+: ancestor level * GRANDPARENT_UP_SCORE_DIVISOR
                        var scoreDivider = 1;
                        if (level !== 0)
                            scoreDivider = level * this.GRANDPARENT_UP_SCORE_DIVISOR;
                        ancestor.readability.contentScore += Math.floor(contentScore / scoreDivider);
                    };

                    this._forEachNode(ancestors, scoreAn.bind(this));
                };

            this._forEachNode(elementsToScore, scoreFn.bind(this));

            // After we've calculated scores, loop through all of the possible
            // candidate nodes we found and find the one with the highest score.
            var topCandidates = [];
            for (var c = 0, cl = candidates.length; c < cl; c += 1) {
                var candidate = candidates[c];

                // Scale the final candidates score based on link density. Good content
                // should have a relatively small link density (5% or less) and be mostly
                // unaffected by this operation.
                var candidateScore = Math.floor(candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)));
                candidate.readability.contentScore = candidateScore;

                this.log('Candidate:', candidate, 'with score', (candidateScore || 0))

                for (var t = 0; t < this._nbTopCandidates; t++) {
                    var aTopCandidate = topCandidates[t];

                    if (!aTopCandidate || candidateScore > aTopCandidate.readability.contentScore) {
                        topCandidates.splice(t, 0, candidate);
                        if (topCandidates.length > this._nbTopCandidates)
                            topCandidates.pop();
                        break;
                    }
                }
            }

            var topCandidate = topCandidates[0] || null;
            var neededToCreateTopCandidate = false;

            // If we still have no top candidate, just use the body as a last resort.
            // We also have to copy the body node so it is something we can modify.
            if (!topCandidate || topCandidate.tagName === 'BODY') {
                // Move all of the page's children into topCandidate
                topCandidate = page.createElement('DIV');
                neededToCreateTopCandidate = true;
                // Move everything (not just elements, also text nodes etc.) into the container
                // so we even include text directly in the body:
                var kids = page.childNodes;
                while (kids.length) {
                    this.log('Moving child out:', kids[0])
                    topCandidate.appendChild(kids[0]);
                }

                page.appendChild(topCandidate);

                this._initializeNode(topCandidate);
            } else if (topCandidate) {
                // Because of our bonus system, parents of candidates might have scores
                // themselves. They get half of the node. There won't be nodes with higher
                // scores than our topCandidate, but if we see the score going *up* in the first
                // few steps up the tree, that's a decent sign that there might be more content
                // lurking in other places that we want to unify in. The sibling stuff
                // below does some of that - but only if we've looked high enough up the DOM
                // tree.
                var parentOfTopCandidate = topCandidate.parentNode;
                var lastScore = topCandidate.readability.contentScore;
                // The scores shouldn't get too low.
                var scoreThreshold = lastScore / 3;
                while (parentOfTopCandidate && parentOfTopCandidate.readability) {
                    var parentScore = parentOfTopCandidate.readability.contentScore;
                    if (parentScore < scoreThreshold)
                        break;
                    if (parentScore > lastScore) {
                        // Alright! We found a better parent to use.
                        topCandidate = parentOfTopCandidate;
                        break;
                    }
                    lastScore = parentOfTopCandidate.readability.contentScore;
                    parentOfTopCandidate = parentOfTopCandidate.parentNode;
                }

                // Check if data is inside a table end set topCandidate to it if found
                var parentNode = topCandidate,
                    tagn;
                while (parentNode) {
                    tagn = parentNode.tagName
                    if (tagn === 'TABLE' || tagn === 'UL' || tagn === 'OL') {
                        topCandidate = parentNode;
                        break;
                    }
                    parentNode = parentNode.parentNode;
                }
            }

            this.log('\n\nThe primary content:', topCandidate, '\n')

            // Now that we have the top candidate, look through its siblings for content
            // that might also be related. Things like preambles, content split by ads
            // that we removed, etc.
            var articleContent = page.createElement('div');

            if (isPaging)
                articleContent.id = 'readability-content';

            var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
            var siblings = topCandidate.parentNode.children;

            for (var s = 0, sl = siblings.length; s < sl; s++) {
                var sibling = siblings[s];
                var append = false;

                this.log('Looking at sibling node:', sibling.tagName, sibling, 'with score', (sibling.readability ? sibling.readability.contentScore : '-'))
                this.log('Sibling has score', (sibling.readability ? (sibling.readability.contentScore || 0) : 0));

                if (sibling === topCandidate) {
                    append = true;
                } else {
                    var contentBonus = 0;

                    var produceAtrArray = function (node) {
                        return ((node.className || '') + ' ' + (node.id || ''))
                            .replace(/[_-]/g, ' ')
                            .split(' ')
                            .filter(function(v) {
                                return v === '' });
                    }

                    var intersectArrays = function (arrays) {
                        return arrays.shift().filter(function(v) {
                            return arrays.every(function(a) {
                                return a.indexOf(v) !== -1;
                            });
                        })
                    }

                    // Give a bonus if sibling nodes and top candidates have same classname group
                    if (intersectArrays([produceAtrArray(sibling), produceAtrArray(topCandidate)]).length)
                        contentBonus += topCandidate.readability.contentScore * 0.2;

                    if (sibling.readability &&
                        ((sibling.readability.contentScore + contentBonus) >= siblingScoreThreshold)) {
                        append = true;
                    } else if (sibling.nodeName === 'P') {
                        var linkDensity = this._getLinkDensity(sibling),
                            nodeContent = this._getInnerText(sibling),
                            nodeLength = nodeContent.length;

                        if (nodeLength > this.MIN_NODE_LENGTH &&
                            linkDensity < this.MAX_LINK_DENSITY)
                            append = true;
                        else if (nodeLength > 0 &&
                            nodeLength < this.MIN_NODE_LENGTH &&
                            linkDensity === 0 &&
                            nodeContent.search(/\.( |$)/) !== -1)
                            append = true;
                    }
                }

                if (append) {
                    this.log('Appending node:', sibling);

                    if (this.ALTER_TO_DIV_EXCEPTIONS.indexOf(sibling.nodeName) === -1) {
                        // We have a node that isn't a common block level element, like a form or td tag.
                        // Turn it into a div so it doesn't get filtered out later by accident.
                        this.log('Altering sibling:', sibling, 'to div.');
                        sibling = this._setNodeTag(sibling, 'DIV');
                    }

                    articleContent.appendChild(sibling);
                    // siblings is a reference to the children array, and
                    // sibling is removed from the array when we call appendChild().
                    // As a result, we must revisit this index since the nodes
                    // have been shifted.
                    s -= 1;
                    sl -= 1;
                }
            }

            //if (this._debug)
            //this.log('Article content pre-prep: ' + articleContent.innerHTML);
            // So we have all of the content that we need. Now we clean it up for presentation.
            this._prepArticle(articleContent);
            //if (this._debug)
            //this.log('Article content post-prep: ' + articleContent.innerHTML);

            if (this._curPageNum === 1) {
                if (neededToCreateTopCandidate) {
                    // We already created a fake div thing, and there wouldn't have been any siblings left
                    // for the previous loop, so there's no point trying to create a new div, and then
                    // move all the children over. Just assign IDs and class names here. No need to append
                    // because that already happened anyway.
                    topCandidate.id = 'readability-page-1';
                    topCandidate.className = 'page';
                } else {
                    var div = page.createElement('DIV');
                    div.id = 'readability-page-1';
                    div.className = 'page';
                    var children = articleContent.childNodes;
                    while (children.length) {
                        div.appendChild(children[0]);
                    }
                    articleContent.appendChild(div);
                }
            }

            //if (this._debug)
            //this.log('Article content after paging: ' + articleContent.innerHTML);

            this._removeScripts(articleContent);
            this._postProcessContent(articleContent);

            // Now that we've gone through the full algorithm, check to see if
            // we got any meaningful content. If we didn't, we may need to re-run
            // grabArticle with different flags set. This gives us a higher likelihood of
            // finding the content, and the sieve approach gives us a higher likelihood of
            // finding the -right- content.
            if (this._getInnerText(articleContent, true).length < 500) {
                page.innerHTML = pageCacheHtml;

                if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
                    this.log('!!!!! Failed to detect content. Removing FLAG_STRIP_UNLIKELYS...');
                    this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
                } else if (this._flagIsActive(this.FLAG_WEIGHT_ATTRIBUTES)) {
                    this.log('!!!!! Failed to detect content. Removing FLAG_WEIGHT_ATTRIBUTES...');
                    this._removeFlag(this.FLAG_WEIGHT_ATTRIBUTES);
                } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
                    this.log('!!!!! Failed to detect content. Removing FLAG_CLEAN_CONDITIONALLY...');
                    this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
                } else {
                    this.log('!!!!! Failed to detect any content.');
                    return grab_callback(null);
                }
            } else
                return grab_callback(articleContent);
        }
    },

    /**
     * Check whether the input string could be a authorline.
     * This verifies that the input is a string, and that the length
     * is less than 100 chars.
     *
     * @param possibleauthorline {string} - a string to check whether its a authorline.
     * @return Boolean - whether the input string is a authorline.
     */
    _isValidAuthorLine: function(authorline) {
        if (typeof authorline == 'string' || authorline instanceof String) {
            authorline = authorline.trim();
            return (authorline.length > 0) && (authorline.length < 100);
        }
        return false;
    },

    /**
     * Attempts to get excerpt and author metadata for the article.
     *
     * @return Object with optional 'excerpt' and 'author' properties
     */
    _getArticleMetadata: function() {
        var metadata = {};
        var values = {};
        var metaElements = this._doc.getElementsByTagName('meta');

        // Match 'description', or Twitter's 'twitter:description' (Cards)
        // in name attribute.
        var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/gi;

        // Match Facebook's Open Graph title & description properties.
        var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/gi;

        // Find description tags.
        this._forEachNode(metaElements, function(element) {
            var elementName = element.getAttribute('name');
            var elementProperty = element.getAttribute('property');

            if ([elementName, elementProperty].indexOf('author') !== -1) {
                metadata.author = element.getAttribute('content');
                return;
            }

            if ([elementName, elementProperty] !== -1) {
                metadata.author = element.getAttribute('content');
                return;
            }

            var name = null;
            if (namePattern.test(elementName)) {
                name = elementName;
            } else if (propertyPattern.test(elementProperty)) {
                name = elementProperty;
            }

            if (name) {
                var content = element.getAttribute('content');
                if (content) {
                    if (/^(?:\w+\s*:\s*)?description/.test(name) !== -1) {
                        metadata.excerpt = content.trim();
                        return;
                    } else if (/^(?:\w+\s*:\s*)?title/.test(name) !== -1) {
                        metadata.title = content.trim();
                        return;
                    }
                }
            }
        });

        return metadata;
    },

    /**
     * Removes script tags from the document.
     *
     * @param Element
     **/
    _removeScripts: function(doc) {
        this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
            scriptNode.nodeValue = '';
            scriptNode.removeAttribute('src');
            return true;
        });
        this._removeNodes(doc.getElementsByTagName('noscript'));
    },

    /**
     * Check if this node has only whitespace and a single P element
     * Returns false if the DIV node contains non-empty text nodes
     * or if it contains no P or more than 1 element.
     *
     * @param Element
     **/
    _hasSinglePInsideElement: function(element) {
        // There should be exactly 1 element child which is a P:
        if (element.children.length != 1 || element.children[0].tagName !== 'P') {
            return false;
        }

        // And there should be no text nodes with real content
        return !this._someNode(element.childNodes, function(node) {
            return node.nodeType === Node.TEXT_NODE &&
                this.REGEXPS.hasContent.test(node.textContent);
        });
    },

    /**
     * Determine whether element has any children block level elements.
     *
     * @param Element
     */
    _hasChildBlockElement: function(element) {
        return this._someNode(element.childNodes, function(node) {
            return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
                this._hasChildBlockElement(node);
        });
    },

    /**
     * Get the inner text of a node - cross browser compatibly.
     * This also strips out any excess whitespace to be found.
     *
     * @param Element
     * @param Boolean normalizeSpaces (default: true)
     * @param boolean flattenLines (default: false)
     * @return string
     **/
    _getInnerText: function(e, normalizeSpaces, flattenLines) {
        flattenLines = (typeof flattenLines === 'undefined') ? false : flattenLines;
        normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
        var textContent = e.textContent.trim();

        if (flattenLines) {
            return textContent.replace(this.REGEXPS.flatten, '');
        } else if (normalizeSpaces) {
            return textContent.replace(this.REGEXPS.normalize, ' ');
        }
        return textContent;
    },

    /**
     * Get the number of times a string s appears in the text e.
     *
     * @param string - the text to search (the implementation calls string methods on it)
     * @param string - what to split on. Default is ','
     * @return number (integer)
     **/
    _getCharCount: function(e, s) {
        return (e.match(new RegExp(s || ',', 'g')) || []).length;
    },

    /**
     * Remove the style attribute on every e and under.
     * TODO: Test if getElementsByTagName(*) is faster.
     *
     * @param Element
     * @return void
     **/
    _cleanStyles: function(e) {
        e = e || this._doc;
        if (!e)
            return;
        var cur = e.firstChild;

        // Remove any root styles, if we're able.
        if (typeof e.removeAttribute === 'function' && e.className !== 'readability-styled')
            e.removeAttribute('style');

        // Go until there are no more child nodes
        while (cur !== null) {
            if (cur.nodeType === cur.ELEMENT_NODE) {
                // Remove style attribute(s) :
                if (cur.className !== 'readability-styled')
                    cur.removeAttribute('style');

                this._cleanStyles(cur);
            }

            cur = cur.nextSibling;
        }
    },

    /**
     * Get the density of links as a percentage of the content
     * This is the amount of text that is inside a link divided by the total text in the node.
     *
     * @param Element
     * @return number (float)
     **/
    _getLinkDensity: function(element) {
        var textLength = this._getInnerText(element, false, true).length;
        if (textLength === 0)
            return 0;

        var linkLength = 0,
            dRe = this._uri.domainRe,
            excExt = this._excludeExternal,
            excAImg = this._excludeAImg,
            self = this;

        this._forEachNode(element.getElementsByTagName('a'), function(linkNode) {
            if (excExt && !dRe.test(linkNode.getAttribute('href'))) return;
            if (excAImg && linkNode.getElementsByTagName('img').length === 1) return;
            linkLength += self._getInnerText(linkNode, false, true).length;
        });

        return linkLength / textLength;
    },

    /**
     * Find a cleaned up version of the current URL, to use for comparing links for possible next-pageyness.
     *
     * @author Dan Lacy
     * @return string the base url
     **/
    _findBaseUrl: function() {
        var uri = this._uri;
        var noUrlParams = uri.path.split('?')[0];
        var urlSlashes = noUrlParams.split('/').reverse();
        var cleanedSegments = [];
        var possibleType = '';

        for (var i = 0, slashLen = urlSlashes.length; i < slashLen; i += 1) {
            var segment = urlSlashes[i];

            // Split off and save anything that looks like a file type.
            if (segment.indexOf('.') !== -1) {
                possibleType = segment.split('.')[1];

                // If the type isn't alpha-only, it's probably not actually a file extension.
                if (!/[^a-zA-Z]/.test(possibleType))
                    segment = segment.split('.')[0];
            }

            // EW-CMS specific segment replacement. Ugly.
            // Example: http://www.ew.com/ew/article/0,,20313460_20369436,00.html
            if (segment.indexOf(',00') !== -1)
                segment = segment.replace(',00', '');

            // If our first or second segment has anything looking like a page number, remove it.
            if (/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i.test(segment) && ((i === 1) || (i === 0)))
                segment = segment.replace(/((_|-)?p[a-z]*|(_|-))[0-9]{1,2}$/i, '');

            var del = false;

            // If this is purely a number, and it's the first or second segment,
            // it's probably a page number. Remove it.
            if (i < 2 && /^\d{1,2}$/.test(segment))
                del = true;

            // If this is the first segment and it's just 'index', remove it.
            if (i === 0 && segment.toLowerCase() === 'index')
                del = true;

            // If our first or second segment is smaller than 3 characters,
            // and the first segment was purely alphas, remove it.
            if (i < 2 && segment.length < 3 && !/[a-z]/i.test(urlSlashes[0]))
                del = true;

            // If it's not marked for deletion, push it to cleanedSegments.
            if (!del)
                cleanedSegments.push(segment);
        }

        // This is our final, cleaned, base article URL.
        return uri.scheme + '://' + uri.host + cleanedSegments.reverse().join('/');
    },

    /**
     * Look for any paging links that may occur within the document.
     *
     * @param body
     * @return object (array)
     **/
    _findNextPageLink: function(elem) {
        // Scores every <a> under elem as a possible "next page" link and
        // returns the href of the best candidate, or null when none reaches a
        // score of 50. The winning href is also recorded in this._parsedPages.
        var uri = this._uri;
        var possiblePages = {};
        var allLinks = elem.getElementsByTagName('a');
        var articleBaseUrl = this._findBaseUrl();

        // Loop through all links, looking for hints that they may be next-page links.
        // Things like having 'page' in their textContent, className or id, or being a child
        // of a node with a page-y className or id.
        //
        // Also possible: levenshtein distance? longest common subsequence?
        //
        // After we do that, assign each page a score, and pick the highest
        // scoring one below.
        for (var i = 0, il = allLinks.length; i < il; i += 1) {
            var link = allLinks[i];
            // Compare links without their fragment and without a trailing slash.
            var linkHref = allLinks[i].href.replace(/#.*$/, '').replace(/\/$/, '');

            // If we've already seen this page, ignore it.
            // NOTE(review): this._parsedPages is defined outside this block;
            // presumably a map keyed by already-visited URLs - confirm.
            if (linkHref === '' ||
                linkHref === articleBaseUrl ||
                linkHref === uri.spec ||
                linkHref in this._parsedPages) {
                continue;
            }

            // If it's on a different domain, skip it.
            // (Element 1 of the slash-split href is the host part of an
            // absolute 'scheme://host/...' URL.)
            if (uri.host !== linkHref.split(/\/+/g)[1])
                continue;

            var linkText = this._getInnerText(link);

            // If the linkText looks like it's not the next page, skip it.
            if (linkText.match(this.REGEXPS.extraneous) || linkText.length > 25)
                continue;

            // If the leftovers of the URL after removing the base URL don't contain
            // any digits, it's certainly not a next page link.
            var linkHrefLeftover = linkHref.replace(articleBaseUrl, '');
            if (!linkHrefLeftover.match(/\d/))
                continue;

            // One candidate entry per distinct href; the texts of repeated
            // links to the same href are joined with ' | '.
            if (!(linkHref in possiblePages)) {
                possiblePages[linkHref] = { 'score': 0, 'linkText': linkText, 'href': linkHref };
            } else {
                possiblePages[linkHref].linkText += ' | ' + linkText;
            }

            var linkObj = possiblePages[linkHref];

            // If the articleBaseUrl isn't part of this URL, penalize this link. It could
            // still be the link, but the odds are lower.
            // Example: http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
            if (linkHref.indexOf(articleBaseUrl) !== 0)
                linkObj.score -= 25;

            // Text, class and id of the link itself are scored together.
            var linkData = linkText + ' ' + link.className + ' ' + link.id;
            if (linkData.match(this.REGEXPS.nextLink))
                linkObj.score += 50;

            if (linkData.match(/pag(e|ing|inat)/i))
                linkObj.score += 25;

            if (linkData.match(/(first|last)/i)) {
                // -65 is enough to negate any bonuses gotten from a > or » in the text,
                // If we already matched on 'next', last is probably fine.
                // If we didn't, then it's bad. Penalize.
                if (!linkObj.linkText.match(this.REGEXPS.nextLink))
                    linkObj.score -= 65;
            }

            if (linkData.match(this.REGEXPS.negative) || linkData.match(this.REGEXPS.extraneous))
                linkObj.score -= 50;

            if (linkData.match(this.REGEXPS.prevLink))
                linkObj.score -= 200;

            // If a parentNode contains page or paging or paginat
            // Each of the two adjustments below is applied at most once per
            // link, however many ancestors match.
            var parentNode = link.parentNode;
            var positiveNodeMatch = false;
            var negativeNodeMatch = false;

            while (parentNode) {
                var parentNodeClassAndId = parentNode.className + ' ' + parentNode.id;

                if (!positiveNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(/pag(e|ing|inat)/i)) {
                    positiveNodeMatch = true;
                    linkObj.score += 25;
                }

                if (!negativeNodeMatch && parentNodeClassAndId && parentNodeClassAndId.match(this.REGEXPS.negative)) {
                    // If this is just something like 'footer', give it a negative.
                    // If it's something like 'body-and-footer', leave it be.
                    if (!parentNodeClassAndId.match(this.REGEXPS.positive)) {
                        linkObj.score -= 25;
                        negativeNodeMatch = true;
                    }
                }

                parentNode = parentNode.parentNode;
            }

            // If the URL looks like it has paging in it, add to the score.
            // Things like /page/2/, /pagenum/2, ?p=3, ?page=11, ?pagination=34
            if (linkHref.match(/p(a|g|ag)?(e|ing|ination)?(=|\/)[0-9]{1,2}/i) || linkHref.match(/(page|paging)/i))
                linkObj.score += 25;

            // If the URL contains negative values, give a slight decrease.
            if (linkHref.match(this.REGEXPS.extraneous))
                linkObj.score -= 15;

            /**
             * Minor punishment to anything that doesn't match our current URL.
             * NOTE: I'm finding this to cause more harm than good where something is exactly 50 points.
             *     Dan, can you show me a counterexample where this is necessary?
             * if (linkHref.indexOf(window.location.href) !== 0) {
             *  linkObj.score -= 1;
             * }
             **/

            // If the link text can be parsed as a number, give it a minor bonus, with a slight
            // bias towards lower numbered pages. This is so that pages that might not have 'next'
            // in their text can still get scored, and sorted properly by score.
            var linkTextAsNumber = parseInt(linkText, 10);
            if (linkTextAsNumber) {
                // Punish 1 since we're either already there, or it's probably
                // before what we want anyways.
                if (linkTextAsNumber === 1) {
                    linkObj.score -= 10;
                } else {
                    linkObj.score += Math.max(0, 10 - linkTextAsNumber);
                }
            }
        }

        // Loop through all of our possible pages from above and find our top
        // candidate for the next page URL. Require at least a score of 50, which
        // is a relatively high confidence that this page is the next link.
        // On equal scores the candidate encountered first is kept.
        var topPage = null;
        for (var page in possiblePages) {
            if (possiblePages.hasOwnProperty(page)) {
                if (possiblePages[page].score >= 50 &&
                    (!topPage || topPage.score < possiblePages[page].score))
                    topPage = possiblePages[page];
            }
        }

        var nextHref = null;
        if (topPage) {
            nextHref = topPage.href.replace(/\/$/, '');

            this.log('NEXT PAGE IS ' + nextHref);
            // Remember the chosen page so it is skipped on later calls.
            this._parsedPages[nextHref] = true;
        }
        return nextHref;
    },

    /**
     * Add pre filter for raw input HTML processing
     * @param string RegExp for replace
     * @param string (optional) Replacer
     */
    _addPreFilter: function(filter, replacer) {
        if (typeof filter !== 'object') return;
        this._preFilters.push({ r: filter, s: (replacer || '') });
    },

    /**
     * Add post filter for raw output HTML processing
     * @param string RegExp for replace
     * @param string (optional) Replacer
     */
    _addPostFilter: function(filter, replacer) {
        if (typeof filter !== 'object') return;
        this._postFilters.push({ r: filter, s: (replacer || '') });
    },

    _createHTML: function(source, url) {
        var doc = this._doc.implementation.createHTMLDocument('HTMLParser');
        doc.documentElement.innerHTML = source;
        return doc;
    },

    /**
     * Get an elements class/id weight. Uses regular expressions to tell
     * if this element looks good or bad.
     *
     * @param Element
     * @return number (Integer)
     **/
    _getAttributesWeight: function(e) {
        if (!this._flagIsActive(this.FLAG_WEIGHT_ATTRIBUTES))
            return 0;

        var weight = 0,
            self = this,
            getWeight = function(attr) {
                // Look for a special string in atribute text
                if (typeof attr === 'string' && attr.trim().length > 2) {
                    if (self.REGEXPS.safe.test(attr))
                        return 100;

                    if (self.REGEXPS.negative.test(attr))
                        return -25;

                    if (self.REGEXPS.positive.test(attr))
                        return 25;

                    if (self.REGEXPS.unlikelyCandidates.test(attr))
                        return -5;

                    if (self.REGEXPS.okMaybeItsACandidate.test(attr))
                        return 5;

                    return 0;
                }
                return 0;
            };

        weight += getWeight(e.className);
        weight += getWeight(e.id);

        return weight;
    },

    /**
     * Check if a given node has one of its ancestor tag name matching the
     * provided one.
     * @param  HTMLElement node
     * @param  String      tagName
     * @param  Number      maxDepth
     * @return Boolean
     */
    _hasAncestorTag: function(node, tagName, maxDepth) {
        maxDepth = maxDepth || 3;
        tagName = tagName.toUpperCase();
        var depth = 0;
        while (node.parentNode) {
            if (depth > maxDepth)
                return false;
            if (node.parentNode.tagName === tagName)
                return true;
            node = node.parentNode;
            depth++;
        }
        return false;
    },

    /**
     * Clean a node of all elements of type 'tags'.
     * (Unless it's a youtube/vimeo video. People love movies.)
     *
     * @param Element
     * @param string tags to clean
     * @return void
     **/
    _clean: function(e, tags) {
        var isEmbed = this.MEDIA_NODES.indexOf(tags) !== -1;

        this._removeNodes(this._getAllNodesWithTag(e, tags), function(element) {
            // Allow youtube and vimeo videos through as people usually want to see those.
            if (isEmbed) {
                var attributeValues = [].map.call(element.attributes, function(attr) {
                    return attr.value;
                }).join('|');

                // Check the elements attributes and elements inside to see if any of them contain youtube or vimeo
                if (this.REGEXPS.videos.test(attributeValues) || this.REGEXPS.videos.test(element.innerHTML))
                    return false;
            }

            return true;
        });
    },

    /**
     * Clean an element of all tags of type 'tag' if they look fishy.
     * 'Fishy' is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
     *
     * @return void
     **/
    _cleanConditionally: function(e, tags) {
        if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
            return;

        // Gather counts for other typical elements embedded within.
        // Traverse backwards so we can remove nodes at the same time
        // without effecting the traversal.
        //
        // TODO: Consider taking into account original contentScore here.
        this._removeNodes(this._getAllNodesWithTag(e, tags), function(node) {
            var weight, contentScore;
            if (node.readability) {
                weight = 0;
                contentScore = node.readability.contentScore  || 0;
            } else {
                weight = this._getAttributesWeight(node);
                contentScore = 0;
            }

            var haveToRemove = false,
                isList = (node.tagName === 'UL' || node.tagName === 'OL');

            if (weight + contentScore < 0) return true;

            var contentText = this._getInnerText(node, false, true)

            if (this._getCharCount(contentText, ',') < this.MIN_COMMAS_IN_PARAGRAPH) {
                // If there are not very many commas, and the number of
                // non-paragraph elements is more than paragraphs or other
                // ominous signs, remove the element.
                var p = node.getElementsByTagName('p').length;
                var img = node.getElementsByTagName('img').length;
                var li = node.getElementsByTagName('li').length - 100;
                var input = node.getElementsByTagName('input').length;

                var embedCount = 0;
                var embeds = this._getAllNodesWithTag(node, this.MEDIA_NODES);
                for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
                    if (!this.REGEXPS.videos.test(embeds[ei].src) && !this.REGEXPS.videos.test(embeds[ei].innerHTML))
                        embedCount += 1;
                }

                var linkDensity = this._getLinkDensity(node),
                    contentLength = contentText.length;

                haveToRemove =
                    // Make an exception for elements with no p's and exactly 1 img.
                    (img > p && !this._hasAncestorTag(node, 'figure')) ||
                    (!isList && li > p) ||
                    (input > Math.floor(p / 3)) ||
                    (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
                    (!isList && weight < 25 && linkDensity > 0.2) ||
                    (weight >= 25 && linkDensity > 0.5) ||
                    ((embedCount === 1 && contentLength < 75) || embedCount > 1);

            }
            if (haveToRemove) this.log('Cleaning Conditionally:', node, 'list:', isList, ';img:', img,';linkDensity:',linkDensity);
            return haveToRemove;
        });
    },

    /**
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
     *
     * @param Element
     * @return void
     **/
    _cleanHeaders: function(e) {
        for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
            this._removeNodes(e.getElementsByTagName('h' + headerIndex), function(header) {
                return this._getAttributesWeight(header) < 0;
            });
        }
    },

    _flagIsActive: function(flag) {
        return (this._flags & flag) > 0;
    },

    _addFlag: function(flag) {
        this._flags = this._flags | flag;
    },

    _removeFlag: function(flag) {
        this._flags = this._flags & ~flag;
    },

    /**
     * Decides whether or not the document is reader-able without parsing the whole thing.
     *
     * @return boolean Whether or not we suspect parse() will succeed at returning an article object.
     */
    isProbablyReaderable: function(helperIsVisible) {
        var nodes = this._getAllNodesWithTag(this._doc, ['p', 'pre']);

        // FIXME we should have a fallback for helperIsVisible, but this is
        // problematic because of jsdom's elem.style handling - see
        // https://github.com/mozilla/readability/pull/186 for context.

        var score = 0;
        // This is a little cheeky, we use the accumulator 'score' to decide what to return from
        // this callback:
        return this._someNode(nodes, function(node) {
            if (helperIsVisible && !helperIsVisible(node))
                return false;

            var matchString = node.className + ' ' + node.id;

            if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
                !this.REGEXPS.okMaybeItsACandidate.test(matchString))
                return false;

            if (node.matches && node.matches('li p'))
                return false;

            var textContentLength = node.textContent.trim().length;
            if (textContentLength < this.MIN_ARTICLE_LENGTH)
                return false;

            score += Math.round(100 / (1 + Math.exp(-1 * textContentLength / this.MIN_ARTICLE_LENGTH))) - 50;
            //score += Math.sqrt(textContentLength - this.MIN_ARTICLE_LENGTH);

            if (score > 20)
                return true;

            return false;
        });
    },

    _successfulRequest: function(request) {
        return (request.status >= 200 && request.status < 300) ||
            request.status === 304 ||
            (request.status === 0 && request.responseText);
    },

    _ajax: function(url, options) {
        var self = this,
            request = new XMLHttpRequest(),
            respondToReadyState = function(readyState) {
                if (request.readyState === 4) {
                    if (self._successfulRequest(request)) {
                        if (options.success)
                            options.success(self._createHTML(request.responseText));
                    } else if (options.error) {
                        options.error(request);
                    }
                }
            };

        if (typeof options === 'undefined')
            options = {};

        request.onreadystatechange = respondToReadyState;

        request.open('get', url, true);
        request.setRequestHeader('Accept', 'text/html');

        try {
            request.send(options.postBody);
        } catch (e) {
            if (options.error)
                options.error();
        }

        return request;
    },

    /*
     * Initializes instance.
     *
     * @param {String}       uri     The URL string.
     * @param {HTMLDocument} doc     Parent document.
     */
    init: function(uri, doc) {
        if (!uri) throw new Error('No URL specified. Aborting.');
        if (!doc) throw new Error('No document specified. Aborting.');

        var a = doc.createElement('a'),
            base = doc.getElementsByTagName('base');

        a.href = uri;

        if (base[0]) base = base[0];
        else base = null;

        this._doc = doc;
        this._uri = {
            spec: a.href,
            host: a.host,
            prePath: a.protocol + '//' + a.host,
            scheme: a.protocol.substr(0, a.protocol.indexOf(':')),
            pathBase: base || a.protocol + '//' + a.host + a.pathname.substr(0, a.pathname.lastIndexOf('/') + 1),
            domainRe: new RegExp('/' + a.host.replace(/^\w+:\/\/(?:www|m|i)\d*\./i, '').replace(/\./, '.') + '/')
        }

        return this;
    },

    /**
     * Runs readability.
     *
     * Workflow:
     *  1. Prep the document by removing script tags, css, etc.
     *  2. Build readability's DOM tree.
     *  3. Grab the article content from the current dom tree.
     *  4. Replace the current DOM tree with the new one.
     *  5. Read peacefully.
     *
     * @param function parse_callback - receives the resulting article object
     * @return this
     **/
    parse: function(parse_callback) {
        if (!parse_callback) throw new Error('Aborting parsing document; no callback found');
        var self = this;

        if (!self._doc) {
            throw new Error('Aborting parsing document; no data to process');
        }

        // Avoid parsing too large documents, as per configuration option
        if (self._maxElemsToParse) {
            var numTags = self._doc.getElementsByTagName('*').length;
            if (numTags > self._maxElemsToParse) {
                throw new Error('Aborting parsing document; ' + numTags + ' > max = ' + self._maxElemsToParse + ' elements found');
            }
        }

        if (typeof self._doc.documentElement.firstElementChild === 'undefined') {
            self._getNextNode = self._getNextNodeNoElementProperties;
        }

        self._preProcessContent(self._doc);
        // Remove script tags from the document.
        self._removeScripts(self._doc);

        // FIXME: Disabled multi-page article support for now as it
        // needs more work on infrastructure.

        // Make sure self document is added to the list of parsed pages first,
        // so we don't double up on the first page.
        // self._parsedPages[uri.spec.replace(/\/$/, '')] = true;

        // Pull out any possible next page link first.
        // var nextPageLink = self._findNextPageLink(doc.body);

        var metadata = self._getArticleMetadata();

        self._prepDocument();

        var articleTitle = metadata.title || self._getArticleTitle();

        self._grabArticle(self._doc, function(articleContent) {
            //self.log('Grabbed: ' + articleContent.innerHTML);
            if (!articleContent)
                parse_callback({
                    uri: null,
                    title: null,
                    author: null,
                    dir: null,
                    content: null,
                    textContent: null,
                    length: null,
                    excerpt: null
                });

            // if (nextPageLink) {
            //   // Append any additional pages after a small timeout so that people
            //   // can start reading without having to wait for self to finish processing.
            //   setTimeout((function() {
            //   self._appendNextPage(nextPageLink);
            //   }).bind(self), 500);
            // }

            // If we haven't found an excerpt in the article's metadata, use the article's
            // first paragraph as the excerpt. This is used for displaying a preview of
            // the article's content.
            if (!metadata.excerpt) {
                var paragraphs = articleContent.getElementsByTagName('p');
                if (paragraphs.length > 0) {
                    metadata.excerpt = paragraphs[0].textContent.trim();
                }
            }

            parse_callback({
                uri: self._uri,
                title: articleTitle,
                author: metadata.author || self._articleAuthor,
                dir: self._articleDir,
                content: articleContent.innerHTML,
                textContent: articleContent.textContent,
                length: articleContent.textContent.length,
                excerpt: metadata.excerpt
            });
        });

        return this;

    }
};