Skip to content

Instantly share code, notes, and snippets.

Last active May 24, 2024 00:54
Show Gist options
  • Save KTibow/45eff0d37208910c7d287075786c338f to your computer and use it in GitHub Desktop.
Save KTibow/45eff0d37208910c7d287075786c338f to your computer and use it in GitHub Desktop.
Readability lite/serverless

steps to integrate:

  1. Parse the HTML with `parseHtml`.
  2. Get the root element and shake it with `shakeElement`.
  3. Flatten it with `emflatten`, collecting the results into a temporary array.
  4. Build a content string: add each flattened item to it (with whatever formatting or whitespace removal you want), then post-process the string as needed (e.g. trimming, adding the title).
/**
 * An element node of the parsed HTML tree.
 * - `name`: the tag name (`parseHtml` names its synthetic top-level node "root").
 * - `data`: the tag's source text without the surrounding angle brackets, as stored by `parseHtml`.
 * - `contents`: the child nodes, in the order they were pushed.
 */
export type Root = { name: string; data: string; contents: ElementPart[] };
/** A child node: either a text run (`name` is "#text" and `contents` is the string) or a nested element. */
type ElementPart = { name: "#text"; contents: string } | Root;
// Tag names of non-article page furniture.
// NOTE(review): no use of this list is visible in this copy of the file; presumably shakeElement consults it — confirm.
const NON_ARTICLE_ELEMENTS = ["aside", "nav", "header", "footer", "style"];
/**
 * Decides whether an element counts as a content-bearing block.
 *
 * @param e Element to classify.
 * @returns `true` for headings (`h1`-`h3`), paragraphs, block quotes and list
 *   items, and for any `div` whose tag text contains "nyt-imperial";
 *   `false` otherwise.
 */
const GOVERNMENT_APPROVAL = (e: Root): boolean => {
  // Generic text-carrying tags are always accepted.
  if (["h1", "h2", "h3", "p", "blockquote", "li"].includes(e.name)) {
    return true;
  }
  // Site-specific exception: a div is accepted when its raw tag text mentions "nyt-imperial".
  if (e.name === "div" && e.data.includes("nyt-imperial")) {
    return true;
  }
  return false;
};
/**
 * Depth-first, pre-order search of an element tree.
 *
 * @param element Element to start from; it is tested before its descendants.
 * @param filter Predicate applied to each element (text nodes are never passed to it).
 * @returns The first element for which `filter` returns true, or `undefined`
 *   when no element in the subtree matches.
 */
const recursiveFind = (element: Root, filter: (item: Root) => boolean): Root | undefined => {
  if (filter(element)) return element;
  for (const item of element.contents) {
    // Text nodes have no children and are not candidates, so skip them.
    if (item.name !== "#text") {
      // The union cannot be narrowed on `name` (Root's name is a plain string), hence the cast.
      const found = recursiveFind(item as Root, filter);
      if (found) return found;
    }
  }
  return undefined;
};
// Named HTML entities this module understands, keyed by the name between "&" and ";".
// Curly quotes are flattened to straight quotes and the em dash to a hyphen.
const NAMED_ENTITIES = new Map<string, string>([
  ["nbsp", " "],
  ["quot", `"`],
  ["ldquo", `"`],
  ["rdquo", `"`],
  ["lsquo", `'`],
  ["rsquo", `'`],
  ["mdash", "-"],
  ["copy", "©"],
  ["pound", "£"],
  ["oslash", "ø"],
  ["aring", "å"],
  ["amp", "&"],
]);

/**
 * Decodes the HTML character references found in a text run.
 *
 * Handles decimal (`&#65;`) and hexadecimal (`&#x41;`) numeric references plus
 * the small set of named entities in `NAMED_ENTITIES` (names are case-sensitive).
 * Unknown names and out-of-range code points are left untouched.
 *
 * @param text Raw text taken from the HTML source.
 * @returns The text with recognised references replaced by their characters.
 */
const replaceEscape = (text: string): string =>
  // One pass over the input, so text produced by a replacement is never decoded a second time.
  text.replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
    (
      whole: string,
      dec: string | undefined,
      hex: string | undefined,
      name: string | undefined,
    ): string => {
      if (name !== undefined) {
        return NAMED_ENTITIES.get(name) ?? whole;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? "", 16);
      // fromCodePoint (unlike fromCharCode) handles astral characters but throws above U+10FFFF.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    },
  );
/**
 * Scans an HTML string with a tag-matching regex and builds a tree of element
 * nodes and "#text" nodes, returning the children of a synthetic "root" node.
 *
 * NOTE(review): this copy of the function looks truncated. The value assigned
 * to `selfClosingTags`, the left operand of the second comparison on the
 * `isSelfClosing` line, the body of the final `if (!isSelfClosing ...)`, and
 * the closing braces are missing. Restore them from the original before
 * relying on the comments below.
 */
export const parseHtml = (html: string) => {
// Matches one opening or closing tag; group 1 is the tag name. There is no `i` flag, so only lowercase names match.
const tags = /<\/?([a-z0-9]+)(?:[^>"']|"[^"]*"|'[^']*')*>/g;
const doctypeRegex = /<!DOCTYPE(?:[^>"']|"[^"]*"|'[^']*')*>/i;
const selfClosingTags =
let match;
// this works by creating elements on the stack
// then the first one ends up referencing all of them
let stack: [{ name: string; contents: ElementPart[] }, ...Root[]] = [
{ name: "root", contents: [] },
// Offset just past the last tag consumed; source text from here up to the next tag becomes a "#text" node.
let lastTextIndex = 0;
// Set on an opening script tag and cleared on the closing one.
let insideScript = false;
while ((match = tags.exec(html))) {
const [fullMatch, tagName] = match;
// A slash directly after the opening angle bracket marks a closing tag.
const isClosing = fullMatch[1] === "/";
const isSelfClosing = selfClosingTags.test(tagName) || === "/";
if (tagName == "script" && !isClosing) {
insideScript = true;
if (tagName == "script" && isClosing) {
insideScript = false;
const text = html.slice(lastTextIndex, match.index);
// Whitespace-only runs are not stored.
if (text && text.trim()) {
stack[stack.length - 1].contents.push({ name: "#text", contents: text });
lastTextIndex = tags.lastIndex;
if (insideScript) {
if (match.index > lastTextIndex) {
const text = html.slice(lastTextIndex, match.index);
if (text && text.trim()) {
stack[stack.length - 1].contents.push({ name: "#text", contents: text });
if (isClosing) {
// A closing tag pops the stack only when it matches the innermost open element.
if (stack[stack.length - 1].name == tagName) stack.pop();
} else {
// `data` keeps the tag text without its first and last character (the angle brackets).
const newTag = { name: tagName, data: fullMatch.slice(1, -1), contents: [] };
stack[stack.length - 1].contents.push(newTag);
if (!isSelfClosing && !doctypeRegex.test(fullMatch)) {
lastTextIndex = tags.lastIndex;
// Text after the last tag is stored trimmed.
if (lastTextIndex < html.length) {
const text = html.slice(lastTextIndex).trim();
if (text) {
stack[stack.length - 1].contents.push({ name: "#text", contents: text });
// The children of the synthetic root element.
return stack[0].contents;
/**
 * Prunes an element tree in place: `element.contents` is replaced by the
 * mapped and filtered list of its children, and `shakeElement` is called
 * again on child elements. Children for which the condition below holds are
 * mapped to `undefined` and removed by the final `filter`.
 *
 * @param element Element whose `contents` array is reassigned.
 * @param isNews Enables the additional "comment" check.
 *
 * NOTE(review): this copy looks truncated. The operands in front of `!=`, the
 * receivers of the string checks (presumably `_item.data.includes(...)`), and
 * the closing braces are missing, and `NON_ARTICLE_ROLES` is not defined
 * anywhere in the visible file. Restore them from the original before editing.
 */
export const shakeElement = (element: Root, isNews: boolean) => {
element.contents = element.contents
.map((item) => {
if ( != "#text") {
const _item = item as Root;
if ( != "body" &&
NON_ARTICLE_ROLES.some((x) =>`role="${x}"`)) ||
(isNews &&"comment")) ||"button") ||"display:none") ||"elementor-widget-posts") ||"enlarge") ||"infobar") ||"more-about-container") ||"newsletter-form") ||"openWeb-wrapper") ||"popular-box") ||"share") ||"score dislikes") ||"score likes") ||"tags") ||"tools"))
) {
// Returning undefined marks the child for removal.
return undefined;
// Kept elements are pruned recursively with the same isNews setting.
shakeElement(_item, isNews);
return item;
// Drops the undefined entries produced above.
.filter((item): item is Root => Boolean(item));
/**
 * Extracts the document title from a parsed tree.
 *
 * @param html Root of the parsed document.
 * @returns The entity-decoded text of the first text node inside the first
 *   `title` element, or an empty string when the document has no `title`
 *   element or that element holds no text.
 */
export const findTitle = (html: Root): string => {
  const titleElement = recursiveFind(html, (item) => item.name === "title");
  // Optional chaining instead of non-null assertions: a page without a <title> must not crash extraction.
  const titleText = titleElement?.contents.find(
    (item): item is { name: "#text"; contents: string } => item.name === "#text",
  );
  return titleText ? replaceEscape(titleText.contents) : "";
};
/**
 * Picks the element most likely to hold the main article.
 *
 * Candidates are tried in order and the first hit wins:
 * an `article` element, an element whose tag text contains `role="article"`,
 * then `role="main"`, then `id="story-body"`, then the `body` element.
 *
 * @param html Root of the parsed document.
 * @returns The first matching element, or `html` itself when nothing matches.
 */
export const findArticle = (html: Root): Root => {
  const byTag = recursiveFind(html, (item) => item.name === "article");
  if (byTag) return byTag;
  // Attribute checks are plain substring tests against the raw tag text.
  const byArticleRole = recursiveFind(html, (item) => item.data.includes(`role="article"`));
  if (byArticleRole) return byArticleRole;
  const byMainRole = recursiveFind(html, (item) => item.data.includes(`role="main"`));
  if (byMainRole) return byMainRole;
  const byStoryId = recursiveFind(html, (item) => item.data.includes(`id="story-body"`));
  if (byStoryId) return byStoryId;
  const body = recursiveFind(html, (item) => item.name === "body");
  return body ?? html;
};
/**
 * Flattens an element tree into a list of content blocks.
 *
 * An element accepted by `GOVERNMENT_APPROVAL` is appended to `output` as a
 * whole and its children are not visited; otherwise the search continues in
 * its child elements (text nodes are skipped).
 *
 * NOTE(review): the leading `if` and its body were missing from this copy of
 * the file; they are reconstructed here from the surviving `else` branch and
 * the function signature. Confirm against the original.
 *
 * @param element Element to flatten.
 * @param output Array that receives the accepted elements, in document order.
 */
export const emflatten = (element: Root, output: Root[]): void => {
  if (GOVERNMENT_APPROVAL(element)) {
    output.push(element);
  } else {
    for (const item of element.contents) {
      if (item.name !== "#text") {
        emflatten(item as Root, output);
      }
    }
  }
};
/**
 * Collects the text of an element and all of its descendants.
 *
 * @param element Element whose subtree is read.
 * @returns The entity-decoded contents of every text node in the subtree,
 *   concatenated in document order with no separator.
 */
export const recurse = (element: Root): string => {
  let content = "";
  for (const item of element.contents) {
    if (item.name === "#text") {
      // Text node: decode entities and append.
      content += replaceEscape((item as { contents: string }).contents);
    } else {
      // Element node: append the text of its subtree.
      content += recurse(item as Root);
    }
  }
  return content;
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment