Skip to content

Instantly share code, notes, and snippets.

@KTibow
Last active May 24, 2024 00:54
Show Gist options
  • Save KTibow/45eff0d37208910c7d287075786c338f to your computer and use it in GitHub Desktop.
Save KTibow/45eff0d37208910c7d287075786c338f to your computer and use it in GitHub Desktop.
Readability lite/serverless

steps to integrate:

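  1. parse the HTML with `parseHtml`
  2. get the root element from the parsed result and shake it with `shakeElement` to remove non-article elements
  3. flatten it with `emflatten`, passing a temporary array that collects the output elements
  4. build a content string: for each flattened item, append its text to the content (apply fancy formatting or whitespace removal if you want), then post-process the content string if you want (e.g. trimming, adding the title, etc.)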
/** A run of raw text found between tags. */
type TextPart = { name: "#text"; contents: string };

/**
 * An element in the parsed tree.
 *
 * - `name`: the tag name.
 * - `data`: the text between `<` and `>` of the opening tag (tag name plus
 *   attributes), as produced by `parseHtml`.
 * - `contents`: child nodes in document order.
 */
export type Root = {
  name: string;
  data: string;
  contents: ElementPart[];
};

/** Any node of the tree: either a text run or an element. */
type ElementPart = TextPart | Root;
/** Tag names whose elements are removed by `shakeElement`. */
const NON_ARTICLE_ELEMENTS: string[] = [
  "aside",
  "nav",
  "header",
  "footer",
  "style",
];

/** ARIA `role` values; an element whose tag text contains `role="<value>"` is removed by `shakeElement`. */
const NON_ARTICLE_ROLES: string[] = [
  // menus
  "menu",
  "menubar",
  // page furniture
  "complementary",
  "navigation",
  "banner",
  // overlays and notifications
  "alert",
  "alertdialog",
  "dialog",
];
/**
 * Decides whether an element is collected as a unit by `emflatten`.
 *
 * Headings (`h1`-`h3`), `p`, `blockquote` and `li` always qualify; a `div`
 * qualifies only when its tag text contains `nyt-imperial`.
 *
 * @param e - Element to test.
 * @returns `true` when the element qualifies, otherwise `false`.
 */
const GOVERNMENT_APPROVAL = (e: Root) => {
  switch (e.name) {
    case "h1":
    case "h2":
    case "h3":
    case "p":
    case "blockquote":
    case "li":
      return true;
    case "div":
      return e.data.includes("nyt-imperial");
    default:
      return false;
  }
};
/**
 * Depth-first, pre-order search for the first element accepted by `filter`.
 *
 * The element itself is tested before its children; text parts are never
 * passed to `filter`.
 *
 * @param element - Root of the subtree to search.
 * @param filter - Predicate applied to each element.
 * @returns The first matching element, or `undefined` when nothing matches.
 */
const recursiveFind = (element: Root, filter: (item: Root) => boolean): Root | undefined => {
  if (filter(element)) {
    return element;
  }

  for (const child of element.contents) {
    if (child.name == "#text") {
      continue;
    }
    const found = recursiveFind(child as Root, filter);
    if (found) {
      return found;
    }
  }

  return undefined;
};
/**
 * Named character references understood by `replaceEscape`, keyed by the name
 * WITHOUT the leading ampersand and trailing semicolon. Typographic quotes and
 * the em dash are deliberately normalised to their plain ASCII counterparts.
 */
const NAMED_ENTITIES = new Map<string, string>([
  ["nbsp", " "],
  ["quot", `"`],
  ["ldquo", `"`],
  ["rdquo", `"`],
  ["lsquo", `'`],
  ["rsquo", `'`],
  ["mdash", "-"],
  ["copy", "\u00a9"], // copyright sign
  ["pound", "\u00a3"], // pound sign
  ["oslash", "\u00f8"], // o with stroke
  ["aring", "\u00e5"], // a with ring above
  ["amp", "&"],
]);

/**
 * Decodes the HTML character references this module cares about.
 *
 * Handles decimal (`#NNN`) and hexadecimal (`#xHHH`) numeric references plus
 * the small set of named references in `NAMED_ENTITIES`. Everything is decoded
 * in a single pass, so the output of one replacement is never decoded again
 * (an escaped ampersand followed by `quot;` stays a literal reference text).
 *
 * Unknown names and numeric references beyond U+10FFFF are left untouched.
 * Named lookups are case-sensitive; the `x` of a hex reference is not.
 *
 * @param text - Text that may contain character references.
 * @returns The text with recognised references replaced.
 */
const replaceEscape = (text: string): string =>
  text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string) => {
    if (body[0] !== "#") {
      return NAMED_ENTITIES.get(body) ?? entity;
    }

    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);

    // fromCodePoint (unlike fromCharCode) handles astral characters such as
    // emoji, but throws above U+10FFFF, so out-of-range references stay as-is.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  });
/**
 * Parses an HTML string into a lightweight tree using regular expressions.
 *
 * Behaviour:
 * - Every tag becomes `{ name, data, contents }`, where `name` is the
 *   lower-cased tag name and `data` is the raw text between `<` and `>`
 *   (tag name plus attributes, original casing).
 * - Text between tags becomes `{ name: "#text", contents }`; whitespace-only
 *   runs are dropped. Text inside the document is kept verbatim, text after
 *   the last tag is trimmed.
 * - Void elements (`br`, `img`, ...) and tags written as `<x ... />` never get
 *   children.
 * - Comments and the doctype declaration are skipped entirely.
 * - `<script>` elements are skipped together with their source, so script
 *   code never shows up as text. An unterminated script swallows the rest of
 *   the input.
 * - A closing tag only closes the innermost open element when the names
 *   match; any other closing tag is ignored.
 *
 * @param html - HTML source text.
 * @returns The top-level nodes of the document, in document order.
 */
export const parseHtml = (html: string): ElementPart[] => {
  // One tokenizer for everything that is not text: comments, doctype, tags.
  // Only the tag alternative has a capture group (the tag name).
  const tokens =
    /<!--[\s\S]*?-->|<!doctype(?:[^>"']|"[^"]*"|'[^']*')*>|<\/?([a-z0-9]+)(?:[^>"']|"[^"]*"|'[^']*')*>/gi;
  const scriptEnd = /<\/script(?=[\s/>])[^>]*>/gi;
  const selfClosingTags =
    /^(area|base|br|col|embed|hr|img|input|link|meta|param|path|source|track|wbr)$/i;

  // Elements are created on the stack while they are open; the first entry is
  // a synthetic root that ends up referencing the whole tree.
  const stack: [{ name: string; contents: ElementPart[] }, ...Root[]] = [
    { name: "root", contents: [] },
  ];
  const current = () => stack[stack.length - 1] ?? stack[0];
  const pushText = (text: string) => {
    if (text.trim()) {
      current().contents.push({ name: "#text", contents: text });
    }
  };

  let lastTextIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = tokens.exec(html)) !== null) {
    const fullMatch = match[0];
    const rawName: string | undefined = match[1];

    // Flush the text that precedes this token.
    pushText(html.slice(lastTextIndex, match.index));
    lastTextIndex = tokens.lastIndex;

    // Comment or doctype: nothing to add to the tree.
    if (rawName === undefined) continue;

    const tagName = rawName.toLowerCase();
    const isClosing = fullMatch[1] === "/";

    if (tagName === "script") {
      if (!isClosing) {
        // Jump straight past the matching end tag so script source is neither
        // tokenized nor emitted as text.
        scriptEnd.lastIndex = lastTextIndex;
        const end = scriptEnd.exec(html);
        lastTextIndex = end ? scriptEnd.lastIndex : html.length;
        tokens.lastIndex = lastTextIndex;
      }
      continue;
    }

    if (isClosing) {
      // Never pop the synthetic root, even for a literal `</root>` tag.
      if (stack.length > 1 && current().name === tagName) stack.pop();
      continue;
    }

    const newTag: Root = { name: tagName, data: fullMatch.slice(1, -1), contents: [] };
    current().contents.push(newTag);
    const isSelfClosing = selfClosingTags.test(tagName) || fullMatch.endsWith("/>");
    if (!isSelfClosing) {
      stack.push(newTag);
    }
  }

  // Trailing text after the last token.
  pushText(html.slice(lastTextIndex).trim());
  return stack[0].contents;
};
/**
 * Substrings that mark an element as non-article content when they occur
 * anywhere in its tag text (tag name or attributes).
 */
const NON_ARTICLE_MARKERS = [
  "button",
  "display:none",
  "elementor-widget-posts",
  "enlarge",
  "infobar",
  "more-about-container",
  "newsletter-form",
  "openWeb-wrapper",
  "popular-box",
  "share",
  "score dislikes",
  "score likes",
  "tags",
  "tools",
];

/**
 * Recursively removes non-article elements from `element`, in place.
 *
 * A child element (other than `body`) is removed when its tag name is listed
 * in `NON_ARTICLE_ELEMENTS`, its tag text contains `role="..."` for a role in
 * `NON_ARTICLE_ROLES`, its tag text contains one of the marker substrings,
 * or - only when `isNews` is set - its tag text contains `comment`.
 * Text parts are always kept; surviving child elements are shaken in turn.
 *
 * @param element - Element whose `contents` array is replaced.
 * @param isNews - Also remove elements whose tag text mentions `comment`.
 */
export const shakeElement = (element: Root, isNews: boolean) => {
  const kept: ElementPart[] = [];

  for (const part of element.contents) {
    if (part.name == "#text") {
      kept.push(part);
      continue;
    }

    const child = part as Root;
    const looksLikeBoilerplate =
      NON_ARTICLE_ELEMENTS.includes(child.name) ||
      NON_ARTICLE_ROLES.some((role) => child.data.includes(`role="${role}"`)) ||
      (isNews && child.data.includes("comment")) ||
      NON_ARTICLE_MARKERS.some((marker) => child.data.includes(marker));

    // `body` is exempt from every removal rule.
    if (child.name != "body" && looksLikeBoilerplate) {
      continue;
    }

    shakeElement(child, isNews);
    kept.push(child);
  }

  element.contents = kept;
};
/**
 * Extracts the document title.
 *
 * Looks up the first `title` element in the tree and decodes the character
 * references in its first text part.
 *
 * @param html - Root of the parsed document.
 * @returns The decoded title, or an empty string when the document has no
 *   `title` element or that element contains no text (previously this case
 *   threw a `TypeError`).
 */
export const findTitle = (html: Root): string => {
  const titleElement = recursiveFind(html, (item) => item.name == "title");
  const titleText = titleElement?.contents.find(
    (item): item is { name: "#text"; contents: string } => item.name == "#text",
  );
  return titleText ? replaceEscape(titleText.contents) : "";
};
/**
 * Picks the element most likely to hold the article.
 *
 * Strategies are tried in order and the first element found wins:
 * an `article` tag, `role="article"`, `role="main"`, `id="story-body"`,
 * then `body`. When none match, the given root itself is returned.
 *
 * @param html - Root of the parsed document.
 * @returns The chosen element.
 */
export const findArticle = (html: Root) => {
  const strategies: ((item: Root) => boolean)[] = [
    (item) => item.name == "article",
    (item) => item.data.includes(`role="article"`),
    (item) => item.data.includes(`role="main"`),
    (item) => item.data.includes(`id="story-body"`),
    (item) => item.name == "body",
  ];

  for (const matches of strategies) {
    const found = recursiveFind(html, matches);
    if (found) {
      return found;
    }
  }

  return html;
};
/**
 * Collects, in document order, the outermost elements accepted by
 * `GOVERNMENT_APPROVAL`.
 *
 * An accepted element is appended to `output` as-is and its children are not
 * visited; any other element is only descended into. Text parts that sit
 * directly inside a non-accepted element are skipped.
 *
 * @param element - Subtree to flatten.
 * @param output - Array that receives the accepted elements (mutated).
 */
export const emflatten = (element: Root, output: Root[]) => {
  if (GOVERNMENT_APPROVAL(element)) {
    output.push(element);
    return;
  }

  for (const child of element.contents) {
    if (child.name == "#text") {
      continue;
    }
    emflatten(child as Root, output);
  }
};
/**
 * Concatenates all text beneath an element, in document order.
 *
 * Each text part is passed through `replaceEscape`; child elements contribute
 * their own recursively collected text. No separator is inserted.
 *
 * @param element - Element whose text is collected.
 * @returns The combined text.
 */
export const recurse = (element: Root): string =>
  element.contents
    .map((part) =>
      part.name == "#text"
        ? replaceEscape((part as { contents: string }).contents)
        : recurse(part as Root),
    )
    .join("");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment