Skip to content

Instantly share code, notes, and snippets.

@dcdunkan
Last active June 3, 2023 14:50
Show Gist options
  • Save dcdunkan/38725722806aca8012af485dfa283845 to your computer and use it in GitHub Desktop.
Save dcdunkan/38725722806aca8012af485dfa283845 to your computer and use it in GitHub Desktop.
script for validating links in grammyjs/website before building the website
import { extname, join } from "https://deno.land/std@0.190.0/path/mod.ts";
import {
DOMParser,
HTMLDocument,
} from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts";
import MarkdownIt from "https://esm.sh/markdown-it@13.0.1";
import anchorPlugin from "https://esm.sh/markdown-it-anchor@8.6.7";
import { slugify } from "https://esm.sh/@mdit-vue/shared@0.12.0";
import {
blue as b,
cyan as c,
gray,
red as r,
} from "https://deno.land/std@0.190.0/fmt/colors.ts";
/** Second parameter of the global `fetch`, i.e. the request init object. */
type FetchOptions = Parameters<typeof fetch>[1];
/** File name appended to relative links that do not end in ".md". */
const INDEX_FILE = "README.md";
/** When false, relative links ending in ".html" are reported instead of being mapped to ".md". */
const ALLOW_HTML_INSTEAD_OF_MD = false;
/** When false, retryFetch gives up after the first failed fetch. */
const RETRY_FAILED_FETCH = true;
/** Upper bound on the number of fetch attempts retryFetch makes per URL. */
const MAX_RETRIES = 5;
// Some sites answer automated requests with an error status even though the
// link is fine. A URL listed here is not reported as "not_ok" when the
// response status equals the mapped value.
const ACCEPTABLE_NOT_OK_STATUS: Record<string, number> = {
"https://dash.cloudflare.com/login": 403,
"https://dash.cloudflare.com/?account=workers": 403,
};
/** Request options passed to every external fetch: browser-like headers and caching disabled. */
const FETCH_OPTIONS: FetchOptions = {
headers: {
"User-Agent":
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Accept":
"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
},
method: "GET",
mode: "cors",
};
/** Shared parser for both the rendered Markdown and the fetched pages. */
const domParser = new DOMParser();
/** Markdown parser/renderer; the anchor plugin adds heading ids using the given slugify. */
const md = MarkdownIt({ html: true, linkify: true })
.use(anchorPlugin, { slugify }); // this is what vuepress uses.
// All anchors that are actually present in a document, keyed by the local
// file path or by the external URL without its fragment.
const allAnchors: Record<string, Set<string>> = {};
// External links found in each file: file path -> set of URLs (fragment included).
const links: Record<string, Set<string>> = {};
// Anchors that are referenced somewhere:
//   usedAnchors[target][source] = set of anchors
// where `target` is the linked file path or URL (without fragment), `source`
// is the file that contains the link, and the set holds the fragments that
// `source` uses when pointing at `target`.
const usedAnchors: Record<string, Record<string, Set<string>>> = {};
/** An issue that is fully described by the link or URL that caused it. */
interface GeneralIssue {
type:
// a relative link ends in ".html" while ALLOW_HTML_INSTEAD_OF_MD is false
| "html_instead_of_md"
// the file a relative link resolves to does not exist
| "file_not_found"
// the response was not ok and its status is not listed in ACCEPTABLE_NOT_OK_STATUS
| "not_ok"
// the response body could not be read or parsed into a document
| "parse_error";
/** The link or URL exactly as it was found in the Markdown file. */
reference: string;
}
/** An external link whose redirect was not accepted by isValidRedirection. */
interface RedirectedIssue {
type: "redirected";
/** The URL as written in the Markdown file. */
from: string;
/** The final URL reported by the response. */
to: string;
}
/** A link that points at an anchor which was not found in the target document. */
interface MissingAnchorIssue {
type: "missing_anchor";
/** The linked file path or URL, without the fragment. */
root: string;
/** The anchor that was looked up (stored percent-decoded). */
anchor: string;
}
type Issue = GeneralIssue | RedirectedIssue | MissingAnchorIssue;
/** Issues collected so far, keyed by the path of the Markdown file containing the link. */
const issues: Record<string, Issue[]> = {};
/**
 * Walks `directory` recursively and inspects every Markdown (".md") file.
 *
 * For each file the anchors present in its rendered HTML are stored in
 * `allAnchors`, and every link is classified:
 * - links starting with "http" are queued in `links` for a later fetch,
 * - links starting with "." are handed to `resolveRelativeLink`,
 * - links starting with "#" are recorded in `usedAnchors` against the file itself.
 *
 * @param directory Directory to scan.
 * @throws Error if a link matches none of the three categories, or if the
 *   rendered HTML cannot be parsed into a document.
 */
async function findLinksFromFiles(directory: string) {
  for await (const entry of Deno.readDir(directory)) {
    const entryPath = join(directory, entry.name);

    if (!entry.isFile) {
      // Descend into directories; entries that are neither a file nor a
      // directory are ignored.
      if (entry.isDirectory) await findLinksFromFiles(entryPath);
      continue;
    }
    if (extname(entry.name).toLowerCase() !== ".md") continue;

    const markdown = await Deno.readTextFile(entryPath);
    const tokens = md.parse(markdown, {});
    const rendered = md.render(markdown, {});
    const parsed = domParser.parseFromString(rendered, "text/html");
    if (parsed == null) {
      throw new Error("Document seems to be empty: shouldn't happen");
    }
    allAnchors[entryPath] = getAnchors(parsed);

    for (const href of filterLinks(tokens)) {
      if (href.startsWith("http")) {
        // External link: fetched later, once all files have been scanned.
        (links[entryPath] ??= new Set()).add(href);
        continue;
      }
      if (href.startsWith(".")) {
        // Relative path to another file.
        await resolveRelativeLink(directory, entryPath, href);
        continue;
      }
      if (href.startsWith("#")) {
        // Anchor within this very file: target and source are the same path.
        const anchorsByTarget = (usedAnchors[entryPath] ??= {});
        (anchorsByTarget[entryPath] ??= new Set()).add(href.substring(1));
        continue;
      }
      // Any other kind of link is treated as invalid.
      throw new Error("Different type of link " + href);
    }
  }
}
/**
 * Collects the `href` of every "link_open" token, including tokens nested in
 * `children`, in document (pre-order) sequence.
 *
 * @param tokens Tokens as returned by `md.parse`.
 * @returns The hrefs found; tokens without an `href` attribute are skipped.
 */
function filterLinks(tokens: ReturnType<typeof md.parse>) {
  const hrefs: string[] = [];
  // Explicit stack instead of recursion. Tokens are pushed in reverse so that
  // they are popped in their original order.
  const pending = [...tokens].reverse();
  while (pending.length > 0) {
    const token = pending.pop();
    if (token === undefined) break;
    if (token.type === "link_open") {
      const href = token.attrGet("href");
      if (href != null) hrefs.push(href);
    }
    const nested = token.children;
    if (nested != null) {
      for (let i = nested.length - 1; i >= 0; i--) pending.push(nested[i]);
    }
  }
  return hrefs;
}
/**
 * Resolves a relative link found in a Markdown file and records the result.
 *
 * - A link ending in ".html" is reported as "html_instead_of_md" unless
 *   ALLOW_HTML_INSTEAD_OF_MD is set, in which case the suffix is mapped to ".md".
 * - A link that does not end in ".md" is treated as a directory and
 *   INDEX_FILE is appended.
 * - If the resolved file does not exist a "file_not_found" issue is recorded.
 * - Otherwise, if the link carries a fragment, it is stored in `usedAnchors`
 *   so it can be checked against the target file's anchors later.
 *
 * @param directory Directory of the file containing the link; the link is resolved against it.
 * @param path Path of the file containing the link (used as the issue key).
 * @param link The relative link as written, possibly with a "#fragment".
 * @throws Rethrows any `Deno.lstat` error other than `Deno.errors.NotFound`.
 */
async function resolveRelativeLink(
  directory: string,
  path: string,
  link: string,
) {
  const htmlSuffix = ".html";
  const [linkedPath, anchor] = link.split("#");
  let root = linkedPath;

  if (root.endsWith(htmlSuffix)) {
    if (!ALLOW_HTML_INSTEAD_OF_MD) {
      issues[path] ??= [];
      issues[path].push({ type: "html_instead_of_md", reference: link });
      return;
    }
    // Swap only the trailing extension. String.replace would rewrite the
    // first ".html" anywhere in the path (e.g. inside a directory name).
    root = root.slice(0, -htmlSuffix.length) + ".md";
  }
  if (!root.endsWith(".md")) {
    // Link to a directory: point at its index file.
    if (!root.endsWith("/")) root += "/";
    root += INDEX_FILE;
  }

  const relativePath = join(directory, root);
  try {
    await Deno.lstat(relativePath);
  } catch (error) {
    if (!(error instanceof Deno.errors.NotFound)) throw error;
    issues[path] ??= [];
    issues[path].push({ type: "file_not_found", reference: link });
    return;
  }

  if (anchor == null) return;
  // Remember that `path` points at `anchor` inside the linked file.
  usedAnchors[relativePath] ??= {};
  usedAnchors[relativePath][path] ??= new Set();
  usedAnchors[relativePath][path].add(anchor);
}
/**
 * Gathers every anchor name that `document` appears to provide.
 *
 * Two sources are combined:
 * - the fragment of each `<a href="#...">` in the document (a bare "#" is ignored),
 * - the non-blank `id` of each section, heading (h1-h6) and div element.
 *
 * @param document Parsed HTML document.
 * @returns The set of anchor names, without the leading "#".
 */
function getAnchors(document: HTMLDocument): Set<string> {
  const linkedFragments = document.getElementsByTagName("a").flatMap((link) => {
    const href = link.getAttribute("href");
    if (href == null || !href.startsWith("#") || href.length <= 1) return [];
    return [href.substring(1)];
  });

  const idCarryingTags = ["section", "h1", "h2", "h3", "h4", "h5", "h6", "div"];
  const elementIds = idCarryingTags.flatMap((tag) => anchorsFromId(document, tag));

  return new Set([...linkedFragments, ...elementIds]);
}
/**
 * Lists the `id` attribute of every `tag` element in `document`.
 *
 * @param document Parsed HTML document.
 * @param tag Tag name to look up.
 * @returns The ids in document order; missing or whitespace-only ids are
 *   dropped, kept ids are returned untrimmed.
 */
function anchorsFromId(document: HTMLDocument, tag: string) {
  const ids: string[] = [];
  document.getElementsByTagName(tag).forEach((element) => {
    const id = element.getAttribute("id");
    if (id != null && id.trim() !== "") ids.push(id);
  });
  return ids;
}
// Scan the current working directory first; this fills `allAnchors`, `links`,
// `usedAnchors` and `issues`, which the statements below consume.
await findLinksFromFiles(".");
/**
 * Rewrites a URL, where necessary, right before it is fetched.
 *
 * Currently only "t.me" links are affected: the host is swapped for
 * "telegram.me" (the original author's ISP blocks t.me) and a warning is
 * printed. All other URLs are returned unchanged.
 *
 * @param url The URL as found in the Markdown file.
 * @returns The URL that should actually be requested.
 */
function transformUrl(url: string) {
  const blockedHost = "://t.me/";
  if (!url.includes(blockedHost)) return url;

  warn("Changing t.me to telegram.me for convenience");
  return url.replace(blockedHost, "://telegram.me/");
}
/**
 * Fetches `url`, retrying when the request itself fails (i.e. `fetch` throws).
 *
 * At most MAX_RETRIES attempts are made (always at least one). When
 * RETRY_FAILED_FETCH is false the first failure is final. A response with an
 * error status is still a response and is returned as-is.
 *
 * @param url URL to request.
 * @param options Options forwarded to `fetch`.
 * @returns The response, or `undefined` if every attempt threw; in that case
 *   the last error is logged.
 */
async function retryFetch(url: string, options: FetchOptions) {
  const maxAttempts = Math.max(1, MAX_RETRIES);
  let lastError: unknown;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return await fetch(url, options);
    } catch (err) {
      lastError = err;
      if (!RETRY_FAILED_FETCH) break;
      // Only announce a retry when another attempt actually follows.
      if (attempt < maxAttempts) {
        log(`%cINFO%c Retrying (${attempt})`, "orange");
      }
    }
  }

  log(`%cFailed%c Couldn't get a proper response`, "red");
  console.log(lastError);
  return undefined;
}
// Manage external links
//
// Every distinct page (URL without its fragment) is fetched once. The response
// is checked for unexpected redirects and error statuses, and the anchors
// found in the returned HTML are stored in `allAnchors` so that the fragments
// used by the Markdown files can be verified afterwards.
for (const file in links) {
  for (const url_ of links[file]) {
    const [root, anchor] = url_.split("#");

    // An existing entry means an earlier link already triggered the fetch for
    // this page; only the anchor bookkeeping is needed then.
    const alreadyFetched = usedAnchors[root] != null;
    usedAnchors[root] ??= {};
    usedAnchors[root][file] ??= new Set();
    if (anchor != null) usedAnchors[root][file].add(anchor);
    if (alreadyFetched) continue;

    const url = transformUrl(url_);
    log(`%cFetching%c ${root}`, "blue");
    const response = await retryFetch(url, FETCH_OPTIONS);
    if (response == null) {
      // Nothing to compare against: drop the entry so its anchors are not
      // reported as missing, and so a later link to the same page retries.
      delete usedAnchors[root];
      continue;
    }

    if (response.redirected) {
      // `response.url` never contains a fragment, so compare against the
      // requested URL without one; otherwise the trailing-slash and query
      // allowances can never match for links that carry an anchor.
      const [requested] = url.split("#");
      if (!isValidRedirection(requested, response.url)) {
        issues[file] ??= [];
        issues[file].push({ type: "redirected", from: url_, to: response.url });
      }
    }

    if (!response.ok && ACCEPTABLE_NOT_OK_STATUS[url_] != response.status) {
      issues[file] ??= [];
      issues[file].push({ type: "not_ok", reference: url_ });
      log(
        `%cNOT OK%c response wasn't okay: ${response.status} ${response.statusText}`,
        "red",
      );
    }

    // for parsing the document we need to make sure its html.
    const contentType = response.headers.get("content-type");
    if (!contentType) {
      // `warn` adds the "WARN" prefix itself.
      warn("No content-type header, continuing anyway");
    } else if (!contentType.includes("text/html")) {
      warn(`Content-type is: ${contentType}, but let's just go with html`);
    }

    let document: HTMLDocument;
    try {
      const content = await response.text();
      const doc = domParser.parseFromString(content, "text/html");
      if (doc == null) throw new Error("no document, skipping");
      document = doc;
    } catch (err) {
      issues[file] ??= [];
      issues[file].push({ type: "parse_error", reference: url_ });
      log("%cERROR%c Couldn't parse the text (error below), skipping", "red");
      console.log(err);
      continue;
    }
    allAnchors[root] = getAnchors(document);
  }
}
// Missing anchors
//
// Compare every anchor that some file uses (`usedAnchors`) against the anchors
// actually found in the target document (`allAnchors`) and report the ones
// that are absent.

/**
 * Percent-decodes an anchor (anchors in other languages are usually encoded).
 * A malformed escape sequence makes `decodeURIComponent` throw; in that case
 * the anchor is used as written instead of aborting the whole run.
 */
function decodeAnchor(anchor: string): string {
  try {
    return decodeURIComponent(anchor);
  } catch {
    return anchor;
  }
}

for (const root in usedAnchors) {
  // A target without a recorded anchor set is treated as having no anchors.
  const all = allAnchors[root] ?? new Set<string>();
  for (const file in usedAnchors[root]) {
    for (const anchor of usedAnchors[root][file]) {
      const decodedAnchor = decodeAnchor(anchor);
      if (
        all.has(decodedAnchor) ||
        isValidAnchor(root, all, decodedAnchor)
      ) continue;
      issues[file] ??= [];
      issues[file].push({
        type: "missing_anchor",
        root,
        anchor: decodedAnchor,
      });
    }
  }
}
/**
 * Tells whether a redirect is harmless and therefore should not be reported.
 *
 * The rules are tried in order and the first match wins.
 *
 * @param from The URL that was requested.
 * @param to The URL the response finally came from.
 * @returns `true` if the redirect is acceptable.
 * @throws TypeError if neither of the first two rules matches and `from` is
 *   not a valid absolute URL (it is parsed for rule 3).
 */
function isValidRedirection(from: string, to: string) {
  // CASE 1: an unversioned deno.land/x third-party module is expected to be
  // redirected to its latest version.
  const isUnversionedModule = from.includes("deno.land/x/") && !from.includes("@");
  if (isUnversionedModule && to.includes("@")) return true;

  // CASE 2: a Deno manual link is expected to be redirected to the latest version.
  if (from.includes("deno.com/manual/") && to.includes("@")) return true;

  // CASE 3: short youtu.be links redirecting to youtube.com links, i.e. the
  // path of `from` shows up as the "?v=" parameter of `to`.
  const origin = new URL(from).origin;
  const asVideoQuery = from.replace(origin + "/", "?v=");
  if (to.includes(asVideoQuery)) return true;

  // CASE 4: a trailing slash was added or removed.
  if (to + "/" == from || from + "/" == to) return true;

  // CASE 5: search params were appended, e.g. a language code.
  if (to.includes(from + "?")) return true;

  // CASE 6: login redirections, e.g. firebase console -> Google sign-in, or GitHub.
  const isGoogleSignIn = to.includes("accounts.google.com") && to.includes("signin");
  const isGithubLogin = to.includes("github.com/login?return_to=");
  return isGoogleSignIn || isGithubLogin;
}
/**
 * Second-chance check for an anchor that was not found verbatim, covering
 * sites whose fetched HTML differs from what a browser ends up showing.
 *
 * Firebase (and Google docs in general) lazy-load content, which sometimes
 * messes up the fetched response. As a workaround, on
 * firebase.google.com/docs pages an anchor is accepted when the same name
 * with a "_1" suffix is present.
 *
 * @param root The linked URL or file path, without fragment.
 * @param all Anchors found in the linked document.
 * @param anchor The anchor being looked for.
 * @returns `true` if the anchor should be considered present.
 */
function isValidAnchor(root: string, all: Set<string>, anchor: string) {
  const isFirebaseDocs = root.includes("firebase.google.com/docs");
  return isFirebaseDocs && all.has(anchor + "_1");
}
/**
 * Prints a warning with a yellow "WARN" prefix.
 *
 * @param text Message to print; the prefix is added here, so callers pass
 *   the bare message.
 */
function warn(text: string) {
  const message = "%cWARN%c " + text;
  const prefixStyle = "color: yellow";
  const resetStyle = "color: none";
  console.warn(message, prefixStyle, resetStyle);
}
/**
 * Prints a message whose first `%c...%c` span is shown in the given color.
 *
 * @param text Message containing two `%c` directives: the first switches to
 *   `color`, the second switches back.
 * @param color CSS color used for the highlighted span.
 */
function log(text: string, color: string) {
  const highlightStyle = "color: " + color;
  const resetStyle = "color: none";
  console.log(text, highlightStyle, resetStyle);
}
// REPORT
//
// Print every collected issue grouped by file (files sorted by name), followed
// by a per-type summary. The process exits with status 1 if anything was found.

/** Number of issues found: overall, and one counter per issue type. */
const issueCounts = {
  total: 0,
  missing_anchor: 0,
  html_instead_of_md: 0,
  file_not_found: 0,
  redirected: 0,
  not_ok: 0,
  parse_error: 0,
};
const sortedFiles = Object.keys(issues).sort((left, right) =>
  left.localeCompare(right)
);
/**
 * Percent-decodes text for display. Malformed input would make
 * `decodeURIComponent` throw, so it is shown as written instead.
 */
const d = (text: string): string => {
  try {
    return decodeURIComponent(text);
  } catch {
    return text;
  }
};
for (const file of sortedFiles) {
  const issueList = issues[file];
  issueCounts.total += issueList.length;
  let report = r(`\n${file} (${issueList.length})`);
  for (const issue of issueList) {
    issueCounts[issue.type]++;
    report += `\n ${gray("-->")} `;
    // deno-fmt-ignore
    switch (issue.type) {
      case "missing_anchor":
        // The anchor is stored already decoded; decoding it a second time
        // could corrupt it (or throw on a literal "%").
        report += `${c(issue.root)} does not have an anchor ${b(issue.anchor)}.`;
        break;
      case "html_instead_of_md": {
        const [root, anchor] = issue.reference.split("#");
        // Links without a fragment must not be printed as "...#undefined".
        const shown = anchor == null ? root : `${root}#${d(anchor)}`;
        report += `The "${b(root)}" in ${b(shown)} should be ending with ".md" instead of ".html".`;
        break;
      }
      case "file_not_found": {
        const [root] = issue.reference.split("#");
        report += `The linked file ${c(root)} does not exist.`;
        break;
      }
      case "redirected":
        report += `${c(issue.from)} was redirected to ${c(issue.to)}.`;
        break;
      case "not_ok":
        report += `${c(issue.reference)} returned a non-ok status code.`;
        break;
      case "parse_error":
        report += `Couldn't parse the document at ${b(issue.reference)}.`
        break;
    }
  }
  console.log(report);
}
// Width of the largest possible count, used to right-align the numbers.
const maxDistance = issueCounts.total.toString().length;
/** Right-aligns a count to the width of the total. */
function pad(x: number) {
  return x.toString().padStart(maxDistance, " ");
}
console.log(`
SUMMARY
--------------------------${"-".repeat(maxDistance)}
Missing anchors : ${pad(issueCounts.missing_anchor)}
Used html instead of md : ${pad(issueCounts.html_instead_of_md)}
Links to missing files : ${pad(issueCounts.file_not_found)}
Redirected : ${pad(issueCounts.redirected)}
Not OK response : ${pad(issueCounts.not_ok)}
DOM parsing failed : ${pad(issueCounts.parse_error)}
--------------------------${"-".repeat(maxDistance)}
Total : ${issueCounts.total}`);
// A non-zero exit status lets a build pipeline stop on broken links.
if (issueCounts.total > 0) {
  Deno.exit(1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment