// Last active June 3, 2023 14:50
// Script for validating links in grammyjs/website before building the website.
import { extname, join } from "";
import {
} from "";
import MarkdownIt from "";
import anchorPlugin from "";
import { slugify } from "";
import {
blue as b,
cyan as c,
red as r,
} from "";
// Type of the second (options) argument accepted by the global `fetch`.
type FetchOptions = Parameters<typeof fetch>[1];
// Appended by resolveRelativeLink to relative links that do not end in ".md".
// NOTE(review): the value is an empty string in this view — presumably the
// name of a directory's index file; confirm and restore.
const INDEX_FILE = "";
// NOTE(review): not referenced anywhere in the visible code — confirm whether
// retryFetch is supposed to honour it.
const RETRY_FAILED_FETCH = true;
// Upper bound that retryFetch's loop condition compares its retry counter to.
const MAX_RETRIES = 5;
// some sites just ... ehh
// Maps a URL to the one non-OK status that is tolerated for it: the external
// link check only reports "not_ok" when the response status differs from the
// value stored here for that URL.
// NOTE(review): both keys are empty strings (so they collide) and the closing
// `};` is not visible — the original URLs appear to have been lost.
const ACCEPTABLE_NOT_OK_STATUS: Record<string, number> = {
"": 403,
"": 403,
// Options handed to retryFetch for every external URL.
// NOTE(review): the first entry of `headers` has a value but no key
// (presumably "User-Agent" — confirm), and the closing braces of `headers`
// and of the object itself are not visible.
const FETCH_OPTIONS: FetchOptions = {
headers: {
"Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0",
"Pragma": "no-cache",
"Cache-Control": "no-cache",
method: "GET",
mode: "cors",
// Parses HTML text (rendered markdown and fetched pages) into DOM documents.
const domParser = new DOMParser();
// Markdown parser/renderer. The anchor plugin is configured with the imported
// `slugify` so generated heading ids follow the same scheme as the site
// (per the original note: this is what vuepress uses).
const md = MarkdownIt({ html: true, linkify: true })
.use(anchorPlugin, { slugify }); // this is what vuepress uses.
// All anchors that are actually present in the file or website.
// Keys are local file paths (set by findLinksFromFiles) or external URLs
// without their fragment (set by the external-link loop).
const allAnchors: Record<string, Set<string>> = {};
// External links collected per file. Key: path of the file containing the
// links; value: the links found in that file.
const links: Record<string, Set<string>> = {};
// Anchors that are referenced somewhere, nested as
//   usedAnchors[target][referrer] = set of anchors
// where `target` is the linked file path or URL (without fragment) and
// `referrer` is the path of the file that mentions the anchor.
const usedAnchors: Record<string, Record<string, Set<string>>> = {};
/** An issue that is fully described by the offending link or URL. */
interface GeneralIssue {
  type:
    | "html_instead_of_md"
    | "file_not_found"
    | "not_ok"
    | "parse_error";
  /** The link as written in the file, or the external URL that was fetched. */
  reference: string;
}

/** An external URL that answered with a redirect that is not on the allow-list. */
interface RedirectedIssue {
  type: "redirected";
  /** The URL as written in the file. */
  from: string;
  /** The URL the response finally came from. */
  to: string;
}

/** A link that points at an anchor its target does not contain. */
interface MissingAnchorIssue {
  type: "missing_anchor";
  /** The linked file path or URL, without the fragment. */
  root: string;
  /** The anchor that could not be found in `root`. */
  anchor: string;
}

/** Every kind of problem the script can report, discriminated by `type`. */
type Issue = GeneralIssue | RedirectedIssue | MissingAnchorIssue;

/** Issues found so far, keyed by the path of the file that contains the link. */
const issues: Record<string, Issue[]> = {};
/**
 * Recursively walks `directory`. For every entry that is a file with the
 * ".md" extension it parses and renders the markdown, stores the anchors of
 * the rendered document in `allAnchors`, and classifies each link returned by
 * `filterLinks`:
 *  - links starting with "http" are external (tracked in `links`),
 *  - links starting with "." are handed to `resolveRelativeLink`,
 *  - links starting with "#" point into the same file (tracked in `usedAnchors`).
 * Any other link makes the function throw. Sub-directories are visited
 * recursively.
 *
 * NOTE(review): this function is truncated in this view — closing braces are
 * missing, `join(` and `extname(` have lost an argument, and the statements
 * that add the link to `links[path]` / `usedAnchors[path][path]` are not
 * visible. Restore them from the original before relying on this code.
 */
async function findLinksFromFiles(directory: string) {
for await (const dirEntry of Deno.readDir(directory)) {
// NOTE(review): second argument to `join` is missing (presumably the entry's name — confirm).
const path = join(directory,;
if (dirEntry.isFile) {
// NOTE(review): argument to `extname` is missing (presumably the entry's name — confirm).
if (extname( != ".md") continue;
const content = await Deno.readTextFile(path);
// Tokens are used to extract links; the rendered HTML is used to extract anchors.
const tokens = md.parse(content, {});
const html = md.render(content, {});
const document = domParser.parseFromString(html, "text/html");
if (document == null) {
throw new Error("Document seems to be empty: shouldn't happen");
allAnchors[path] = getAnchors(document);
const filtered = filterLinks(tokens);
for (const link of filtered) {
if (link.startsWith("http")) { // external link.
// Only creates the set here; the statement that stores `link` is not visible.
links[path] ??= new Set();
} else if (link.startsWith(".")) { // relative path to a file.
await resolveRelativeLink(directory, path, link);
} else if (link.startsWith("#")) { // anchor to the same file.
// The file is both the target and the referrer, hence the [path][path] index.
usedAnchors[path] ??= {};
usedAnchors[path][path] ??= new Set();
} else { // some other type -- MUST be an invalid one
throw new Error("Different type of link " + link);
} else if (dirEntry.isDirectory) {
await findLinksFromFiles(path);
/**
 * Collects the `href` of every `link_open` token in `tokens`, descending into
 * each token's `children`, in document order.
 *
 * @param tokens Token list as returned by `md.parse`.
 * @returns The hrefs found; `link_open` tokens without an `href` are skipped.
 */
function filterLinks(tokens: ReturnType<typeof md.parse>) {
  const links: string[] = [];
  for (const token of tokens) {
    if (token.type === "link_open") {
      const href = token.attrGet("href");
      if (href != null) links.push(href);
    }
    if (token.children != null) {
      // Links written inline sit in the child tokens of their parent token,
      // so the children have to be searched as well.
      for (const href of filterLinks(token.children)) links.push(href);
    }
  }
  return links;
}
/**
 * Checks a relative link found in the file `path` and records the outcome.
 *
 * - A target ending in ".html" is reported as "html_instead_of_md" and then
 *   checked as if it had been written with ".md".
 * - A target that does not end in ".md" is treated as a directory and
 *   `INDEX_FILE` is appended to it.
 * - If the resolved file does not exist, "file_not_found" is reported.
 * - Otherwise, when the link has a fragment, the anchor is registered in
 *   `usedAnchors` so it can be verified once all anchors are known.
 *
 * @param directory Directory of the file that contains the link.
 * @param path Path of the file that contains the link.
 * @param link The link exactly as written (may include a `#fragment`).
 * @throws Any error from `Deno.lstat` other than `Deno.errors.NotFound`.
 */
async function resolveRelativeLink(
  directory: string,
  path: string,
  link: string,
) {
  const htmlExtension = ".html";
  const [target, anchor] = link.split("#");
  let root = target;
  if (root.endsWith(htmlExtension)) {
    issues[path] ??= [];
    issues[path].push({ type: "html_instead_of_md", reference: link });
    // Swap only the trailing extension; a plain string replace would rewrite
    // the first ".html" anywhere in the path.
    root = root.slice(0, -htmlExtension.length) + ".md";
  }
  if (!root.endsWith(".md")) {
    if (!root.endsWith("/")) root += "/";
    root += INDEX_FILE;
  }
  const relativePath = join(directory, root);
  try {
    await Deno.lstat(relativePath);
  } catch (error) {
    if (error instanceof Deno.errors.NotFound) {
      issues[path] ??= [];
      issues[path].push({ type: "file_not_found", reference: link });
      // A missing file is a reported issue, not a failure of the script.
      return;
    }
    throw error;
  }
  if (anchor == null) return;
  // Record that `path` refers to `anchor` inside `relativePath`.
  usedAnchors[relativePath] ??= {};
  usedAnchors[relativePath][path] ??= new Set();
  usedAnchors[relativePath][path].add(anchor);
}
/**
 * Returns every anchor a document exposes: the `id`s of its section, heading
 * and div elements (via `anchorsFromId`) plus the fragment of every
 * `href="#…"` it contains.
 *
 * NOTE(review): the element lookup feeding the `href` chain was not visible
 * in the reviewed source; `<a>` elements are assumed here — confirm.
 *
 * @param document Parsed HTML document.
 * @returns Anchor names without the leading "#".
 */
function getAnchors(document: HTMLDocument): Set<string> {
  const anchors: string[] = [];
  const tags = ["section", "h1", "h2", "h3", "h4", "h5", "h6", "div"];
  for (const tag of tags) anchors.push(...anchorsFromId(document, tag));
  const fragments = Array.from(document.getElementsByTagName("a"))
    .map((element) => element.getAttribute("href"))
    // Keep only same-page fragments; a bare "#" names no anchor.
    .filter((href): href is string =>
      href != null && href.startsWith("#") && href.length > 1
    )
    .map((href) => href.substring(1));
  return new Set([...anchors, ...fragments]);
}
/**
 * Returns the non-blank `id` attribute of every `tag` element in `document`.
 *
 * @param document Parsed HTML document.
 * @param tag Tag name to look up.
 * @returns The ids, in document order; missing or whitespace-only ids are dropped.
 */
function anchorsFromId(document: HTMLDocument, tag: string) {
  // Array.from accepts both a plain array and an array-like element collection.
  return Array.from(document.getElementsByTagName(tag))
    .map((element) => element.getAttribute("id"))
    .filter((id): id is string => id != null && id.trim() !== "");
}
// Local pass: scan everything below the current working directory. This fills
// `allAnchors`, `links`, `usedAnchors` and `issues` before the external links
// are fetched further down.
await findLinksFromFiles(".");
/**
 * Transform the URL, if needed, before fetching.
 *
 * NOTE(review): the string literals in this function look truncated in this
 * view. As written, the condition matches any URL containing "://", the
 * replacement swaps "://" for itself (a no-op), and the warning text names
 * neither host. The closing braces are also not visible. Restore the original
 * host names before relying on this function.
 */
function transformUrl(url: string) {
if (url.includes("://")) { // My ISP have blocked :(
warn("Changing to for convenience");
url = url.replace("://", "://");
return url;
/**
 * Fetches `url`, trying again whenever `fetch` itself throws, up to
 * `MAX_RETRIES` attempts in total.
 *
 * A response with an error status is still a response and is returned as-is;
 * only a thrown `fetch` counts as a failed attempt.
 *
 * @param url URL to request.
 * @param options Options forwarded to `fetch`.
 * @returns The response, or `undefined` when every attempt threw.
 */
async function retryFetch(url: string, options: FetchOptions) {
  let retries = 0;
  let response: Response | undefined;
  let error: unknown;
  do {
    try {
      response = await fetch(url, options);
    } catch (err) {
      error = err;
      log(`%cINFO%c Retrying (${retries + 1})`, "orange");
      // Count the failed attempt so the loop is bounded by MAX_RETRIES.
      retries++;
    }
  } while (retries < MAX_RETRIES && response == null);
  if (response == null) {
    log(`%cFailed%c Couldn't get a proper response`, "red");
    // Surface the last failure so the cause is not lost.
    console.error(error);
  }
  return response;
}
// Manage external links: every distinct external page (URL without fragment)
// is fetched once; redirect, status and parse problems are recorded against
// the file that contains the link, and the anchors found in the page are
// stored in `allAnchors` for the missing-anchor check.
// NOTE(review): this loop is truncated in this view — closing braces are
// missing, the failure branches presumably ended with `continue` (not
// visible), and the "NOT OK" template literal has lost its enclosing call.
// Restore those pieces from the original before relying on this code.
for (const file in links) {
for (const url_ of links[file]) {
// `root` is the URL without its fragment; `anchor` is undefined when the URL has no "#".
const [root, anchor] = url_.split("#");
// An existing usedAnchors[root] entry doubles as the "already fetched" marker.
if (usedAnchors[root] == null) {
usedAnchors[root] = {};
usedAnchors[root][file] ??= new Set();
if (anchor != null) usedAnchors[root][file].add(anchor);
} else {
usedAnchors[root][file] ??= new Set();
if (anchor != null) usedAnchors[root][file].add(anchor);
continue; // already fetched once.
const url = transformUrl(url_);
log(`%cFetching%c ${root}`, "blue");
const response = await retryFetch(url, FETCH_OPTIONS);
// No response at all: forget the page so its anchors are not checked.
if (response == null) {
delete usedAnchors[root];
if (response.redirected) {
// Redirects accepted by isValidRedirection are not reported.
if (!isValidRedirection(url, response.url)) {
issues[file] ??= [];
issues[file].push({ type: "redirected", from: url_, to: response.url });
// A non-OK status is tolerated only when it equals the status listed for this URL.
if (!response.ok && ACCEPTABLE_NOT_OK_STATUS[url_] != response.status) {
issues[file] ??= [];
issues[file].push({ type: "not_ok", reference: url_ });
`%cNOT OK%c response wasn't okay: ${response.status} ${response.statusText}`,
// for parsing the document we need to make sure its html.
const contentType = response.headers.get("content-type");
if (!contentType) {
// NOTE(review): `warn` already prepends "%cWARN%c", so this message carries the prefix twice.
warn(`%cWARN%c No content-type header, continuing anyway`);
} else if (!contentType.includes("text/html")) {
warn(`Content-type is: ${contentType}, but let's just go with html`);
let document: HTMLDocument;
try {
const content = await response.text();
const doc = domParser.parseFromString(content, "text/html");
if (doc == null) throw new Error("no document, skipping");
document = doc;
} catch (err) {
issues[file] ??= [];
issues[file].push({ type: "parse_error", reference: url_ });
log("%cERROR%c Couldn't parse the text (error below), skipping", "red");
// Anchors of the fetched page, keyed by the URL without its fragment.
allAnchors[root] = getAnchors(document);
// Missing anchors: for every target (`root`) and every file that refers to
// it, report each referenced anchor that is neither present in the target's
// known anchors nor accepted by isValidAnchor.
// NOTE(review): truncated in this view — the `issues[file].push({` line, the
// `root` field required by MissingAnchorIssue and the closing braces are not
// visible.
for (const root in usedAnchors) {
// Targets without collected anchors are treated as having none.
const all = allAnchors[root] ?? new Set();
for (const file in usedAnchors[root]) {
for (const anchor of usedAnchors[root][file]) {
// Anchors are compared in decoded form.
const decodedAnchor = decodeURIComponent(anchor); // there are other langs
if (
all.has(decodedAnchor) ||
isValidAnchor(root, all, decodedAnchor)
) continue;
issues[file] ??= [];
type: "missing_anchor",
anchor: decodedAnchor,
/**
 * Some redirections are okay, so we ignore those changes.
 *
 * Returns true when the redirect from `from` to `to` matches one of the
 * accepted cases listed below.
 *
 * NOTE(review): this function is truncated in this view. The host literals
 * in cases 1, 2 and 6 are empty strings, and `includes("")` is true for every
 * string, so the conditions no longer discriminate anything; the opening
 * parentheses of cases 1 and 2 and the end of the function are not visible.
 * Case 3 also appears to have lost part of its replacement string. Restore
 * the literals from the original before relying on this function.
 * Note that `new URL(from)` in case 3 throws if `from` is not a valid URL.
 */
function isValidRedirection(from: string, to: string) {
return (
// CASE 1:
from.includes("") && // a third-party module
!from.includes("@") && // supposed to be redirected to the latest version
to.includes("@") // and it does get redirected
) ||
// CASE 2:
from.includes("") && // deno manual link: supposed to be redirected to the latest
to.includes("@") // and does get redirected to the latest.
) ||
// CASE 3: short links redirecting to links.
to.includes(from.replace(new URL(from).origin + "/", "?v=")) ||
// CASE 4: maybe a slash was removed or added --> I don't think we should care.
((to + "/" == from) || (from + "/" == to)) ||
// CASE 5: maybe some search params was appended --> like a language code?
to.includes(from + "?") ||
// CASE 6: Login redirections; e.g., firebase console -> google login
((to.includes("") && to.includes("signin")) || // Google
(to.includes(""))) // Github
/**
 * Some anchors might be missing due to how the content is loaded in the website.
 *
 * Fallback used when `anchor` is not in `all`: for matching roots, the anchor
 * is accepted if `all` contains the same name with a "_1" suffix.
 *
 * NOTE(review): the host literal in the condition is an empty string in this
 * view, so the condition is true for every `root`; the closing braces are
 * also not visible. Restore the original host before relying on this.
 */
function isValidAnchor(root: string, all: Set<string>, anchor: string) {
// firebase (generally google) docs sometimes messes up the response
// from the fetch as the contents are lazy loaded. the following is a hack:
if (root.includes("")) {
return all.has(anchor + "_1");
return false;
/**
 * Prints a warning prefixed with a "WARN" tag.
 *
 * @param text Message printed after the tag; callers should not add their own
 *   "%cWARN%c" prefix.
 */
function warn(text: string) {
  // The two trailing arguments are the styles for the two `%c` directives:
  // the first applies to the tag, the second to the message after it.
  console.warn(`%cWARN%c ${text}`, "color: yellow", "color: none");
}
/**
 * Prints a message whose leading tag is coloured.
 *
 * @param text Message containing two `%c` directives, e.g. "%cTAG%c rest".
 * @param color CSS colour applied from the first `%c` up to the second one.
 */
function log(text: string, color: string) {
  // First style colours the tag; the second is applied to the rest of the message.
  console.log(text, `color: ${color}`, "color: none");
}
// Per-type issue counters for the final summary; the keys other than `total`
// mirror the `type` values of the Issue union.
// NOTE(review): the closing `};` is not visible in this view.
const issueCounts = {
total: 0,
missing_anchor: 0,
html_instead_of_md: 0,
file_not_found: 0,
redirected: 0,
not_ok: 0,
parse_error: 0,
// Files with issues, ordered by path. (The callback's `b` parameter shadows
// the imported `b` colour helper inside the callback only.)
const sortedFiles = Object.keys(issues).sort((a, b) => a.localeCompare(b));
// Short alias used when printing anchors in the report.
const d = decodeURIComponent;
// Builds one report per file: a header with the file path and its issue
// count, followed by one line per issue whose wording depends on the issue
// type.
// NOTE(review): this loop is truncated in this view —
//  - the statement after `issues[file];` has lost its left-hand side
//    (presumably the running total — confirm),
//  - no `break` statements, counter updates, closing braces or output of
//    `report` are visible,
//  - the "redirected" case has lost the argument of its second `c(` call,
//  - `gray` is used but is not among the colour helpers imported above.
// Restore these from the original before relying on this code.
for (const file of sortedFiles) {
const issueList = issues[file]; += issueList.length;
let report = r(`\n${file} (${issueList.length})`);
for (const issue of issueList) {
report += `\n ${gray("-->")} `;
// deno-fmt-ignore
switch (issue.type) {
case "missing_anchor":
report += `${c(issue.root)} does not have an anchor ${b(d(issue.anchor))}.`;
case "html_instead_of_md": {
const [root, anchor] = issue.reference.split("#");
report += `The "${b(root)}" in ${b(`${root}#${d(anchor)}`)} should be ending with ".md" instead of ".html".`;
case "file_not_found": {
const [root] = issue.reference.split("#");
report += `The linked file ${c(root)} does not exist.`;
case "redirected":
report += `${c(issue.from)} was redirected to ${c(}.`;
case "not_ok":
report += `${c(issue.reference)} returned a non-ok status code.`;
case "parse_error":
report += `Couldn't parse the document at ${b(issue.reference)}.`
// Summary table: one row per issue type, with counts passed through `pad` so
// they are left-padded with spaces to a common width (`maxDistance`).
// NOTE(review): truncated in this view — the initializer of `maxDistance` is
// missing, `pad` is unterminated, the call and opening backtick of the
// template literal that contains the rows below are not visible, and the
// "Total" row has lost its expression. No comments are added between the rows
// because they are the content of that template literal.
const maxDistance =;
function pad(x: number) {
return x.toString().padStart(maxDistance, " ");
Missing anchors : ${pad(issueCounts.missing_anchor)}
Used html instead of md : ${pad(issueCounts.html_instead_of_md)}
Links to missing files : ${pad(issueCounts.file_not_found)}
Redirected : ${pad(issueCounts.redirected)}
Not OK response : ${pad(issueCounts.not_ok)}
DOM parsing failed : ${pad(issueCounts.parse_error)}
Total : ${}`);
if ( > 0) {
