WorkersAI: utility to extract LLM model data from Cloudflare Workers AI docs

/*!
 * Run this script with Deno to extract all models from the Workers AI
 * documentation site (https://developers.cloudflare.com/workers-ai/models).
 *
 * @example
 * ```sh
 * # install Deno if you do not have it already
 * curl -fsSL https://deno.land/install.sh | sh -
 *
 * # run the script
 * deno run --allow-net ./extract-workers-ai-models.ts
 *
 * # example output:
 * # [
 * #   {
 * #     name: "@hf/thebloke/llama-2-13b-chat-awq",
 * #     urls: [
 * #       [
 * #         "https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ",
 * #         "More information"
 * #       ]
 * #     ],
 * #     info: "Llama 2 13B Chat AWQ is an efficient, accurate"... 117 more characters,
 * #     data: {
 * #       default_max_sequence_tokens_stream: 512,
 * #       default_max_sequence_tokens: 256
 * #     }
 * #   },
 * #   ...
 * # ]
 * ```
 */
import {
  Document,
  DOMParser,
  Element,
} from "https://deno.land/x/deno_dom@v0.1.45/deno-dom-wasm.ts";
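
/**
 * A single Workers AI model as scraped from the documentation tables.
 *
 * @example
 * ```ts
 * // shape illustration; the values mirror the sample output in the header
 * // comment above (the `info` string is truncated here)
 * const example: Model = {
 *   name: "@hf/thebloke/llama-2-13b-chat-awq",
 *   info: "Llama 2 13B Chat AWQ is an efficient, accurate ...",
 *   urls: [
 *     ["https://huggingface.co/TheBloke/Llama-2-13B-chat-AWQ", "More information"],
 *   ],
 *   data: {
 *     default_max_sequence_tokens_stream: 512,
 *     default_max_sequence_tokens: 256,
 *   },
 * };
 * ```
 */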
export interface Model {
  readonly name: string;
  readonly info: string;
  readonly urls: (readonly [url: string | URL, text: string])[];
  readonly data: Record<string, number>;
}

/**
 * Fetches the HTML content for a given URL.
 *
 * @param url The URL to fetch.
 * @param referrer The referrer URL to send with the request.
 * @returns The HTML content of the fetched URL.
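 *
 * @example
 * ```ts
 * // a usage sketch; the referrer below is illustrative
 * const html = await loadHTML(
 *   "https://developers.cloudflare.com/workers-ai/models/text-generation",
 *   "https://developers.cloudflare.com/workers-ai/models",
 * );
 * ```
 */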
async function loadHTML(url: string, referrer: string): Promise<string> {
  const response = await fetch(url, {
    headers: {
      "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    },
    method: "GET",
    mode: "cors",
    redirect: "follow",
    referrer,
  });
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.statusText}`);
  }
  return await response.text();
}

const parser = new DOMParser();
// note: keep `parseFromString` bound to the parser instance; destructuring the
// method off the instance would lose its `this` binding and break at call time
const parseHTML = (html: string): Document =>
  parser.parseFromString(html, "text/html")!;

async function loadDocument(url: string | URL, referrer = url): Promise<Document> {
  const html = await loadHTML(url.toString(), referrer.toString());
  return parseHTML(html);
}
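
// `loadDocument` is a convenience wrapper for inspecting a page directly; a
// usage sketch (the slug below is one of the category slugs in `listAllModels`):
//
//   const doc = await loadDocument(
//     "https://developers.cloudflare.com/workers-ai/models/text-generation",
//   );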

/**
 * Extracts model information from a given HTML document.
 *
 * @param html The document HTML from which to extract models.
 * @returns The list of models extracted from the document.
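 *
 * @example
 * ```ts
 * // a sketch: parse a previously saved copy of a models page (hypothetical
 * // file name; needs --allow-read)
 * const html = await Deno.readTextFile("./text-generation.html");
 * for (const model of extractModels(html)) console.log(model.name);
 * ```
 */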
function extractModels(html: string): Model[] {
  const document = parseHTML(html);
  return Array.from(
    document.querySelectorAll(
      `#main > article > table > tbody > tr > td > code`,
    ),
  ).map((c) => {
    const cell = c as Element;
    const name = cell.textContent!;
    // walk to the adjacent cell holding this model's description, skipping any
    // whitespace-only text nodes between the two cells
    let rhs = cell.parentNode!.nextSibling as Element;
    while (rhs.nodeName !== "TD") rhs = rhs.nextSibling as Element;
    const rhsLinks = rhs.querySelectorAll("a[href]");
    const uselessRegExp = /(?:Open external link|External link icon)\n?/g;
    const usefulRegExp = /(More information|Terms and license)\n?/g;
    const urlRegExp = /https?:\/\/\S+/g;
    const rhsText = rhs.textContent!.trim().replaceAll(uselessRegExp, "");
    // prefer real anchor elements; fall back to bare URLs found in the text
    const urls = rhsLinks.length
      ? Array.from(rhsLinks).map((a) => {
        const el = a as Element;
        return [
          el.getAttribute("href")!,
          el.textContent!.match(usefulRegExp)?.[0] ?? "More information",
        ] as [string, string];
      })
      : Array.from(
        rhsText.matchAll(urlRegExp),
        (m) => [m[0], "More information"] as [string, string],
      );
    // turn each "More information" label into a markdown link to its URL
    const info = rhsText.replaceAll(
      /(More information)\s*/mg,
      `[$1](${urls[0]?.[0] ?? "#"})`,
    );
    // collect numeric metadata rendered as "<strong>Key</strong>: 123", e.g.
    // "<strong>Default max (sequence tokens)</strong>: 256" becomes
    // { default_max_sequence_tokens: 256 } after key normalization
    const data = Array.from(
      rhs.innerHTML.matchAll(/<strong>(.+?)<\/strong>:\s*(\d+)/g),
    ).reduce((acc, [, key, value]) => {
      const normalizedKey = key.toLowerCase().replace(/[^a-z0-9_]+/g, "_")
        .replace(/^_|_$|(?<=_)_+/g, "");
      acc[normalizedKey] = parseInt(value, 10);
      return acc;
    }, {} as Record<string, number>);
    return { name, urls, info, data };
  });
}

/**
 * Fetches and parses models from each category.
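 *
 * @example
 * ```ts
 * // a usage sketch: collect everything and persist it to disk (hypothetical
 * // output path; needs --allow-write in addition to --allow-net)
 * const models = await listAllModels();
 * await Deno.writeTextFile("models.json", JSON.stringify(models, null, 2));
 * ```
 */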
export async function listAllModels(): Promise<Model[]> {
  const categories: [string, string][] = [
    ["text-generation", "Text Generation"],
    ["speech-recognition", "Automatic Speech Recognition"],
    ["translation", "Translation"],
    ["text-classification", "Text Classification"],
    ["image-classification", "Image Classification"],
    ["text-to-image", "Text-to-Image"],
    ["text-embeddings", "Text Embeddings"],
  ];
  const baseUrl = "https://developers.cloudflare.com/workers-ai/models";
  const models: Model[] = [];
  for (const [slug] of categories) {
    const url = `${baseUrl}/${slug}`;
    try {
      const html = await loadHTML(url, baseUrl);
      const categoryModels = extractModels(html);
      models.push(...categoryModels);
    } catch { /* ignore categories that fail to fetch or parse */ }
  }
  console.log("All models fetched:", models.length);
  return models;
}

if (import.meta.main) {
  console.log(await listAllModels());
}