Created
November 5, 2020 12:22
-
-
Save PaulCapron/98864c20f74cf982fd70f5715210f091 to your computer and use it in GitHub Desktop.
Preview Wikipedia links of an HTML document (via the `title` attribute)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * @file Preview Wikipedia links 📖🔗👀
 *
 * @see <https://en.wikipedia.org/api/rest_v1/>
 *
 * @version 2020-11
 * @since 2018-08
 * @author <https://paul.fragara.com/#me>
 * @license CC0-1.0
 *  The author has dedicated all rights to this software to the public domain.
 *  This software is distributed without any warranty.
 */
"use strict";
/**
 * Handle the few special cases in Wikipedia language/host names.
 *
 * The lookup is case-insensitive. A code that is not one of the special
 * cases is returned exactly as it was passed in (its case is not changed).
 *
 * @see <https://meta.wikimedia.org/wiki/List_of_Wikipedias#Nonstandard_language_codes>
 * @see <https://www.ietf.org/rfc/bcp/bcp47.txt>
 *
 * @param {!string} languageCode (Human) language, in Wikipedia naming
 * @return {!string} A corresponding BCP 47 language tag
 */
function bcp47FromWikipedia(languageCode) {
  switch (languageCode.toLowerCase()) {
    case "simple": return "en";
    case "nrm": return "nrf";
    case "bat-smg": return "sgs";
    case "roa-rup": return "rup";
    case "fiu-vro": return "vro";
    case "zh-yue": return "yue";
    case "zh-min-nan": return "nan";
    case "zh-classical": return "lzh";
    default: return languageCode; // already usable as-is
  }
}
/**
 * Query Wikipedia to put a summary, in the ‘title’ attribute,
 * of any HTMLAnchorElement pointing to one of its articles.
 *
 * An element is left untouched when it already has a non-empty title, when
 * its `href` is not a string matching `regexp`, or when it does not match the
 * `:lang()` selector built from the linked Wikipedia’s language.
 * Titles are set asynchronously, once the corresponding summary is available;
 * when a summary cannot be fetched or parsed, a warning is logged and the
 * element keeps its empty title.
 *
 * @see <https://www.mediawiki.org/wiki/Specs/Summary/1.3.0>
 * @see <https://en.wikipedia.org/api/rest_v1/#/Page%20content/get_page_summary__title_>
 *
 * @param {!Iterable<Element>=} elts Link elements, non-Wikipedia href get filtered
 * @param {!Map<string, Promise<string>>=} memo (Previously recorded) fetched summaries
 * @param {RegExp=} regexp Matches Wikipedia URLs, extracts language & article name (“slug”)
 * @param {number=} reqMax Max API calls; Wikimedia asks for ≤ 200 requests per second
 * @param {!RequestInit=} reqParams Fetch parameters
 * @return {!Map<string, Promise<string>>} (Updated) fetched summaries
 */
function titleWikipediaAnchors(
  elts = document.getElementsByTagName("a"),
  memo = new Map,
  regexp = /^https?:\/\/([a-z\-]{2,64})\.wikipedia\.org\/wiki\/([^?#]+)/i,
  reqMax = 100,
  reqParams = {
    "headers": {
      "Accept": "application/json; charset=utf-8; "
        + 'profile="https://www.mediawiki.org/wiki/Specs/Summary/1.4.2"',
    },
    "cache": "force-cache", // HTTP-expired data is totally OK
    "credentials": "omit",
    "mode": "cors",
    // Wikimedia asks to “Set a unique User-Agent […] to contact you quickly.”
    // By using CORS, the ‘Origin’ header points to us, the embedding site.
  }
) {
  for (const elt of elts) {
    if (elt.title !== "") continue; // never overwrite an author-provided title
    // Elements without a string href cannot be Wikipedia links; skipping them
    // avoids a TypeError that would abort the whole loop.
    if (typeof elt.href !== "string") continue;

    const parts = elt.href.match(regexp);
    if (parts === null) continue;

    const [ , lang, slug] = parts;
    // Only title links whose own language matches the linked Wikipedia’s.
    if (!elt.matches(`:lang(${bcp47FromWikipedia(lang)})`)) continue;

    // The slug is a single path segment for the API: inner slashes are escaped.
    const url = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/`
      + slug.replace(/\//g, "%2F");

    if (!memo.has(url)) {
      if (memo.size >= reqMax) continue; // request budget exhausted
      memo.set(url, fetch(url, reqParams)
        .then(resp => resp.json())
        .then(json => ((json.titles && json.titles.normalized) || slug)
          + (json.description ? ` (${json.description})` : "")));
    }

    // The memoized promise itself is left as-is (it may reject); only this
    // derived chain is guarded, so a failure never surfaces as an unhandled
    // rejection and simply leaves the title empty.
    memo.get(url)
      .then(txt => { elt.title = txt; })
      .catch(err => { console.warn("wkpd: no summary for", url, err); });
  }
  return memo;
}
// Run once over every link of the current document and log the map of fetched summaries.
console.log("wkpd: fetched", titleWikipediaAnchors());
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment