Skip to content

Instantly share code, notes, and snippets.

@cup-dev
Created Dec 31, 2021
Embed
What would you like to do?
wikime: interesting Wikipedia pages
// --- usage: ---
// deno run --allow-net wikime.ts [NUM_PAGES] [NUM_RESULTS]
// arguments are optional
// --- about: ---
// finds random Wikipedia pages that are the most "interesting"
// where interesting means something like: has denser, more unique links compared to other pages
// MIT License
import * as Colors from "https://deno.land/std/fmt/colors.ts";
import { DOMParser } from "https://deno.land/x/deno_dom/deno-dom-wasm.ts";
import { parse } from "https://deno.land/std@0.113.0/flags/mod.ts";
import Kia from "https://deno.land/x/kia@v0.1.0/mod.ts";
const args = parse(Deno.args)._.flatMap((x: string | number) =>
Number.isInteger(x) && Number(x) > 0 ? [Number(x)] : []
);
const NUM_PAGES = args.length >= 1 ? args[0] : 250;
const NUM_PAGES_TO_SHOW = args.length >= 2 ? args[1] : 40;
const MAX_CONCURRENT = 20;
const MAX_CONCURRENT_WAIT = 250;
const MAX_CONCURRENT_MAX_WAIT = 5000;
const FETCH_TIMEOUT = 5000;
const LENGTH_COEFFICIENT = 0.25;
const PENALTY_TERMS = [
"Personal information</th>",
"Personal details</th>",
`Release date</th>`,
`Released</th>`,
`Birth date</th>`,
`Recorded</th>`,
`Written by</th>`,
`Directed by</th>`,
`Born</th>`,
`Died</th>`,
`Location</th>`,
`Career information</th>`,
`Population</th>`,
`Early life</span>`,
`>stub</a>`,
`Residence</th>`,
];
const PENALTY_MULTIPLIER = 0.5;
const BOOST_TERMS = [
"'s law",
];
const BOOST_MULTIPLIER = 1.5;
const MIN_LENGTH = 1500;
const MIN_LENGTH_PENALTY = 0.5;
const url = `https://en.wikipedia.org/wiki/Special:Random`;
type PageInfo = {
title: string;
url: string;
length: number;
links: string[];
penaltyTerms: string[];
boostTerms: string[];
};
type ScoredPageInfo = {
title: string;
url: string;
length: number;
highEditDistanceLinks: string[];
highEditDistanceLinkCount: number;
penaltyTerms: string[];
boostTerms: string[];
score: number;
};
const formatDecimal = (x: number) => x.toFixed(2);
const formatNumber = (x: number) => x.toFixed(0);
async function fetchWithTimeout(url: string, options: any) {
const controller = new AbortController();
const id = setTimeout(() => {
console.error("timeout");
controller.abort();
}, FETCH_TIMEOUT);
const response = await fetch(url, {
signal: controller.signal,
...options,
});
clearTimeout(id);
return response;
}
console.log("🌐 Fetching random pages from Wikipedia...");
const spinner = new Kia("...");
spinner.start();
let done = 0;
let concurrent = 0;
// load page and get its length
async function getPageInfo(url: string) {
let backOff = 1;
while (concurrent > MAX_CONCURRENT) {
await new Promise((resolve) =>
setTimeout(
resolve,
Math.min(MAX_CONCURRENT_MAX_WAIT, MAX_CONCURRENT_WAIT * backOff),
)
);
backOff *= 1 + (Math.random() * 0.5);
const str = `${
formatNumber((done / NUM_PAGES) * 100)
}% (${done}/${NUM_PAGES})`;
await spinner.set(str);
}
concurrent += 1;
try {
const result = await fetchWithTimeout(url, {
headers: {
"User-Agent":
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36",
},
});
const doc = await result.text()
.then((html) => {
const parser = new DOMParser();
return parser.parseFromString(html, "text/html");
});
if (doc == null) {
return null;
}
const allWpLinks = new Set(Array.from(
doc.querySelectorAll("a[href^='/wiki/']"),
(link: any) => {
return decodeURIComponent(
link.attributes.href.replace("/wiki/", "")
.replaceAll(
"_",
" ",
),
);
},
));
concurrent -= 1;
done += 1;
return {
length: doc.body.innerText.length * LENGTH_COEFFICIENT,
url: result.url,
title: doc.title.replace(" - Wikipedia", ""),
penaltyTerms: PENALTY_TERMS.filter((term: string) =>
doc.body.innerHTML.includes(term)
),
boostTerms: BOOST_TERMS.filter((term: string) =>
doc.body.innerHTML.includes(term)
),
links: [...allWpLinks] as string[],
};
} catch (err) {
console.error("Hit error.. ", err);
concurrent -= 1;
done += 1;
return null;
}
}
const resultsPromises: Promise<PageInfo | null>[] = [];
for (let i = 0; i < NUM_PAGES; i++) {
resultsPromises.push(getPageInfo(url));
}
const results = (await Promise.all(resultsPromises)).flatMap((x) =>
x ? [x] : []
);
// score results on most unique links
function editDistance(s: string, t: string) {
const m = s.length;
const n = t.length;
const d = new Array(m + 1);
for (let i = 0; i <= m; i++) {
d[i] = new Array(n + 1);
}
for (let i = 0; i <= m; i++) {
d[i][0] = i;
}
for (let j = 0; j <= n; j++) {
d[0][j] = j;
}
for (let j = 1; j <= n; j++) {
for (let i = 1; i <= m; i++) {
if (s[i - 1] === t[j - 1]) {
d[i][j] = d[i - 1][j - 1];
} else {
d[i][j] = Math.min(
d[i - 1][j] + 1,
d[i][j - 1] + 1,
d[i - 1][j - 1] + 1,
);
}
}
}
return d[m][n];
}
function getUniqueLinks(results: PageInfo[], page: PageInfo) {
const allOtherLinks = results.filter((x) => x.url !== page.url).flatMap((
x,
) => x.links);
return page.links.filter((pageLink) => !allOtherLinks.includes(pageLink));
}
function getHighEditDistanceLinks(links: string[]) {
return links.map((link) => {
const editDistanceToAllOtherPageLinks = links.filter((x) => x !== link)
.map(
(x) => editDistance(link, x),
);
const minEditDistance = Math.min(...editDistanceToAllOtherPageLinks);
return { link, minEditDistance, keep: minEditDistance > link.length / 2 };
}).filter((x) => x.keep);
}
/// SCORING ----------------------------------------
await spinner.succeed(
`Fetched ${results.length} random pages from Wikipedia`,
);
// await spinner.setSpinnerType('noise')
console.log("🤖 Processing results...");
await spinner.start("...");
const scoredResults: ScoredPageInfo[] = [];
done = 0;
for (const result of results) {
await new Promise((resolve) => {
setTimeout(() => {
const page = result;
const uniqueLinks = getUniqueLinks(results, page);
const highEditDistanceLinks = getHighEditDistanceLinks(uniqueLinks).sort(
(a, b) => b.minEditDistance - a.minEditDistance,
).map((x) => x.link);
scoredResults.push({
title: page.title,
url: page.url,
length: page.length,
boostTerms: page.boostTerms,
// uniqueLinks,
// uniqueLinkCount: uniqueLinks.length,
highEditDistanceLinks,
highEditDistanceLinkCount: highEditDistanceLinks.length,
penaltyTerms: page.penaltyTerms,
score: highEditDistanceLinks.length /
(page.length * LENGTH_COEFFICIENT) *
(page.penaltyTerms.length > 0 ? PENALTY_MULTIPLIER : 1.0) *
(page.length > MIN_LENGTH ? 1.0 : MIN_LENGTH_PENALTY) *
(page.boostTerms.length > 0 ? BOOST_MULTIPLIER : 1.0),
});
done += 1;
spinner.set(
`${
formatNumber((done / results.length) * 100)
}% (${done}/${results.length})`,
);
resolve(true);
}, Math.random() * 1);
});
}
scoredResults.sort((a, b) => b.score - a.score);
// DONE SCORING -----------------------------------
await spinner.succeed("Done processing results!");
scoredResults.slice(0, NUM_PAGES_TO_SHOW).sort((a, b) => a.score - b.score)
.forEach((page, i) => {
const index = Math.min(NUM_PAGES_TO_SHOW, scoredResults.length) - i;
if (index <= 10) {
console.log(
Colors.bold(Colors.green(`${index}. ${page.title}`)),
);
} else {
console.log(
Colors.bold(Colors.white(`${index}. ${page.title}`)),
);
}
console.log(Colors.gray(Colors.italic(`\t${page.url}`)));
console.log(Colors.gray(
`\t${
Colors.bold(`Score: ${formatDecimal(page.score)}`)
} [good links: ${page.highEditDistanceLinkCount}] [length: ${
formatNumber(page.length)
}] [penalty terms: ${page.penaltyTerms.slice(0, 1).join(", ")}${
page.penaltyTerms.length > 2 ? ", ..." : ""
}]`,
));
console.log(Colors.gray(
`\tRelated: ${page.highEditDistanceLinks.slice(0, 3).join(", ")}`,
));
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment