Created
December 31, 2021 23:48
-
-
Save cup-dev/2763be3fad3e7e19e016baeae6100206 to your computer and use it in GitHub Desktop.
wikime: interesting Wikipedia pages
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// --- usage: --- | |
// deno run --allow-net wikime.ts [NUM_PAGES] [NUM_RESULTS] | |
// arguments are optional | |
// --- about: --- | |
// finds random Wikipedia pages that are the most "interesting" | |
// where interesting means something like: has denser, more unique links compared to other pages | |
// MIT License | |
import * as Colors from "https://deno.land/std/fmt/colors.ts"; | |
import { DOMParser } from "https://deno.land/x/deno_dom/deno-dom-wasm.ts"; | |
import { parse } from "https://deno.land/std@0.113.0/flags/mod.ts"; | |
import Kia from "https://deno.land/x/kia@v0.1.0/mod.ts"; | |
const args = parse(Deno.args)._.flatMap((x: string | number) => | |
Number.isInteger(x) && Number(x) > 0 ? [Number(x)] : [] | |
); | |
const NUM_PAGES = args.length >= 1 ? args[0] : 250; | |
const NUM_PAGES_TO_SHOW = args.length >= 2 ? args[1] : 40; | |
const MAX_CONCURRENT = 20; | |
const MAX_CONCURRENT_WAIT = 250; | |
const MAX_CONCURRENT_MAX_WAIT = 5000; | |
const FETCH_TIMEOUT = 5000; | |
const LENGTH_COEFFICIENT = 0.25; | |
const PENALTY_TERMS = [ | |
"Personal information</th>", | |
"Personal details</th>", | |
`Release date</th>`, | |
`Released</th>`, | |
`Birth date</th>`, | |
`Recorded</th>`, | |
`Written by</th>`, | |
`Directed by</th>`, | |
`Born</th>`, | |
`Died</th>`, | |
`Location</th>`, | |
`Career information</th>`, | |
`Population</th>`, | |
`Early life</span>`, | |
`>stub</a>`, | |
`Residence</th>`, | |
]; | |
const PENALTY_MULTIPLIER = 0.5; | |
const BOOST_TERMS = [ | |
"'s law", | |
]; | |
const BOOST_MULTIPLIER = 1.5; | |
const MIN_LENGTH = 1500; | |
const MIN_LENGTH_PENALTY = 0.5; | |
const url = `https://en.wikipedia.org/wiki/Special:Random`; | |
type PageInfo = { | |
title: string; | |
url: string; | |
length: number; | |
links: string[]; | |
penaltyTerms: string[]; | |
boostTerms: string[]; | |
}; | |
type ScoredPageInfo = { | |
title: string; | |
url: string; | |
length: number; | |
highEditDistanceLinks: string[]; | |
highEditDistanceLinkCount: number; | |
penaltyTerms: string[]; | |
boostTerms: string[]; | |
score: number; | |
}; | |
const formatDecimal = (x: number) => x.toFixed(2); | |
const formatNumber = (x: number) => x.toFixed(0); | |
async function fetchWithTimeout(url: string, options: any) { | |
const controller = new AbortController(); | |
const id = setTimeout(() => { | |
console.error("timeout"); | |
controller.abort(); | |
}, FETCH_TIMEOUT); | |
const response = await fetch(url, { | |
signal: controller.signal, | |
...options, | |
}); | |
clearTimeout(id); | |
return response; | |
} | |
console.log("🌐 Fetching random pages from Wikipedia..."); | |
const spinner = new Kia("..."); | |
spinner.start(); | |
let done = 0; | |
let concurrent = 0; | |
// load page and get its length | |
async function getPageInfo(url: string) { | |
let backOff = 1; | |
while (concurrent > MAX_CONCURRENT) { | |
await new Promise((resolve) => | |
setTimeout( | |
resolve, | |
Math.min(MAX_CONCURRENT_MAX_WAIT, MAX_CONCURRENT_WAIT * backOff), | |
) | |
); | |
backOff *= 1 + (Math.random() * 0.5); | |
const str = `${ | |
formatNumber((done / NUM_PAGES) * 100) | |
}% (${done}/${NUM_PAGES})`; | |
await spinner.set(str); | |
} | |
concurrent += 1; | |
try { | |
const result = await fetchWithTimeout(url, { | |
headers: { | |
"User-Agent": | |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36", | |
}, | |
}); | |
const doc = await result.text() | |
.then((html) => { | |
const parser = new DOMParser(); | |
return parser.parseFromString(html, "text/html"); | |
}); | |
if (doc == null) { | |
return null; | |
} | |
const allWpLinks = new Set(Array.from( | |
doc.querySelectorAll("a[href^='/wiki/']"), | |
(link: any) => { | |
return decodeURIComponent( | |
link.attributes.href.replace("/wiki/", "") | |
.replaceAll( | |
"_", | |
" ", | |
), | |
); | |
}, | |
)); | |
concurrent -= 1; | |
done += 1; | |
return { | |
length: doc.body.innerText.length * LENGTH_COEFFICIENT, | |
url: result.url, | |
title: doc.title.replace(" - Wikipedia", ""), | |
penaltyTerms: PENALTY_TERMS.filter((term: string) => | |
doc.body.innerHTML.includes(term) | |
), | |
boostTerms: BOOST_TERMS.filter((term: string) => | |
doc.body.innerHTML.includes(term) | |
), | |
links: [...allWpLinks] as string[], | |
}; | |
} catch (err) { | |
console.error("Hit error.. ", err); | |
concurrent -= 1; | |
done += 1; | |
return null; | |
} | |
} | |
const resultsPromises: Promise<PageInfo | null>[] = []; | |
for (let i = 0; i < NUM_PAGES; i++) { | |
resultsPromises.push(getPageInfo(url)); | |
} | |
const results = (await Promise.all(resultsPromises)).flatMap((x) => | |
x ? [x] : [] | |
); | |
// score results on most unique links | |
function editDistance(s: string, t: string) { | |
const m = s.length; | |
const n = t.length; | |
const d = new Array(m + 1); | |
for (let i = 0; i <= m; i++) { | |
d[i] = new Array(n + 1); | |
} | |
for (let i = 0; i <= m; i++) { | |
d[i][0] = i; | |
} | |
for (let j = 0; j <= n; j++) { | |
d[0][j] = j; | |
} | |
for (let j = 1; j <= n; j++) { | |
for (let i = 1; i <= m; i++) { | |
if (s[i - 1] === t[j - 1]) { | |
d[i][j] = d[i - 1][j - 1]; | |
} else { | |
d[i][j] = Math.min( | |
d[i - 1][j] + 1, | |
d[i][j - 1] + 1, | |
d[i - 1][j - 1] + 1, | |
); | |
} | |
} | |
} | |
return d[m][n]; | |
} | |
function getUniqueLinks(results: PageInfo[], page: PageInfo) { | |
const allOtherLinks = results.filter((x) => x.url !== page.url).flatMap(( | |
x, | |
) => x.links); | |
return page.links.filter((pageLink) => !allOtherLinks.includes(pageLink)); | |
} | |
function getHighEditDistanceLinks(links: string[]) { | |
return links.map((link) => { | |
const editDistanceToAllOtherPageLinks = links.filter((x) => x !== link) | |
.map( | |
(x) => editDistance(link, x), | |
); | |
const minEditDistance = Math.min(...editDistanceToAllOtherPageLinks); | |
return { link, minEditDistance, keep: minEditDistance > link.length / 2 }; | |
}).filter((x) => x.keep); | |
} | |
/// SCORING ---------------------------------------- | |
await spinner.succeed( | |
`Fetched ${results.length} random pages from Wikipedia`, | |
); | |
// await spinner.setSpinnerType('noise') | |
console.log("🤖 Processing results..."); | |
await spinner.start("..."); | |
const scoredResults: ScoredPageInfo[] = []; | |
done = 0; | |
for (const result of results) { | |
await new Promise((resolve) => { | |
setTimeout(() => { | |
const page = result; | |
const uniqueLinks = getUniqueLinks(results, page); | |
const highEditDistanceLinks = getHighEditDistanceLinks(uniqueLinks).sort( | |
(a, b) => b.minEditDistance - a.minEditDistance, | |
).map((x) => x.link); | |
scoredResults.push({ | |
title: page.title, | |
url: page.url, | |
length: page.length, | |
boostTerms: page.boostTerms, | |
// uniqueLinks, | |
// uniqueLinkCount: uniqueLinks.length, | |
highEditDistanceLinks, | |
highEditDistanceLinkCount: highEditDistanceLinks.length, | |
penaltyTerms: page.penaltyTerms, | |
score: highEditDistanceLinks.length / | |
(page.length * LENGTH_COEFFICIENT) * | |
(page.penaltyTerms.length > 0 ? PENALTY_MULTIPLIER : 1.0) * | |
(page.length > MIN_LENGTH ? 1.0 : MIN_LENGTH_PENALTY) * | |
(page.boostTerms.length > 0 ? BOOST_MULTIPLIER : 1.0), | |
}); | |
done += 1; | |
spinner.set( | |
`${ | |
formatNumber((done / results.length) * 100) | |
}% (${done}/${results.length})`, | |
); | |
resolve(true); | |
}, Math.random() * 1); | |
}); | |
} | |
scoredResults.sort((a, b) => b.score - a.score); | |
// DONE SCORING ----------------------------------- | |
await spinner.succeed("Done processing results!"); | |
scoredResults.slice(0, NUM_PAGES_TO_SHOW).sort((a, b) => a.score - b.score) | |
.forEach((page, i) => { | |
const index = Math.min(NUM_PAGES_TO_SHOW, scoredResults.length) - i; | |
if (index <= 10) { | |
console.log( | |
Colors.bold(Colors.green(`${index}. ${page.title}`)), | |
); | |
} else { | |
console.log( | |
Colors.bold(Colors.white(`${index}. ${page.title}`)), | |
); | |
} | |
console.log(Colors.gray(Colors.italic(`\t${page.url}`))); | |
console.log(Colors.gray( | |
`\t${ | |
Colors.bold(`Score: ${formatDecimal(page.score)}`) | |
} [good links: ${page.highEditDistanceLinkCount}] [length: ${ | |
formatNumber(page.length) | |
}] [penalty terms: ${page.penaltyTerms.slice(0, 1).join(", ")}${ | |
page.penaltyTerms.length > 2 ? ", ..." : "" | |
}]`, | |
)); | |
console.log(Colors.gray( | |
`\tRelated: ${page.highEditDistanceLinks.slice(0, 3).join(", ")}`, | |
)); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment