Skip to content

Instantly share code, notes, and snippets.

@ChlodAlejandro
Last active April 15, 2023 13:58
Show Gist options
  • Save ChlodAlejandro/4b980db4fafc0be6fdd9f66a8659cc14 to your computer and use it in GitHub Desktop.
Save ChlodAlejandro/4b980db4fafc0be6fdd9f66a8659cc14 to your computer and use it in GitHub Desktop.
Scan for contentful Wikipedia edits per minute
import WikimediaStream from "wikimedia-streams";
import axios from 'axios';
import * as fs from "fs/promises";
import { MediaWikiRecentChangeEditEvent } from "wikimedia-streams/build/src/streams/MediaWikiRecentChangeEvent";
import { diffSentences } from "diff";
// Path of the JSON file that the per-minute tally is written to.
const outputFile = "data.json";
// Running tally of qualifying edits, keyed by a minute-resolution ISO timestamp
// (e.g. "2023-04-07T01:49Z"); serialized to `outputFile` whenever it changes.
const outputMap: Record<string, number> = {};
(async () => {
// Load the dictionary used to decide whether a token is a "real" word.
// Word list source: https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
const rawWordList = (await fs.readFile("words.txt")).toString("utf8");
const dictionaryEntries: [string, true][] = [];
for (const line of rawWordList.split(/\r?\n/)) {
    // Skip entries made up only of digits/symbols; keys are stored lower-cased.
    if (/[a-z]/i.test(line)) {
        dictionaryEntries.push([line.toLowerCase(), true]);
    }
}
const words: Record<string, true> = Object.fromEntries(dictionaryEntries);
const loadedWordCount = Object.keys(words).length;
console.log("loaded " + loadedWordCount + " words");
// Subscribe to the `recentchange` stream and inspect every enwiki edit in
// namespaces 0 and 118 that grew the page by more than 500 bytes.
// NOTE(review): presumably `.all(...)` requires all listed properties to match
// and each `.any(...)` adds an alternative - confirm against wikimedia-streams.
const stream = new WikimediaStream('recentchange')
    .filter('recentchange')
    .all({ wiki: 'enwiki', type: 'edit' })
    .any({ namespace: 0 })
    .any({ namespace: 118 })
    .addListener(async (rc) => {
        const byteDiff = rc.length.new - rc.length.old;
        const revid = rc.revision.new;
        if ( byteDiff > 500 ) {
            console.log(`HIT [Special:Diff/${revid}]: +${byteDiff} on "${rc.title}" by ${rc.user}`);
            try {
                if (await checks(rc)) {
                    // Await the write so that a failure is caught below instead of
                    // surfacing as an unhandled rejection.
                    await bumpCount();
                }
            } catch (e: unknown) {
                // A failed API request or file write must not take down the
                // long-running listener; log it and move on to the next event.
                console.error(`ERR [Special:Diff/${revid}]: check failed`, e);
            }
        }
    });
/**
 * Adds one to the counter for the current minute and rewrites the output
 * file with the complete tally.
 *
 * @returns A promise that settles once the tally has been written.
 */
async function bumpCount() {
    // Drop seconds and milliseconds from the ISO timestamp:
    // "2023-04-07T01:49:12.345Z" -> "2023-04-07T01:49Z"
    const minuteKey = new Date().toISOString().replace(/:\d\d\.\d\d\dZ/, "Z");
    const previous = outputMap[minuteKey];
    outputMap[minuteKey] = previous ? previous + 1 : 1;
    const serialized = JSON.stringify(outputMap);
    await fs.writeFile(outputFile, serialized);
}
/**
 * Decides whether an edit added enough dictionary-word text to be counted.
 *
 * Fetches the wikitext of the old and new revisions from the English
 * Wikipedia API, diffs them sentence by sentence, strips common markup from
 * the added text, and counts the tokens whose first run of word characters
 * is a key of `words`.
 *
 * @param diff The recent-change edit event whose old/new revisions are compared.
 * @returns `true` when more than 60 dictionary-word tokens were added; `false`
 *   otherwise, including when the revision text could not be retrieved.
 */
async function checks(diff: MediaWikiRecentChangeEditEvent): Promise<boolean> {
    const revid = diff.revision.new;
    const response = await axios("https://en.wikipedia.org/w/api.php", {
        params: {
            'format': 'json',
            'formatversion': '2',
            'action': 'query',
            'prop': 'revisions',
            'revids': [diff.revision.old, diff.revision.new].join("|"),
            'rvprop': 'ids|content',
            'rvslots': 'main'
        },
        responseType: 'json'
    });
    const data = response.data;
    if ( !data?.query?.pages ) {
        // `data` itself may be nullish here, so keep the access optional.
        console.error("Cannot work on data.", data?.query);
        return false;
    }

    type Revision = { revid: number; slots?: { main?: { content?: string } } };
    // Flatten the revisions of every returned page. A page entry without a
    // `revisions` list contributes nothing instead of an `undefined` element
    // that would make the lookups below throw.
    const revs: Revision[] = data.query.pages.flatMap(
        (page: { revisions?: Revision[] }) => page.revisions ?? []
    );
    const oldContent = revs.find((r) => r.revid === diff.revision.old)?.slots?.main?.content;
    const newContent = revs.find((r) => r.revid === diff.revision.new)?.slots?.main?.content;
    // Compare by type rather than truthiness: an empty string is valid content
    // (a blanked page) and must not be reported as a missing revision.
    if ( typeof oldContent !== "string" || typeof newContent !== "string" ) {
        console.warn("old/new revision not found", revs);
        return false;
    }

    const diffed = diffSentences(oldContent, newContent);
    const added = diffed.filter(v => v.added).map(v => v.value.trim()).join(" ");
    const cleaned = added
        // Strip self-closing Wikipedia reference tags
        .replace(/<ref[^>]*\/>/g, "")
        // Strip Wikipedia reference tags together with their content
        .replace(/<ref[^>]*>[\S\s]+?<\/ref>/g, "")
        // <small>
        .replace(/<\/?small>/g, "")
        // External links section and everything after it
        .replace(/==\s*External links\s*==[\S\s]*/i, "")
        // Strip HTML comments
        .replace(/<!--[\S\s]+?-->/g, "")
        // Strip common HTML attributes
        .replace(/(style|class|colspan|rowspan|width|alt|name)="[^"]+"/g, "")
        // Section headings
        .replace(/==([^\n]+)==/g, "$1:")
        // and
        .replace(/\s+&\s+/g, " and ")
        // [[Piped|Wikilinks]] (and pipe trick). This must run before the plain
        // wikilink rule, whose pattern also matches piped links and would
        // otherwise consume them first.
        .replace(/\[\[([^\]\|]+)\|([^\]]+)?\]\]/g, "$2 ($1)")
        // [[Wikilinks]]
        .replace(/\[\[([^\]]+)\]\]/g, "$1")
        // Bold, italics, etc.
        .replace(/'''''([^']+)'''''/g, "$1")
        .replace(/'''([^']+)'''/g, "$1")
        .replace(/''([^']+)''/g, "$1");

    // Get just the wordy text: an optional opening punctuation mark, a run of
    // word characters / apostrophes / hyphens / dashes, and up to three trailing
    // punctuation marks. The hyphen is escaped so that it is a literal rather
    // than the start of a range spanning U+0027 through U+2014.
    const tokenPattern = /(?:["\(\[\<$])?[\w'\-—–]+(?:[%!",.\>\]\)\/]){0,3}/g;
    const wordy: string[] = Array.from(cleaned.matchAll(tokenPattern))
        .map((match) => match[0])
        .filter((token) => {
            const firstWord = /\w+/.exec(token)?.[0].toLowerCase();
            // `=== true` ignores members inherited from Object.prototype
            // (e.g. "__proto__"), which a plain truthiness test would accept.
            return firstWord !== undefined && words[firstWord] === true;
        });

    if (wordy.length > 60) {
        console.log(`HIT [Special:Diff/${revid}]: Pre-check HIT, ${wordy.length} words`);
        return true;
    }
    return false;
}
})();
{
"2023-04-07T01:49Z": 3,
"2023-04-07T01:50Z": 3,
"2023-04-07T01:51Z": 4,
"2023-04-07T01:52Z": 2,
"2023-04-07T01:53Z": 2,
"2023-04-07T01:54Z": 2,
"2023-04-07T01:55Z": 2,
"2023-04-07T01:56Z": 2,
"2023-04-07T01:57Z": 6,
"2023-04-07T01:58Z": 2,
"2023-04-07T01:59Z": 3,
"2023-04-07T02:00Z": 2,
"2023-04-07T02:01Z": 3,
"2023-04-07T02:02Z": 1,
"2023-04-07T02:03Z": 5,
"2023-04-07T02:04Z": 2,
"2023-04-07T02:05Z": 3,
"2023-04-07T02:06Z": 3,
"2023-04-07T02:07Z": 3,
"2023-04-07T02:08Z": 1,
"2023-04-07T02:09Z": 5,
"2023-04-07T02:10Z": 1,
"2023-04-07T02:11Z": 2,
"2023-04-07T02:12Z": 5,
"2023-04-07T02:13Z": 1,
"2023-04-07T02:14Z": 2,
"2023-04-07T02:15Z": 5,
"2023-04-07T02:16Z": 4,
"2023-04-07T02:17Z": 2,
"2023-04-07T02:18Z": 2,
"2023-04-07T02:19Z": 4,
"2023-04-07T02:20Z": 2,
"2023-04-07T02:21Z": 1,
"2023-04-07T02:22Z": 6,
"2023-04-07T02:23Z": 1,
"2023-04-07T02:24Z": 3,
"2023-04-07T02:25Z": 4,
"2023-04-07T02:26Z": 5,
"2023-04-07T02:27Z": 4,
"2023-04-07T02:28Z": 5,
"2023-04-07T02:29Z": 6,
"2023-04-07T02:30Z": 2,
"2023-04-07T02:31Z": 2,
"2023-04-07T02:32Z": 7,
"2023-04-07T02:34Z": 5,
"2023-04-07T02:35Z": 1,
"2023-04-07T02:36Z": 4,
"2023-04-07T02:37Z": 2,
"2023-04-07T02:38Z": 1,
"2023-04-07T02:39Z": 3,
"2023-04-07T02:40Z": 3,
"2023-04-07T02:41Z": 4,
"2023-04-07T02:42Z": 4,
"2023-04-07T02:44Z": 1,
"2023-04-07T02:45Z": 3,
"2023-04-07T02:46Z": 1,
"2023-04-07T02:47Z": 4,
"2023-04-07T02:48Z": 1,
"2023-04-07T02:49Z": 4,
"2023-04-07T02:50Z": 2,
"2023-04-07T02:51Z": 6,
"2023-04-07T02:52Z": 3,
"2023-04-07T02:53Z": 6,
"2023-04-07T02:54Z": 2,
"2023-04-07T02:55Z": 3,
"2023-04-07T02:56Z": 5,
"2023-04-07T02:57Z": 2,
"2023-04-07T02:58Z": 1,
"2023-04-07T02:59Z": 5,
"2023-04-07T03:00Z": 3,
"2023-04-07T03:01Z": 1,
"2023-04-07T03:02Z": 6,
"2023-04-07T03:03Z": 3,
"2023-04-07T03:04Z": 3,
"2023-04-07T03:05Z": 4,
"2023-04-07T03:06Z": 5,
"2023-04-07T03:07Z": 4,
"2023-04-07T03:08Z": 3,
"2023-04-07T03:09Z": 4,
"2023-04-07T03:10Z": 2,
"2023-04-07T03:11Z": 4,
"2023-04-07T03:12Z": 6,
"2023-04-07T03:13Z": 1,
"2023-04-07T03:14Z": 3,
"2023-04-07T03:15Z": 1,
"2023-04-07T03:16Z": 1,
"2023-04-07T03:17Z": 4,
"2023-04-07T03:18Z": 4,
"2023-04-07T03:19Z": 3,
"2023-04-07T03:20Z": 4,
"2023-04-07T03:21Z": 6,
"2023-04-07T03:22Z": 2,
"2023-04-07T03:23Z": 5,
"2023-04-07T03:24Z": 4,
"2023-04-07T03:25Z": 2,
"2023-04-07T03:26Z": 4,
"2023-04-07T03:27Z": 4,
"2023-04-07T03:28Z": 2,
"2023-04-07T03:29Z": 5,
"2023-04-07T03:30Z": 3,
"2023-04-07T03:31Z": 2,
"2023-04-07T03:32Z": 1,
"2023-04-07T03:33Z": 3,
"2023-04-07T03:34Z": 4,
"2023-04-07T03:35Z": 6,
"2023-04-07T03:36Z": 3,
"2023-04-07T03:38Z": 3,
"2023-04-07T03:39Z": 6,
"2023-04-07T03:40Z": 6,
"2023-04-07T03:41Z": 2,
"2023-04-07T03:42Z": 2,
"2023-04-07T03:43Z": 5,
"2023-04-07T03:44Z": 6,
"2023-04-07T03:45Z": 3,
"2023-04-07T03:46Z": 2,
"2023-04-07T03:47Z": 7,
"2023-04-07T03:48Z": 8,
"2023-04-07T03:49Z": 3,
"2023-04-07T03:50Z": 5,
"2023-04-07T03:52Z": 4,
"2023-04-07T03:53Z": 6,
"2023-04-07T03:54Z": 1,
"2023-04-07T03:55Z": 6,
"2023-04-07T03:56Z": 4,
"2023-04-07T03:57Z": 2,
"2023-04-07T03:58Z": 2,
"2023-04-07T03:59Z": 4,
"2023-04-07T04:00Z": 3,
"2023-04-07T04:01Z": 3,
"2023-04-07T04:02Z": 5,
"2023-04-07T04:03Z": 3,
"2023-04-07T04:04Z": 6,
"2023-04-07T04:05Z": 4,
"2023-04-07T04:06Z": 4,
"2023-04-07T04:07Z": 2,
"2023-04-07T04:08Z": 3,
"2023-04-07T04:09Z": 3,
"2023-04-07T04:10Z": 6,
"2023-04-07T04:11Z": 4,
"2023-04-07T04:12Z": 5,
"2023-04-07T04:13Z": 2,
"2023-04-07T04:14Z": 3,
"2023-04-07T04:15Z": 4,
"2023-04-07T04:16Z": 5,
"2023-04-07T04:17Z": 7,
"2023-04-07T04:18Z": 2,
"2023-04-07T04:19Z": 3,
"2023-04-07T04:20Z": 4,
"2023-04-07T04:21Z": 1,
"2023-04-07T04:22Z": 3,
"2023-04-07T04:23Z": 7,
"2023-04-07T04:24Z": 2,
"2023-04-07T04:25Z": 3,
"2023-04-07T04:26Z": 3,
"2023-04-07T04:28Z": 5,
"2023-04-07T04:29Z": 1,
"2023-04-07T04:30Z": 7,
"2023-04-07T04:32Z": 2,
"2023-04-07T04:33Z": 6,
"2023-04-07T04:34Z": 5,
"2023-04-07T04:36Z": 1,
"2023-04-07T04:37Z": 1,
"2023-04-07T04:38Z": 7,
"2023-04-07T04:39Z": 2,
"2023-04-07T04:40Z": 2,
"2023-04-07T04:41Z": 2,
"2023-04-07T04:42Z": 3,
"2023-04-07T04:43Z": 2,
"2023-04-07T04:44Z": 3,
"2023-04-07T04:45Z": 1,
"2023-04-07T04:46Z": 2,
"2023-04-07T04:47Z": 4,
"2023-04-07T04:48Z": 3,
"2023-04-07T04:49Z": 3,
"2023-04-07T04:50Z": 1,
"2023-04-07T04:51Z": 1,
"2023-04-07T04:52Z": 3,
"2023-04-07T04:53Z": 5,
"2023-04-07T04:54Z": 2,
"2023-04-07T04:55Z": 3,
"2023-04-07T04:56Z": 2,
"2023-04-07T04:57Z": 3,
"2023-04-07T04:58Z": 2,
"2023-04-07T04:59Z": 6,
"2023-04-07T05:00Z": 2,
"2023-04-07T05:01Z": 3,
"2023-04-07T05:02Z": 1,
"2023-04-07T05:03Z": 3,
"2023-04-07T05:04Z": 5,
"2023-04-07T05:05Z": 3,
"2023-04-07T05:06Z": 2,
"2023-04-07T05:07Z": 6,
"2023-04-07T05:08Z": 8,
"2023-04-07T05:09Z": 4,
"2023-04-07T05:10Z": 7,
"2023-04-07T05:11Z": 5,
"2023-04-07T05:12Z": 2,
"2023-04-07T05:13Z": 2,
"2023-04-07T05:14Z": 4,
"2023-04-07T05:15Z": 4,
"2023-04-07T05:16Z": 3,
"2023-04-07T05:17Z": 4,
"2023-04-07T05:18Z": 2,
"2023-04-07T05:19Z": 6,
"2023-04-07T05:20Z": 3,
"2023-04-07T05:21Z": 7,
"2023-04-07T05:22Z": 2,
"2023-04-07T05:23Z": 2,
"2023-04-07T05:24Z": 3,
"2023-04-07T05:25Z": 4,
"2023-04-07T05:26Z": 2,
"2023-04-07T05:27Z": 3,
"2023-04-07T05:28Z": 2,
"2023-04-07T05:29Z": 4,
"2023-04-07T05:30Z": 1,
"2023-04-07T05:31Z": 2,
"2023-04-07T05:32Z": 3,
"2023-04-07T05:33Z": 1,
"2023-04-07T05:34Z": 5,
"2023-04-07T05:35Z": 5,
"2023-04-07T05:36Z": 2,
"2023-04-07T05:37Z": 3,
"2023-04-07T05:38Z": 2,
"2023-04-07T05:39Z": 2,
"2023-04-07T05:40Z": 1,
"2023-04-07T05:41Z": 3,
"2023-04-07T05:42Z": 1,
"2023-04-07T05:43Z": 2,
"2023-04-07T05:44Z": 2,
"2023-04-07T05:45Z": 2,
"2023-04-07T05:46Z": 4,
"2023-04-07T05:47Z": 3,
"2023-04-07T05:48Z": 2,
"2023-04-07T05:49Z": 4,
"2023-04-07T05:50Z": 2,
"2023-04-07T05:51Z": 3,
"2023-04-07T05:52Z": 2,
"2023-04-07T05:53Z": 3,
"2023-04-07T05:54Z": 1,
"2023-04-07T05:55Z": 5,
"2023-04-07T05:56Z": 1,
"2023-04-07T05:57Z": 3,
"2023-04-07T05:58Z": 4,
"2023-04-07T05:59Z": 2,
"2023-04-07T06:00Z": 1,
"2023-04-07T06:01Z": 4,
"2023-04-07T06:03Z": 2,
"2023-04-07T06:04Z": 5,
"2023-04-07T06:05Z": 3,
"2023-04-07T06:06Z": 5,
"2023-04-07T06:07Z": 3,
"2023-04-07T06:08Z": 2,
"2023-04-07T06:09Z": 4,
"2023-04-07T06:10Z": 4,
"2023-04-07T06:11Z": 3,
"2023-04-07T06:12Z": 2,
"2023-04-07T06:13Z": 3,
"2023-04-07T06:14Z": 4,
"2023-04-07T06:15Z": 1,
"2023-04-07T06:16Z": 3,
"2023-04-07T06:17Z": 2,
"2023-04-07T06:18Z": 1,
"2023-04-07T06:19Z": 3,
"2023-04-07T06:20Z": 4,
"2023-04-07T06:21Z": 3,
"2023-04-07T06:22Z": 2,
"2023-04-07T06:23Z": 4,
"2023-04-07T06:24Z": 5,
"2023-04-07T06:26Z": 1,
"2023-04-07T06:27Z": 2,
"2023-04-07T06:28Z": 1,
"2023-04-07T06:29Z": 4,
"2023-04-07T06:30Z": 3,
"2023-04-07T06:31Z": 2,
"2023-04-07T06:32Z": 3,
"2023-04-07T06:33Z": 1,
"2023-04-07T06:34Z": 4,
"2023-04-07T06:35Z": 1,
"2023-04-07T06:36Z": 1,
"2023-04-07T06:37Z": 4,
"2023-04-07T06:38Z": 1,
"2023-04-07T06:39Z": 2,
"2023-04-07T06:40Z": 5,
"2023-04-07T06:41Z": 2,
"2023-04-07T06:42Z": 6,
"2023-04-07T06:43Z": 2,
"2023-04-07T06:44Z": 1,
"2023-04-07T06:45Z": 2,
"2023-04-07T06:46Z": 2,
"2023-04-07T06:47Z": 1,
"2023-04-07T06:48Z": 3,
"2023-04-07T06:49Z": 3,
"2023-04-07T06:50Z": 5,
"2023-04-07T06:51Z": 6,
"2023-04-07T06:52Z": 1,
"2023-04-07T06:53Z": 3,
"2023-04-07T06:54Z": 3,
"2023-04-07T06:55Z": 3,
"2023-04-07T06:56Z": 5,
"2023-04-07T06:57Z": 3,
"2023-04-07T06:58Z": 1,
"2023-04-07T06:59Z": 5,
"2023-04-07T07:00Z": 4,
"2023-04-07T07:01Z": 2,
"2023-04-07T07:02Z": 3,
"2023-04-07T07:03Z": 2,
"2023-04-07T07:04Z": 3,
"2023-04-07T07:05Z": 2,
"2023-04-07T07:06Z": 5,
"2023-04-07T07:07Z": 1,
"2023-04-07T07:08Z": 5,
"2023-04-07T07:09Z": 3,
"2023-04-07T07:10Z": 2,
"2023-04-07T07:11Z": 2,
"2023-04-07T07:12Z": 2,
"2023-04-07T07:13Z": 3,
"2023-04-07T07:14Z": 3,
"2023-04-07T07:15Z": 3,
"2023-04-07T07:16Z": 2,
"2023-04-07T07:17Z": 4,
"2023-04-07T07:18Z": 3,
"2023-04-07T07:19Z": 1,
"2023-04-07T07:20Z": 5,
"2023-04-07T07:21Z": 3,
"2023-04-07T07:22Z": 2,
"2023-04-07T07:23Z": 4,
"2023-04-07T07:24Z": 1,
"2023-04-07T07:25Z": 4,
"2023-04-07T07:26Z": 2,
"2023-04-07T07:27Z": 1,
"2023-04-07T07:28Z": 4,
"2023-04-07T07:30Z": 7,
"2023-04-07T07:31Z": 4,
"2023-04-07T07:32Z": 3,
"2023-04-07T07:33Z": 5,
"2023-04-07T07:34Z": 1,
"2023-04-07T07:35Z": 3,
"2023-04-07T07:36Z": 2,
"2023-04-07T07:37Z": 3,
"2023-04-07T07:38Z": 2,
"2023-04-07T07:39Z": 2,
"2023-04-07T07:40Z": 4,
"2023-04-07T07:41Z": 2,
"2023-04-07T07:43Z": 3,
"2023-04-07T07:44Z": 1,
"2023-04-07T07:45Z": 2,
"2023-04-07T07:46Z": 1,
"2023-04-07T07:47Z": 1,
"2023-04-07T07:48Z": 1,
"2023-04-07T07:49Z": 2,
"2023-04-07T07:50Z": 3,
"2023-04-07T07:51Z": 4,
"2023-04-07T07:52Z": 4,
"2023-04-07T07:53Z": 2,
"2023-04-07T07:54Z": 2,
"2023-04-07T07:55Z": 4,
"2023-04-07T07:56Z": 4,
"2023-04-07T07:57Z": 1,
"2023-04-07T07:58Z": 1,
"2023-04-07T07:59Z": 2,
"2023-04-07T08:00Z": 2,
"2023-04-07T08:01Z": 1,
"2023-04-07T08:02Z": 1,
"2023-04-07T08:03Z": 2,
"2023-04-07T08:04Z": 3,
"2023-04-07T08:05Z": 1,
"2023-04-07T08:06Z": 2,
"2023-04-07T08:07Z": 1,
"2023-04-07T08:08Z": 2,
"2023-04-07T08:09Z": 2,
"2023-04-07T08:10Z": 6,
"2023-04-07T08:11Z": 7,
"2023-04-07T08:12Z": 3,
"2023-04-07T08:13Z": 4,
"2023-04-07T08:14Z": 1,
"2023-04-07T08:15Z": 1,
"2023-04-07T08:16Z": 3,
"2023-04-07T08:17Z": 4,
"2023-04-07T08:18Z": 3,
"2023-04-07T08:19Z": 3,
"2023-04-07T08:20Z": 4,
"2023-04-07T08:21Z": 3,
"2023-04-07T08:22Z": 3,
"2023-04-07T08:23Z": 2,
"2023-04-07T08:25Z": 4,
"2023-04-07T08:26Z": 6,
"2023-04-07T08:28Z": 2,
"2023-04-07T08:29Z": 1,
"2023-04-07T08:30Z": 1
}
{
"dependencies": {
"axios": "^1.3.5",
"diff": "^5.1.0",
"wikimedia-streams": "^1.0.0-8b04cf0"
},
"devDependencies": {
"@types/diff": "^5.0.3",
"ts-node": "^10.9.1",
"typescript": "^5.0.3"
}
}
@ChlodAlejandro
Copy link
Author

ChlodAlejandro commented Apr 15, 2023

Summarizing the data:

> a = { ... }
> start = Object.keys(a)[0]; end = Object.keys(a)[Object.keys(a).length - 1];
> minutes = (new Date(end).getTime() - new Date(start).getTime()) / 1000 / 60
401
> checksPerDay = ((24 * 60) / minutes) * Object.values(a).reduce((p,n) => p + n)
4298.453865336659

The data spans 401 minutes in total, 389 of which had at least one edit that passed the check. Extrapolated to a full day, that is an estimated 4,298 edits to be checked per day, or 128,940 checks for an entire 30-day month.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment