Skip to content

Instantly share code, notes, and snippets.

@ChlodAlejandro
Last active August 11, 2023 17:32
Show Gist options
  • Save ChlodAlejandro/fc659b3863cb2e2bfaf59c076d87ae31 to your computer and use it in GitHub Desktop.
Save ChlodAlejandro/fc659b3863cb2e2bfaf59c076d87ae31 to your computer and use it in GitHub Desktop.
Calculates open CopyPatrol cases which can't be actioned (page is deleted or content is already revision-deleted).
/*!
Calculates open CopyPatrol cases which can't be actioned (page is
deleted or content is already revision-deleted).
Run the following commands beforehand:
```sh
npm install knex mysql2 dotenv
# Replace <shell> with your Toolforge username.
ssh -L 4711:enwiki.analytics.db.svc.eqiad.wmflabs:3306 -L 4712:tools.db.svc.eqiad.wmflabs:3306 <shell>@dev.toolforge.org -Nv
```
Then make an `.env` file in the same folder as this script with the following
content:
```ini
DB_HOST = localhost
DB_PORT = 4712
DB_USER = <FILLME>
DB_PASS = <FILLME>
DB_NAME_COPYPATROL = s51306__copyright_p
DB_REPLICA_HOST = localhost
DB_REPLICA_PORT = 4711
DB_REPLICA_USER = <FILLME>
DB_REPLICA_PASS = <FILLME>
CPS_PROJECT = wikipedia
CPS_LANG = en
DB_REPLICA_NAME = enwiki_p
```
Replace <FILLME> with the appropriate username/password found in your
Toolforge `~/replica.my.cnf` file.
You can then run `node copypatrol-open-count.js` from another terminal.
(Keep the terminal running SSH open!)
@copyright Chlod Alejandro <chlod@chlod.net>
@license MIT
*/
const knex = require("knex");
const fs = require("fs/promises");
require("dotenv").config();
( async() => {
const BATCH_SIZE = 250;
console.log("Opening fs streams...");
const filePrefix = new Date().toISOString().replace(/[:-TZ]/g, "");
const outRevids = await fs.open(filePrefix + "-revids.txt", "w");
const outRows = await fs.open(filePrefix + "-revids.jsonl", "w");
console.log("Connecting...");
console.log(":: copypatrol...");
const cpdb = knex({
client: "mysql2",
connection: {
host: process.env.DB_HOST,
port: process.env.DB_PORT,
user: process.env.DB_USER,
password: process.env.DB_PASS,
database: process.env.DB_NAME_COPYPATROL
},
pool: { min: 0, idleTimeoutMillis: 5e3 }
});
console.log(":: " + process.env.DB_REPLICA_NAME + "...");
const endb = knex({
client: "mysql2",
connection: {
host: process.env.DB_REPLICA_HOST,
port: process.env.DB_REPLICA_PORT,
user: process.env.DB_REPLICA_USER,
password: process.env.DB_REPLICA_PASS,
database: process.env.DB_REPLICA_NAME
},
pool: { min: 0, idleTimeoutMillis: 5e3 }
});
console.log("Querying...");
const getOpenCases = cpdb.select()
.from('copyright_diffs')
.where('project', process.env.CPS_PROJECT)
.where('lang', process.env.CPS_LANG)
.whereNull('status')
// https://phabricator.wikimedia.org/T138317
.where('diff_timestamp', '>', 20160620000000)
.orderBy('id');
console.log(":::: " + getOpenCases.toString());
console.log(":: Getting open case count totals...");
const countQ = getOpenCases.clone()
.count("*", { as: 'count' })
.first();
console.log(":::: " + countQ.toString());
const count = await countQ.then((row) => row.count);
console.log(":: Found " + count + " open cases.");
console.log(":: Getting draft case count totals...");
const draftCountQ = getOpenCases.clone()
.whereNot('page_ns', 0)
.count("*", { as: 'count' })
.first();
console.log(":::: " + draftCountQ.toString());
const draftCount = await draftCountQ.then((row) => row.count);
console.log(":: " + draftCount + " are drafts.");
let done = 0;
let rd = 0;
let missing = 0;
let processed = new Set();
const writePromises = [];
while (true) {
const dataQ = getOpenCases.clone()
.offset(done)
.limit(BATCH_SIZE);
console.log(":::: " + dataQ.toString());
const data = await dataQ
.then(async (rows) => {
console.log(":::: found " + rows.length + " rows");
done += rows.length;
if (rows.length === 0)
return null;
const indexed = Object.fromEntries(rows.map(
(row) => [row.diff, row]
));
for (const key of Object.keys(indexed)) {
if (processed.has(key)) {
console.error(":::: duplicate revision: " + key);
}
processed.add(key);
}
const revids = rows.map((row) => row.diff);
const endbQ = endb
.select("rev_id", "rev_deleted")
.from("revision")
.whereIn("rev_id", revids);
// big.
// console.log(":::: " + endbQ.toString());
return endbQ
.then((foundRows) => {
const frIndex = Object.fromEntries(foundRows.map(v => [v.rev_id, v]));
const foundRevIds = Object.keys(frIndex);
const notFound = revids
.filter(id => !foundRevIds.includes(`${id}`))
.map(id => indexed[id]);
const revDeld = revids
.filter(id => foundRevIds.includes(`${id}`) && (frIndex[`${id}`].rev_deleted & 1 === 1))
.map(id => indexed[id]);
missing += notFound.length + revDeld.length;
rd += revDeld.length;
return notFound.concat(revDeld);
})
});
if ( data === null ) {
// Out of data. We're done here.
console.log(":: Done. Processed "+ done + " of " + count + " cases (" + (done/count*100).toFixed(2) + "%).");
console.log(":: Found "+ missing + " cases (" + (missing/done*100).toFixed(2) + "% cumulative).");
console.log(":: "+ rd + " of those are revdel'd (" + (rd/missing*100).toFixed(2) + "% cumulative).");
break;
} else {
for ( const rev of data ) {
writePromises.push(outRevids.write(rev.diff + "\n"));
writePromises.push(outRows.write(JSON.stringify({
id: rev.id,
project: rev.project.toString("utf8"),
lang: rev.lang.toString("utf8"),
diff: rev.diff,
diff_timestamp: rev.diff_timestamp.toString("utf8"),
page_title: rev.page_title.toString("utf8"),
page_ns: rev.page_ns,
ithenticate_id: rev.ithenticate_id,
status: rev.status == null ? null : rev.status.toString("utf8"),
status_user: rev.status_user == null ? null : rev.status_user.toString("utf8"),
status_timestamp: rev.status_timestamp == null ? null : rev.status_timestamp.toString("utf8")
}) + "\n"));
}
console.log(":: Processed "+ done + " of " + count + " cases (" + (done/count*100).toFixed(2) + "%).");
console.log(":: Found "+ missing + " cases (" + (missing/done*100).toFixed(2) + "% cumulative).");
console.log(":: "+ rd + " of those are revdel'd (" + (rd/missing*100).toFixed(2) + "% cumulative).");
}
}
await Promise.all(writePromises);
cpdb.destroy();
endb.destroy();
} )();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment