Last active
August 11, 2023 17:32
-
-
Save ChlodAlejandro/fc659b3863cb2e2bfaf59c076d87ae31 to your computer and use it in GitHub Desktop.
Calculates open CopyPatrol cases which can't be actioned (page is deleted or content is already revision-deleted).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*! | |
Calculates open CopyPatrol cases which can't be actioned (page is | |
deleted or content is already revision-deleted). | |
Run the following commands beforehand: | |
```sh | |
npm install knex mysql2 dotenv | |
# Replace <shell> with your Toolforge username. | |
ssh -L 4711:enwiki.analytics.db.svc.eqiad.wmflabs:3306 -L 4712:tools.db.svc.eqiad.wmflabs:3306 <shell>@dev.toolforge.org -Nv | |
``` | |
Then make an `.env` file in the same folder as this script with the following | |
content: | |
```ini | |
DB_HOST = localhost | |
DB_PORT = 4712 | |
DB_USER = <FILLME> | |
DB_PASS = <FILLME> | |
DB_NAME_COPYPATROL = s51306__copyright_p | |
DB_REPLICA_HOST = localhost | |
DB_REPLICA_PORT = 4711 | |
DB_REPLICA_USER = <FILLME> | |
DB_REPLICA_PASS = <FILLME> | |
CPS_PROJECT = wikipedia | |
CPS_LANG = en | |
DB_REPLICA_NAME = enwiki_p | |
``` | |
Replace <FILLME> with the appropriate username/password found in your | |
Toolforge `~/replica.my.cnf` file. | |
You can then run `node copypatrol-open-count.js` from another terminal. | |
(Keep the terminal running SSH open!) | |
@copyright Chlod Alejandro <chlod@chlod.net> | |
@license MIT | |
*/ | |
const knex = require("knex"); | |
const fs = require("fs/promises"); | |
require("dotenv").config(); | |
( async() => { | |
const BATCH_SIZE = 250; | |
console.log("Opening fs streams..."); | |
const filePrefix = new Date().toISOString().replace(/[:-TZ]/g, ""); | |
const outRevids = await fs.open(filePrefix + "-revids.txt", "w"); | |
const outRows = await fs.open(filePrefix + "-revids.jsonl", "w"); | |
console.log("Connecting..."); | |
console.log(":: copypatrol..."); | |
const cpdb = knex({ | |
client: "mysql2", | |
connection: { | |
host: process.env.DB_HOST, | |
port: process.env.DB_PORT, | |
user: process.env.DB_USER, | |
password: process.env.DB_PASS, | |
database: process.env.DB_NAME_COPYPATROL | |
}, | |
pool: { min: 0, idleTimeoutMillis: 5e3 } | |
}); | |
console.log(":: " + process.env.DB_REPLICA_NAME + "..."); | |
const endb = knex({ | |
client: "mysql2", | |
connection: { | |
host: process.env.DB_REPLICA_HOST, | |
port: process.env.DB_REPLICA_PORT, | |
user: process.env.DB_REPLICA_USER, | |
password: process.env.DB_REPLICA_PASS, | |
database: process.env.DB_REPLICA_NAME | |
}, | |
pool: { min: 0, idleTimeoutMillis: 5e3 } | |
}); | |
console.log("Querying..."); | |
const getOpenCases = cpdb.select() | |
.from('copyright_diffs') | |
.where('project', process.env.CPS_PROJECT) | |
.where('lang', process.env.CPS_LANG) | |
.whereNull('status') | |
// https://phabricator.wikimedia.org/T138317 | |
.where('diff_timestamp', '>', 20160620000000) | |
.orderBy('id'); | |
console.log(":::: " + getOpenCases.toString()); | |
console.log(":: Getting open case count totals..."); | |
const countQ = getOpenCases.clone() | |
.count("*", { as: 'count' }) | |
.first(); | |
console.log(":::: " + countQ.toString()); | |
const count = await countQ.then((row) => row.count); | |
console.log(":: Found " + count + " open cases."); | |
console.log(":: Getting draft case count totals..."); | |
const draftCountQ = getOpenCases.clone() | |
.whereNot('page_ns', 0) | |
.count("*", { as: 'count' }) | |
.first(); | |
console.log(":::: " + draftCountQ.toString()); | |
const draftCount = await draftCountQ.then((row) => row.count); | |
console.log(":: " + draftCount + " are drafts."); | |
let done = 0; | |
let rd = 0; | |
let missing = 0; | |
let processed = new Set(); | |
const writePromises = []; | |
while (true) { | |
const dataQ = getOpenCases.clone() | |
.offset(done) | |
.limit(BATCH_SIZE); | |
console.log(":::: " + dataQ.toString()); | |
const data = await dataQ | |
.then(async (rows) => { | |
console.log(":::: found " + rows.length + " rows"); | |
done += rows.length; | |
if (rows.length === 0) | |
return null; | |
const indexed = Object.fromEntries(rows.map( | |
(row) => [row.diff, row] | |
)); | |
for (const key of Object.keys(indexed)) { | |
if (processed.has(key)) { | |
console.error(":::: duplicate revision: " + key); | |
} | |
processed.add(key); | |
} | |
const revids = rows.map((row) => row.diff); | |
const endbQ = endb | |
.select("rev_id", "rev_deleted") | |
.from("revision") | |
.whereIn("rev_id", revids); | |
// big. | |
// console.log(":::: " + endbQ.toString()); | |
return endbQ | |
.then((foundRows) => { | |
const frIndex = Object.fromEntries(foundRows.map(v => [v.rev_id, v])); | |
const foundRevIds = Object.keys(frIndex); | |
const notFound = revids | |
.filter(id => !foundRevIds.includes(`${id}`)) | |
.map(id => indexed[id]); | |
const revDeld = revids | |
.filter(id => foundRevIds.includes(`${id}`) && (frIndex[`${id}`].rev_deleted & 1 === 1)) | |
.map(id => indexed[id]); | |
missing += notFound.length + revDeld.length; | |
rd += revDeld.length; | |
return notFound.concat(revDeld); | |
}) | |
}); | |
if ( data === null ) { | |
// Out of data. We're done here. | |
console.log(":: Done. Processed "+ done + " of " + count + " cases (" + (done/count*100).toFixed(2) + "%)."); | |
console.log(":: Found "+ missing + " cases (" + (missing/done*100).toFixed(2) + "% cumulative)."); | |
console.log(":: "+ rd + " of those are revdel'd (" + (rd/missing*100).toFixed(2) + "% cumulative)."); | |
break; | |
} else { | |
for ( const rev of data ) { | |
writePromises.push(outRevids.write(rev.diff + "\n")); | |
writePromises.push(outRows.write(JSON.stringify({ | |
id: rev.id, | |
project: rev.project.toString("utf8"), | |
lang: rev.lang.toString("utf8"), | |
diff: rev.diff, | |
diff_timestamp: rev.diff_timestamp.toString("utf8"), | |
page_title: rev.page_title.toString("utf8"), | |
page_ns: rev.page_ns, | |
ithenticate_id: rev.ithenticate_id, | |
status: rev.status == null ? null : rev.status.toString("utf8"), | |
status_user: rev.status_user == null ? null : rev.status_user.toString("utf8"), | |
status_timestamp: rev.status_timestamp == null ? null : rev.status_timestamp.toString("utf8") | |
}) + "\n")); | |
} | |
console.log(":: Processed "+ done + " of " + count + " cases (" + (done/count*100).toFixed(2) + "%)."); | |
console.log(":: Found "+ missing + " cases (" + (missing/done*100).toFixed(2) + "% cumulative)."); | |
console.log(":: "+ rd + " of those are revdel'd (" + (rd/missing*100).toFixed(2) + "% cumulative)."); | |
} | |
} | |
await Promise.all(writePromises); | |
cpdb.destroy(); | |
endb.destroy(); | |
} )(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment