@nrubin999 · Created November 15, 2020 19:33

package.json:
{
  "name": "wiki",
  "version": "1.0.0",
  "description": "",
  "main": "scan.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "node-fetch": "^2.6.1",
    "wikijs": "^6.0.1"
  }
}
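
To install and run (assuming the script below is saved as scan.js, matching "main" above):

    npm install
    node scan.js

Each run writes a new pipe-delimited report named with the current Date.now() timestamp plus .txt in the working directory.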
scan.js:

const fetch = require("node-fetch");
const fs = require("fs");

// Search query: living people whose articles use the officeholder infobox
const SEARCH_QUERY = "hastemplate:Infobox_officeholder incategory:living_people";

// Keywords that flag a revision's edit summary as suspicious
const SUSPICIOUS_WORDS = ["award", "charity", "controversies"];

// Output file is named with the current Unix timestamp (milliseconds)
var stream = fs.createWriteStream(Date.now() + ".txt");
stream.once("open", function (fd) {
    // Continuation state: start at the first batch of API search results
    var previous_offset = 0;
    var previous_continue = "-||";

    // Write the header row to the .txt export file
    stream.write("id | title | wordcount | pageviews | revisions | flagged\n");

    async function startScan() {
        // Scan up to 100 batches of 50 search results each
        for (var i = 0; i < 100; i++) {
            var search_url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=" + encodeURIComponent(SEARCH_QUERY) + "&srlimit=50&format=json";
            if (previous_offset) {
                search_url += "&sroffset=" + previous_offset;
                search_url += "&continue=" + encodeURIComponent(previous_continue);
            }
            var response = await fetch(search_url);
            var data = await response.json();
            if (data.continue) {
                previous_offset = data.continue.sroffset;
                previous_continue = data.continue.continue;
            }
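            // Continuation note: list=search returns results in batches; when
            // more remain, the response includes a "continue" object (e.g.
            // { "continue": { "sroffset": 50, "continue": "-||" } }) whose
            // values are echoed back on the next request to advance the cursor.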
            // Process every result in this batch (up to srlimit=50)
            for (var j = 0; j < data.query.search.length; j++) {
                // Pause for 1 second between lookups to stay under the API rate limit
                await new Promise(r => setTimeout(r, 1000));
                var page_id = data.query.search[j].pageid;
                var page_url = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageviews|revisions&pageids=" + page_id + "&rvlimit=100";
                var page_response = await fetch(page_url);
                var page_data = await page_response.json();
                // Get page title
                var title = page_data.query.pages[page_id].title;

                // Get page word count (as reported by the search API)
                var wordcount = data.query.search[j].wordcount;

                // Sum pageviews over the last 30 days; the API returns a
                // per-day map (oldest first), and days without data are null
                var pageviews = Object.values(page_data.query.pages[page_id].pageviews || {}).reverse();
                var month_pageviews = 0;
                for (var p = 0; p < 30 && p < pageviews.length; p++) {
                    month_pageviews += pageviews[p] || 0;
                }
                // Count revisions made in the last 30 days (of the 100 fetched)
                var recent_revisions = 0;
                var revisions = page_data.query.pages[page_id].revisions || [];
                for (var r = 0; r < revisions.length; r++) {
                    var revision_date = new Date(revisions[r].timestamp);
                    if (daysBetween(new Date(), revision_date) < 30) {
                        recent_revisions += 1;
                    }
                }
                // Count recent revisions whose edit summaries mention a keyword
                var suspicious_revisions = 0;
                for (var r = 0; r < revisions.length; r++) {
                    var revision_date = new Date(revisions[r].timestamp);
                    let revision_comment = revisions[r].comment;
                    if (revision_comment) {
                        revision_comment = revision_comment.toLowerCase();
                        if (daysBetween(new Date(), revision_date) < 30) {
                            for (var x = 0; x < SUSPICIOUS_WORDS.length; x++) {
                                if (revision_comment.includes(SUSPICIOUS_WORDS[x])) {
                                    suspicious_revisions += 1;
                                }
                            }
                        }
                    }
                }
                console.log("(" + page_id + ") " + title + " | " + wordcount + " | " + month_pageviews + " | " + recent_revisions + " | " + suspicious_revisions);
                // Write one pipe-delimited row per page to the export file
                stream.write(page_id + " | " + title + " | " + wordcount + " | " + month_pageviews + " | " + recent_revisions + " | " + suspicious_revisions + "\n");
            }
            // Stop early once the API reports no further results
            if (!data.continue) break;
        }
        // Flush and close the export file when the scan finishes
        stream.end();
    }
    // Absolute difference between two dates, in (fractional) days
    function daysBetween(d1, d2) {
        var diff = Math.abs(d1.getTime() - d2.getTime());
        return diff / (1000 * 60 * 60 * 24);
    }

    startScan();
});
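
For post-processing, the report can be read back by splitting each row on the " | " delimiter; this is safe because "|" is not a legal character in Wikipedia page titles. A minimal sketch (the file name is a placeholder for whatever timestamped report a run produced):

    const fs = require("fs");

    // Parse a report produced by scan.js into an array of row objects,
    // keyed by the header row (id, title, wordcount, pageviews, revisions, flagged)
    function parseReport(path) {
        const lines = fs.readFileSync(path, "utf8").trim().split("\n");
        const headers = lines[0].split(" | ");
        return lines.slice(1).map(function (line) {
            const fields = line.split(" | ");
            const row = {};
            headers.forEach(function (h, i) { row[h] = fields[i]; });
            return row;
        });
    }

    // Example: list pages whose recent edit summaries matched a keyword
    const rows = parseReport("1605468000000.txt");
    console.log(rows.filter(r => Number(r.flagged) > 0).map(r => r.title));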