-
-
Save nrubin999/b4354313e050043c47bd898fdde0b369 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "wiki",
  "version": "1.0.0",
  "description": "",
  "main": "scan.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "author": "",
  "license": "ISC",
  "dependencies": {
    "node-fetch": "^2.6.1",
    "wikijs": "^6.0.1"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fetch = require("node-fetch"); | |
const fs = require('fs'); | |
// Search query string sent as the `srsearch` parameter of the Wikipedia search API.
const SEARCH_QUERY = "hastemplate:Infobox_officeholder incategory:living_people";
// Suspicious revision keywords. They are compared against lower-cased revision
// comments, so every entry must be lower case. Frozen so the shared list cannot
// be modified by accident while a scan is running.
const SUSPICIOUS_WORDS = Object.freeze(["award", "charity", "controversies"]);
// File name is current timestamp | |
var stream = fs.createWriteStream(Date.now() + ".txt"); | |
stream.once('open', function(fd) { | |
// Start at page 1 of API search results | |
var previous_offset = 0; | |
var previous_continue = "-||"; | |
// Write header to .txt export file | |
stream.write("id | title | wordcount | pageviews | revisions | flagged\n"); | |
/**
 * Walks the Wikipedia search results for SEARCH_QUERY and, for every hit,
 * fetches its pageviews and latest revisions, then logs and writes one
 * " | "-separated line to `stream`:
 * id | title | wordcount | pageviews | revisions | flagged.
 *
 * Pagination state is kept in the enclosing `previous_offset` /
 * `previous_continue` variables. The scan stops as soon as the API no longer
 * returns a `continue` object, or after `maxPages` result pages.
 *
 * @param {object} [options]
 * @param {number} [options.maxPages=100] Maximum number of search-result pages (50 hits each) to process.
 * @param {number} [options.delayMs=1000] Pause before each per-page request, to avoid the API rate limit.
 * @returns {Promise<void>} Resolves when the scan is finished.
 * @throws {Error} If a request returns a non-2xx status or the API reports an error.
 */
async function startScan({ maxPages = 100, delayMs = 1000 } = {}) {
  // "Last month" window used for pageviews, revisions and flagged revisions.
  const RECENT_DAYS = 30;

  for (let page_number = 0; page_number < maxPages; page_number++) {
    // encodeURIComponent (not encodeURI) so "&", "=", "+" or "#" inside a value cannot corrupt the query string.
    let search_url = "https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=" + encodeURIComponent(SEARCH_QUERY) + "&srlimit=50&format=json";
    if (previous_offset) {
      search_url += "&sroffset=" + encodeURIComponent(previous_offset);
      search_url += "&continue=" + encodeURIComponent(previous_continue);
    }

    const data = await fetchJson(search_url);
    const has_more = Boolean(data.continue);
    if (has_more) {
      previous_offset = data.continue.sroffset;
      previous_continue = data.continue.continue;
    }

    // Iterate over what was actually returned: the last page can hold fewer than 50 hits.
    const results = (data.query && data.query.search) || [];
    for (const result of results) {
      // Pause to avoid API rate limit
      await new Promise((resolve) => setTimeout(resolve, delayMs));

      const page_id = result.pageid;
      const page_url = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageviews|revisions&pageids=" + page_id + "&rvlimit=100";
      const page_data = await fetchJson(page_url);
      const pages = (page_data.query && page_data.query.pages) || {};
      const page = pages[page_id] || {};

      // Fall back to the search hit's title when the page entry carries none.
      const title = page.title !== undefined ? page.title : result.title;
      const wordcount = result.wordcount;
      const month_pageviews = sumRecentPageviews(page.pageviews, RECENT_DAYS);
      // "now" is taken per page: a full scan can run for over an hour.
      const revision_counts = countRecentRevisions(page.revisions, new Date(), RECENT_DAYS);
      const recent_revisions = revision_counts.recent;
      const suspicious_revisions = revision_counts.flagged;

      console.log("(" + page_id + ") " + title + " | " + wordcount + " | " + month_pageviews + " | " + recent_revisions + " | " + suspicious_revisions);
      // Write data to file
      stream.write(page_id + " | " + title + " | " + wordcount + " | " + month_pageviews + " | " + recent_revisions + " | " + suspicious_revisions + "\n");
    }

    // Without a continuation token the next request would just repeat this page.
    if (!has_more) {
      break;
    }
  }
}

// Fetches a URL and returns its parsed JSON body, throwing on HTTP failures and on API-level `error` payloads.
async function fetchJson(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error("Request failed with HTTP " + response.status + ": " + url);
  }
  const body = await response.json();
  if (body.error) {
    throw new Error("Wikipedia API error (" + body.error.code + "): " + body.error.info);
  }
  return body;
}

// Sums the last `days` values of a pageviews object; missing data and null entries count as 0.
function sumRecentPageviews(pageviews, days) {
  const counts = Object.values(pageviews || {});
  const start = Math.max(counts.length - days, 0);
  return counts.slice(start).reduce((total, count) => total + (count || 0), 0);
}

// Counts revisions newer than `days` days, and how many of those have a comment containing a suspicious keyword.
function countRecentRevisions(revisions, now, days) {
  let recent = 0;
  let flagged = 0;
  for (const revision of revisions || []) {
    if (daysBetween(now, new Date(revision.timestamp)) >= days) {
      continue;
    }
    recent += 1;
    const comment = (revision.comment || "").toLowerCase();
    // Each revision is flagged at most once, however many keywords it mentions.
    if (SUSPICIOUS_WORDS.some((word) => comment.includes(word))) {
      flagged += 1;
    }
  }
  return { recent, flagged };
}
/**
 * Returns the absolute distance between two dates, in (fractional) days.
 * The order of the arguments does not matter.
 *
 * @param {Date} d1 First date.
 * @param {Date} d2 Second date.
 * @returns {number} Non-negative number of days between `d1` and `d2`.
 */
function daysBetween(d1, d2) {
  const MS_PER_DAY = 1000 * 60 * 60 * 24;
  return Math.abs(d1.getTime() - d2.getTime()) / MS_PER_DAY;
}
startScan(); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment