Skip to content

Instantly share code, notes, and snippets.

@milahu
Created January 19, 2020 17:15
Show Gist options
  • Save milahu/592b49a7f38d49324219decbb93374fd to your computer and use it in GitHub Desktop.
Save milahu/592b49a7f38d49324219decbb93374fd to your computer and use it in GitHub Desktop.
show all additions in wikipedia page history
// show all additions in wikipedia page history
//
// get the XML file from
// https://en.wikipedia.org/wiki/Special:Export
//
// Add pages manually:
// Skibadee
//
// [ ] Include only the current revision, not the full history
// ^ uncheck this
//
// set filename in variable
// wikipedia_xml_file
//
// install dependencies
// npm i xml2js diff
//
// run script
// node skibadee.js
//
// to see all pages, comment out the line
// marked "flood limit" in the page loop
// (the check that breaks when ip reaches 1)
//
// license = CC0-1.0
const fs = require('fs')
const xml2js = require('xml2js')
const jsdiff = require('diff')
// edit this
const wikipedia_xml_file = 'Wikipedia-20200119163602.xml'

// Read the whole export into memory as a UTF-8 string.
const src = fs.readFileSync(wikipedia_xml_file, 'utf8')

/**
 * Return the text content of one parsed <revision> element.
 *
 * @param {object} revision - one entry of `page.revision` as produced by xml2js
 * @returns {string|undefined} the revision text, or undefined when the
 *   revision carries no text
 */
function revision_text (revision) {
  const node = revision.text && revision.text[0]
  // With default xml2js options an element without attributes is parsed to a
  // plain string, while an element with attributes becomes an object whose
  // character data sits under the `_` key. Accept both shapes.
  if (typeof node === 'string') { return node }
  return node ? node._ : undefined
}

// Parse the export and, for every revision, print the words that were added
// compared to the previous revision of the same page.
xml2js.parseString(src, function (err, result) {
  if (err) {
    // Report the parse failure instead of crashing later on an undefined
    // result; exitCode lets the process finish and still signal failure.
    console.error(`failed to parse ${wikipedia_xml_file}: ${err.message}`)
    process.exitCode = 1
    return
  }
  // An export without any <page> element has no `page` key at all.
  const pages = (result && result.mediawiki && result.mediawiki.page) || []
  for (const [ip, page] of pages.entries()) {
    // comment the next line to see all pages
    if (ip === 1) { break } // flood limit
    // Start every page from an empty baseline so its first revision is not
    // diffed against the last revision of the previous page.
    let text_last = ''
    for (const [ir, revision] of (page.revision || []).entries()) {
      //if (ir == 20) { break } // flood limit
      console.log(`# revision ${revision.id}`)
      const text = revision_text(revision)
      // Revisions without text are skipped and leave the baseline unchanged.
      if (!text) { continue }
      // find changed words
      const diff = jsdiff.diffWords(text_last, text)
      for (const d of diff) {
        // print added words
        if (d.added) {
          console.log(d.value)
        }
      }
      text_last = text
    }
  }
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment