-
-
Save veltman/e63ec0f44e04cf2c8cb3790c8c9b7838 to your computer and use it in GitHub Desktop.
Extract lengthy Wikipedia articles about people into a TSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// First, download and extract:
// https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
//
// Streams the English Wikipedia dump and prints one TSV row
// (title, article length, heading count) for each lengthy article
// that looks like a biography of a person.
const fs = require("fs");
const XmlStream = require("xml-stream");

const xml = new XmlStream(
  fs.createReadStream("enwiki-latest-pages-articles.xml", "utf8")
);

xml.on("endElement: page", function (item) {
  const { title, redirect, ns, revision } = item;
  // Guard: some pages may lack revision text entirely.
  const text = revision && revision.text ? revision.text["$text"] : undefined;

  // Skip non-article namespaces, redirect pages, disambiguation pages,
  // short pages, and pages with no person-related headings.
  if (
    ns !== "0" ||
    redirect ||
    !title ||
    !text ||
    title.match(/,|disambiguation|\W(of)/) ||
    text.length < 12000 ||
    !text.match(/\=+\s?(Early life|Personal life|Career|Death)\s?\=+/)
  ) {
    return;
  }

  // Count section headings ("== Heading =="). Note the guard above only
  // proves a person-related heading exists; this stricter pattern (which
  // requires leading whitespace) can still match nothing, so fall back
  // to an empty array instead of crashing on null.
  const headings = text.match(/\s\=+[^\=]{1,100}\=+/g) || [];
  console.log([title, text.length, headings.length].join("\t"));
});
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.