@veltman
Last active August 2, 2020 16:13
Extract lengthy Wikipedia articles about people into a TSV
// First, download and extract:
// https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
const fs = require("fs");
const XmlStream = require("xml-stream");

const xml = new XmlStream(
  fs.createReadStream("enwiki-latest-pages-articles.xml", "utf8")
);

xml.on("endElement: page", function(item) {
  const { title, redirect, ns, revision } = item,
    text = revision && revision.text && revision.text["$text"];

  // Skip non-article namespaces, redirects, pages missing a title or text,
  // disambiguation pages, short pages, and pages with no person-related headings
  if (
    ns !== "0" ||
    redirect ||
    !title ||
    !text ||
    title.match(/,|disambiguation|\W(of)/) ||
    text.length < 12000 ||
    !text.match(/\=+\s?(Early life|Personal life|Career|Death)\s?\=+/)
  ) {
    return;
  }

  // Count all wikitext section headings (= Heading =, == Heading ==, etc.)
  const numHeadings = text.match(/\s\=+[^\=]{1,100}\=+/g).length;

  // Emit one TSV row per qualifying article
  console.log([title, text.length, numHeadings].join("\t"));
});
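The two regexes above do the heavy lifting: one tests whether the article has at least one person-related section heading, and the other counts all wikitext headings (runs of `=` around a short title). A minimal sketch of how they behave, using an invented wikitext excerpt (the sample text below is not from the dump):

```javascript
// Synthetic wikitext excerpt for illustration only
const sample =
  "Jane Doe was a chemist.\n" +
  "==Early life==\nBorn in 1900.\n" +
  "==Career==\nWorked on dyes.\n" +
  "==Death==\nDied in 1980.";

// Same person-heading test as the gist's filter
const hasPersonHeading =
  /\=+\s?(Early life|Personal life|Career|Death)\s?\=+/.test(sample);

// Same heading counter: whitespace, then =+, up to 100 non-= chars, then =+
const numHeadings = (sample.match(/\s\=+[^\=]{1,100}\=+/g) || []).length;

console.log(hasPersonHeading, numHeadings); // → true 3
```

Note the counter requires a whitespace character before the first `=`, so a heading at the very start of the text would be missed; guarding the `match` result with `|| []` also avoids a crash when no headings are found.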