Skip to content

Instantly share code, notes, and snippets.

@cometkim
Last active July 21, 2023 12:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cometkim/2d6ade96329edfccff0fd8da57010f31 to your computer and use it in GitHub Desktop.
Save cometkim/2d6ade96329edfccff0fd8da57010f31 to your computer and use it in GitHub Desktop.
Convert Wikipedia page-articles data (XML) into a text dataset (NDJSON)
// Convert a Wikipedia pages-articles dump (XML) into a stream of NDJSON records:
// { "id": 0, "title": "...", "text": "..." }
// The "text" field is also converted from wiki markup into plain text.
import * as path from 'node:path';
import * as fs from 'node:fs';
import XMLParser from 'node-xml-stream';
import ndjson from 'ndjson';
import instaview from 'instaview';
import htmlEntities from 'html-entities';
import * as htmlToText from 'html-to-text';
// ---- Command-line handling and stream setup ----
// The single positional argument is the path of the XML dump to read.
const [arg] = process.argv.slice(2);
if (arg === undefined) {
  // path.resolve(undefined) would throw an opaque TypeError; print a usage
  // hint and exit with a failure status instead.
  console.error(`usage: node ${path.basename(process.argv[1])} <path-to-xml-dump>`);
  process.exit(1);
}
const filePath = path.resolve(arg);
const inputStream = fs.createReadStream(filePath, 'utf8');
// Without a listener, a read failure (missing file, permissions, ...) surfaces
// as an unhandled 'error' event. Report it and mark the run as failed;
// exitCode (rather than process.exit) lets already-produced output flush.
inputStream.on('error', (err) => {
  console.error(`failed to read ${filePath}: ${err.message}`);
  process.exitCode = 1;
});
const xmlStream = new XMLParser();
const jsonStream = ndjson.stringify();
// Parser state: which element's text is currently being captured.
// One of 'idle' | 'page' | 'id' | 'title' | 'revision' | 'text'.
let state = 'idle';
// Record being assembled for the page that is currently open.
let doc = { id: -1, title: '', text: '' };
// State transitions taken when an element opens:
//   tag name -> [state required for the transition, state entered].
// <id> and <title> are only entered directly from 'page', so an <id> seen
// while in the 'revision' state is ignored. <text> is only entered from
// 'revision'.
const OPEN_TAG_TRANSITIONS = new Map([
  ['page', ['idle', 'page']],
  ['id', ['page', 'id']],
  ['title', ['page', 'title']],
  ['revision', ['page', 'revision']],
  ['text', ['revision', 'text']],
]);

xmlStream.on('opentag', (tagName) => {
  const transition = OPEN_TAG_TRANSITIONS.get(tagName);
  if (transition === undefined) {
    // Tags that do not take part in the state machine leave it untouched.
    return;
  }
  const [requiredState, nextState] = transition;
  if (state === requiredState) {
    state = nextState;
  }
});
// Capture character data for whichever field the state machine is currently
// inside; text seen in any other state is ignored.
xmlStream.on('text', text => {
  switch (state) {
    case 'title': {
      doc.title = text;
      break;
    }
    case 'id': {
      doc.id = Number(text);
      break;
    }
    case 'text': {
      try {
        doc.text = formatText(text);
      } catch (err) {
        // Best effort: keep going, but do not leave a stale value from an
        // earlier assignment in the record, and report why formatting failed.
        doc.text = '';
        const reason = err instanceof Error ? err.message : String(err);
        console.error(`failed to format doc(id=${doc.id}, title=${doc.title}): ${reason}`);
      }
      break;
    }
    default: {
      break;
    }
  }
});
// Unwind the state machine when an element closes. Closing </page> emits the
// assembled record.
xmlStream.on('closetag', name => {
  switch (name) {
    case 'page': {
      if (state === 'page') {
        // NOTE(review): the return value of write() is ignored, so
        // backpressure from the output side is not propagated to the parser.
        jsonStream.write(doc);
        // Start the next page from a fresh object: fields that the next page
        // never sets (e.g. no captured text) must not inherit this page's
        // values, and the object just handed to the stream is never mutated.
        doc = { id: -1, title: '', text: '' };
        state = 'idle';
      }
      break;
    }
    case 'id': {
      if (state === 'id') {
        state = 'page';
      }
      break;
    }
    case 'title': {
      if (state === 'title') {
        state = 'page';
      }
      break;
    }
    case 'revision': {
      if (state === 'revision') {
        state = 'page';
      }
      break;
    }
    case 'text': {
      if (state === 'text') {
        state = 'revision';
      }
      break;
    }
    default: {
      break;
    }
  }
});
// Wire the pipeline: serialized records go to stdout, and the file contents
// feed the XML parser whose event handlers populate jsonStream.
jsonStream.pipe(process.stdout);
inputStream.pipe(xmlStream);
/**
 * Remove `{{...}}` template markup from a string.
 *
 * `{{llang|<code>|<content>}}` is replaced by its content; every other
 * `{{...}}` span is dropped together with at most one following whitespace
 * character. Matching stops at the first `}`, so nested templates are not
 * balanced and can leave a trailing `}}` behind.
 *
 * @param {string} markup - Text that may contain template markup.
 * @returns {string} The text with template markup removed.
 */
function stripWikiTags(markup) {
  const llangTemplate = /{{llang\|\w+\|([^}]*)}}/g;
  const anyTemplate = /{{[^}]*}}\s?/g;

  // llang must be unwrapped first, otherwise the catch-all pattern would
  // delete its content as well.
  const llangUnwrapped = markup.replace(llangTemplate, '$1');
  return llangUnwrapped.replace(anyTemplate, '');
}
/**
 * Turn the raw contents of a <text> element into plain text.
 *
 * Steps, in order: decode HTML entities, run the result through
 * `instaview.convert` (presumably wiki markup -> HTML; confirm against the
 * library), strip remaining `{{...}}` templates, then convert to plain text
 * with `html-to-text`.
 *
 * @param {string} text - Raw text captured from the XML dump.
 * @returns {string} Plain-text rendering of the input.
 */
function formatText(text) {
  const decoded = htmlEntities.decode(text);
  const converted = instaview.convert(decoded);
  const withoutTemplates = stripWikiTags(converted);

  const plainTextOptions = {
    // Keep each paragraph on one line.
    wordwrap: false,
    selectors: [
      // Keep link labels but drop their URLs.
      { selector: 'a', options: { ignoreHref: true } },
      // Images contribute nothing to the text output.
      { selector: 'img', format: 'skip' },
    ],
  };
  return htmlToText.convert(withoutTemplates, plainTextOptions);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment