Skip to content

Instantly share code, notes, and snippets.

@makotom
Last active April 19, 2019 17:24
Show Gist options
  • Save makotom/4b60d6f461e035d36dda4b5e5239021f to your computer and use it in GitHub Desktop.
Save makotom/4b60d6f461e035d36dda4b5e5239021f to your computer and use it in GitHub Desktop.
Generate a TSV file from JMdict XML data
const fs = require('fs');
/**
 * Reads the file `JMdict_e` (resolved against the current working directory)
 * and returns the raw text of every `<entry>…</entry>` element it contains.
 *
 * Each opening tag is paired with the first closing tag that follows it, so a
 * stray `</entry>` ahead of an entry (for example inside a comment) cannot
 * shift the pairing of all later entries, and an unterminated final entry is
 * dropped instead of producing a bogus slice of the file.
 *
 * @returns {string[]} One string per entry, in document order, each running
 *   from its `<entry` opening tag through its `</entry>` closing tag inclusive.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file is unreadable.
 */
function getEntryStrings() {
  const entryEndTag = '</entry>';
  const ret = [];
  const text = fs.readFileSync('JMdict_e').toString();
  // The lookahead rejects tags that merely start with "entry" (e.g. <entry_x>).
  const entryOpenTagRegExp = /<entry(?=[\s>])/g;

  for (
    let entryOpenMatch = entryOpenTagRegExp.exec(text);
    entryOpenMatch !== null;
    entryOpenMatch = entryOpenTagRegExp.exec(text)
  ) {
    const openAt = entryOpenMatch.index;
    const endTagAt = text.indexOf(entryEndTag, openAt);

    if (endTagAt === -1) {
      // No closing tag is left in the file, so this entry is truncated.
      break;
    }

    const endAt = endTagAt + entryEndTag.length;
    ret.push(text.substring(openAt, endAt));
    // Resume scanning after the entry that was just consumed.
    entryOpenTagRegExp.lastIndex = endAt;
  }

  return ret;
}
{
  // Converts every entry returned by getEntryStrings() into one TSV row and
  // writes the result to JMdict_e.tsv in the current working directory.
  //
  // Columns: `ent_seq` first, then one column per distinct element path seen
  // in any entry, in order of first appearance. Cells other than `ent_seq`
  // hold a JSON array of all text nodes found under that path, or are empty.

  /**
   * Scans the raw markup of a single entry and groups its text nodes by the
   * path of enclosing element names (joined with spaces, with the leading
   * "entry " stripped).
   *
   * This is a minimal character scanner rather than an XML parser: attributes
   * are ignored, self-closing tags (ending in "/") are skipped, and entity
   * references are kept verbatim.
   *
   * @param {string} entryString Raw `<entry>…</entry>` markup.
   * @returns {Map<string, string[]>} Text nodes keyed by element path.
   */
  const collectTextNodes = (entryString) => {
    const textNodes = new Map();
    const elemStack = [];
    let curTerm = '';
    let isTagOpen = false;

    for (let iter = 0; iter < entryString.length; iter += 1) {
      const ch = entryString[iter];

      if (!isTagOpen && ch === '<') {
        // A tag starts: flush the text gathered so far unless it is whitespace only.
        if (/\S/.test(curTerm)) {
          const key = elemStack.join(' ').replace(/^entry /, '');
          let values = textNodes.get(key);
          if (values === undefined) {
            values = [];
            textNodes.set(key, values);
          }
          values.push(curTerm);
        }
        curTerm = '';
        isTagOpen = true;
      } else if (isTagOpen && ch === '>') {
        // A tag ends: curTerm now holds everything between "<" and ">".
        if (curTerm[0] === '/') {
          elemStack.pop();
        } else if (curTerm[curTerm.length - 1] !== '/') {
          // Only the element name (up to the first whitespace) joins the path.
          elemStack.push(/^[^\s]+/.exec(curTerm)[0]);
        }
        curTerm = '';
        isTagOpen = false;
      } else {
        curTerm += ch;
      }
    }

    return textNodes;
  };

  const entries = [];
  // A Set keeps insertion order and makes the "already seen" check O(1).
  const allKeys = new Set();

  getEntryStrings().forEach((entryString) => {
    const textNodes = collectTextNodes(entryString);
    entries.push(textNodes);
    Array.from(textNodes.keys()).forEach((key) => {
      // ent_seq always gets the dedicated first column, so it is not a regular key.
      if (key !== 'ent_seq') {
        allKeys.add(key);
      }
    });
  });

  const columns = Array.from(allKeys);
  const rows = [];

  // The header needs its own ent_seq cell; without it the header is one column
  // short and every label is shifted against the data rows.
  rows.push(['ent_seq'].concat(columns).join('\t'));

  entries.forEach((entry) => {
    // Every entry is expected to carry an ent_seq; a missing one throws here.
    const row = [entry.get('ent_seq')[0]];
    columns.forEach((key) => {
      const values = entry.get(key);
      row.push(values !== void 0 ? JSON.stringify(values) : '');
    });
    rows.push(row.join('\t'));
  });

  fs.writeFileSync('JMdict_e.tsv', rows.join('\n'));
}
@makotom
Copy link
Author

makotom commented Apr 19, 2019

Usage

Prerequisite

  1. Both parse.js and JMdict_e (i.e. the unzipped XML file) must be located in the same directory, and the script must be run from that directory.
  2. A Node.js runtime environment is required. Note: Tested only in UNIX-like environments (including WSL).

Run

$ node parse.js

You will find JMdict_e.tsv in the working directory.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment