We're going to be calling into Python from Bun, so make sure that you have Python 3 and pip installed, then install these Python packages:
# https://github.com/polm/fugashi
pip install fugashi
# https://github.com/polm/unidic-py
pip install unidic
Now make a fresh Bun project and install bunpy, a port of deno_python
to Bun that lets Bun call Python:
bun init
bun install bunpy
Declare this module:
// index.ts
import { python } from "bunpy";
// Get a long string of Japanese text (e.g. a novel) that is 4,000
// lines long or more.
//
// You can get a public-domain novel at the URL below. Just select all and
// copy-paste it into a string. There is no need to remove the English bits,
// as the tagger can handle mixed languages.
// https://www.aozora.gr.jp/cards/001030/files/4803_14204.html
const longJapaneseString = "風立ちぬ…";
// Log a marker before and after the call, so that it is obvious from
// the output whether main() ran to completion.
console.log("Running…");
// Call main(), which is declared further down in this file.
main(longJapaneseString);
console.log("…Survived!");
/**
 * Converts a multi-line text into its pronunciation, one line at a time.
 *
 * Imports `GenericTagger` from the Python module `fugashi` and `DICDIR`
 * from the Python module `unidic`, builds a tagger whose `-d` option is
 * the stringified `DICDIR`, and runs every "\n"-separated line of the
 * input through `getPronunciation`.
 *
 * @param japaneseText - The text to process; it is split on "\n".
 * @returns The pronunciation of each line, each followed by "\n" (so the
 *   result always ends with a newline).
 */
export function main(japaneseText: string): string {
  const { GenericTagger } = python.import("fugashi");
  const { DICDIR } = python.import("unidic");

  // The dictionary directory is quoted inside the option string.
  const taggerArgs = `-d "${DICDIR.toString()}"`;
  const tagger = GenericTagger(taggerArgs);

  return japaneseText
    .split("\n")
    .map((line) => getPronunciation(line, tagger) + "\n")
    .join("");
}
/**
 * Builds the pronunciation of `text` by concatenating one reading per
 * word produced by `tagger(text)`.
 *
 * For each word, the reading is the entry at index 9 of the word's
 * feature list when that list is long enough; otherwise the word's
 * surface form is used. A reading of "*" is also replaced by the
 * surface form.
 *
 * @param text - The text handed to the tagger.
 * @param tagger - Callable that yields the words of `text`.
 * @returns The readings of all words, concatenated without a separator.
 */
export function getPronunciation(
  text: string,
  tagger: Fugashi["GenericTagger"]
): string {
  // Position within a word's feature list that is read as its
  // pronunciation. NOTE(review): presumably the UniDic pronunciation
  // column — confirm against the dictionary's field layout.
  const pronIndex = 9;
  let result = "";

  for (const token of tagger(text)) {
    // The crash happens somewhere around here, during the proxy
    // accesses; it is not clear which access is the faulty one. Every
    // access is therefore left un-cached, so the sequence of proxy
    // reads stays exactly as it was.
    let reading: string;
    if (token.proxy.feature.valueOf().length > pronIndex) {
      reading = token.proxy.feature.valueOf()[pronIndex].valueOf();
    } else {
      // Feature list too short to contain a pronunciation entry.
      reading = token.proxy.surface.valueOf();
    }

    // "*" is treated as "no pronunciation": use the surface form.
    if (reading === "*") {
      reading = token.proxy.surface.valueOf();
    }

    result += reading;
  }

  return result;
}
/** A proxied Python string; `valueOf()` yields the JavaScript string. */
interface PythonString {
  valueOf(): string;
}

/**
 * A proxied Python sequence: it can be iterated directly, or converted
 * to a JavaScript array with `valueOf()`.
 */
type PythonArray<TItem> = {
  [Symbol.iterator](): Iterator<TItem>;
  valueOf(): TItem[];
};

/** A single word yielded by the tagger, read through its `proxy`. */
interface Word {
  proxy: {
    /** The word's feature list. */
    feature: PythonArray<PythonString>;
    /** The word's surface form. */
    surface: PythonString;
  };
}

/**
 * Minimal typing for the part of fugashi that this module uses.
 *
 * NOTE(review): `main` calls `GenericTagger` with an option string and
 * then passes the result on as the tagger, while `getPronunciation`
 * uses `Fugashi["GenericTagger"]` as the type of that tagger (called
 * with the text, yielding words). This signature matches the latter
 * use — confirm whether the two should be typed separately.
 */
interface Fugashi {
  GenericTagger(args: string): PythonArray<Word>;
}
Run the code:
bun run index.ts