Last active
April 17, 2021 20:26
-
-
Save Venryx/ecbea1a0c7a8a6cb21d80886488045f1 to your computer and use it in GitHub Desktop.
Hiragana/Katakana to Romaji Converter (Typescript/Javascript)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Raw chart text obtained by simple copy-paste from: https://en.wikipedia.org/wiki/Hepburn_romanization#Romanization_charts
// Each entry is "<hiragana> <katakana> <romaji>", with entries separated by whitespace.
const rawChartText = `
あ ア a い イ i う ウ u え エ e お オ o
か カ ka き キ ki く ク ku け ケ ke こ コ ko きゃ キャ kya きゅ キュ kyu きょ キョ kyo
さ サ sa し シ shi す ス su せ セ se そ ソ so しゃ シャ sha しゅ シュ shu しょ ショ sho
た タ ta ち チ chi つ ツ tsu て テ te と ト to ちゃ チャ cha ちゅ チュ chu ちょ チョ cho
な ナ na に ニ ni ぬ ヌ nu ね ネ ne の ノ no にゃ ニャ nya にゅ ニュ nyu にょ ニョ nyo
は ハ ha ひ ヒ hi ふ フ fu へ ヘ he ほ ホ ho ひゃ ヒャ hya ひゅ ヒュ hyu ひょ ヒョ hyo
ま マ ma み ミ mi む ム mu め メ me も モ mo みゃ ミャ mya みゅ ミュ myu みょ ミョ myo
や ヤ ya ゆ ユ yu よ ヨ yo
ら ラ ra り リ ri る ル ru れ レ re ろ ロ ro りゃ リャ rya りゅ リュ ryu りょ リョ ryo
わ ワ wa ゐ ヰ i † ゑ ヱ e † を ヲ o ‡
ん ン n /n'
が ガ ga ぎ ギ gi ぐ グ gu げ ゲ ge ご ゴ go ぎゃ ギャ gya ぎゅ ギュ gyu ぎょ ギョ gyo
ざ ザ za じ ジ ji ず ズ zu ぜ ゼ ze ぞ ゾ zo じゃ ジャ ja じゅ ジュ ju じょ ジョ jo
だ ダ da ぢ ヂ ji づ ヅ zu で デ de ど ド do ぢゃ ヂャ ja ぢゅ ヂュ ju ぢょ ヂョ jo
ば バ ba び ビ bi ぶ ブ bu べ ベ be ぼ ボ bo びゃ ビャ bya びゅ ビュ byu びょ ビョ byo
ぱ パ pa ぴ ピ pi ぷ プ pu ぺ ペ pe ぽ ポ po ぴゃ ピャ pya ぴゅ ピュ pyu ぴょ ピョ pyo`.trim();
// Extended chart from the same page.
// Each entry is "<katakana> <romaji>", optionally followed by a footnote marker (*, ⁑ or ⁂).
const rawChartText_extended = `
イィ yi イェ ye
ウァ wa* ウィ wi ウゥ wu* ウェ we ウォ wo
ウュ wyu
ヴァ va ヴィ vi ヴ vu⁑ ヴェ ve ヴォ vo
ヴャ vya ヴュ vyu ヴィェ vye ヴョ vyo
キェ kye
ギェ gye
クァ kwa クィ kwi クェ kwe クォ kwo
クヮ kwa
グァ gwa グィ gwi グェ gwe グォ gwo
グヮ gwa
シェ she
ジェ je
スィ si
ズィ zi
チェ che
ツァ tsa ツィ tsi ツェ tse ツォ tso
ツュ tsyu
ティ ti トゥ tu
テュ tyu
ディ di ドゥ du
デュ dyu
ニェ nye
ヒェ hye
ビェ bye
ピェ pye
ファ fa フィ fi フェ fe フォ fo
フャ fya フュ fyu フィェ fye フョ fyo
ホゥ hu
ミェ mye
リェ rye
ラ゜ la リ゜ li ル゜ lu レ゜ le ロ゜ lo
リ゜ャ lya リ゜ュ lyu リ゜ェ lye リ゜ョ lyo
ヷ va⁂ ヸ vi⁂ ヹ ve⁂ ヺ vo⁂
`.trim();

/**
 * Collects every match of `entryPattern` in `chartText`, in order of appearance.
 *
 * Implemented locally with `RegExp.prototype.exec` so that parsing the charts does not
 * depend on a String.prototype extension having been installed before this code runs.
 *
 * @param chartText The raw chart text to scan.
 * @param entryPattern A global (`g`) regex that never matches the empty string.
 * @returns One exec-result per chart entry; capture groups hold the entry's columns.
 * @throws Error if `entryPattern` lacks the `g` flag (the scan would never advance).
 */
function collectChartEntries(chartText: string, entryPattern: RegExp): RegExpExecArray[] {
	if (!entryPattern.global) {
		throw new Error("Chart entry pattern must have the 'g' flag.");
	}
	const entries: RegExpExecArray[] = [];
	let entry: RegExpExecArray | null;
	while ((entry = entryPattern.exec(chartText)) !== null) {
		entries.push(entry);
	}
	return entries;
}

// Groups: [1] hiragana, [2] katakana, [3] romaji.
// An entry ends at whitespace OR at the end of the text: the chart is trimmed, so requiring
// trailing whitespace would silently drop the final entry ("ぴょ ピョ pyo").
const mappingTexts = collectChartEntries(rawChartText, /(\S{1,2})\s(\S{1,2})\s([a-z]+)(?:\s|$)/g);
// Groups: [1] katakana, [2] romaji (the footnote marker is matched but not captured).
// Same end-of-text handling, otherwise the final entry ("ヺ vo") would be lost.
const extendedMappingTexts = collectChartEntries(rawChartText_extended, /(\S{1,3})\s([a-z]+)[*⁑⁂]?(?:\s|$)/g);
/**
 * One kana-to-romaji entry parsed from the romanization charts.
 *
 * Instances in this file are created from object literals rather than via `new`,
 * hence the definite-assignment markers instead of a constructor.
 */
export class Mapping {
	/** The full chart text that was matched for this entry. */
	rawText!: string;
	/** Hiragana spelling, or `null` for extended entries that only have a katakana form. */
	hiragana!: string | null;
	/** Katakana spelling. */
	katakana!: string;
	/** Romaji that replaces either kana spelling. */
	romaji!: string;
}
// Build the replacement table: first the basic chart (hiragana + katakana + romaji) ...
const mappings = mappingTexts.map(text=>{
	const [rawText, hiragana, katakana, romaji] = text;
	return {rawText, hiragana, katakana, romaji} as Mapping;
});
// ... then the extended chart, whose entries are katakana-only.
mappings.push(...extendedMappingTexts.map(text=>{
	const [rawText, katakana, romaji] = text;
	// `as Mapping` is kept because these entries deliberately carry a null hiragana form.
	return {rawText, hiragana: null, katakana, romaji} as Mapping;
}));

// also add the hiragana form for the one extended-katakana that has one
const vuMapping = mappings.find(a=>a.katakana === "ヴ");
if (vuMapping) { // guard: a chart without ヴ should not crash module load
	vuMapping.hiragana = "ゔ";
}

// Have the longest patterns considered first, since shorter patterns otherwise disrupt recognition
// of the larger ones. Merely reversing the chart order is not enough: in the reversed row
// "ヴォ ヴェ ヴ ヴィ ヴァ", ヴ would be replaced before ヴィ/ヴァ and turn them into "vuィ"/"vuァ".
// In the charts, hiragana and katakana spellings of an entry have the same length, so the katakana
// length is used as the key. Array.prototype.sort is stable, so ties keep their chart order.
mappings.sort((a, b)=>b.katakana.length - a.katakana.length);
/**
 * Converts the hiragana and katakana in `sourceText` to romaji.
 *
 * Works in two passes:
 *  1. every kana spelling in the mapping table is replaced by its romaji, in table order
 *     (so the table must list longer spellings before the shorter ones they contain);
 *  2. the gemination marker (っ/ッ) is replaced by the letter that follows it ("t" when that
 *     letter is "c", e.g. まっちゃ -> matcha), and the long-vowel marker (ー) by the letter
 *     that precedes it (e.g. ラーメン -> raamen).
 *
 * Characters that are not in the table are passed through unchanged. A marker with no Latin
 * letter to copy (e.g. a trailing っ, or a leading ー) is likewise left as-is.
 *
 * @param sourceText Text to convert.
 * @param mappingTable Replacement table to use; defaults to the module's built-in table.
 *   Entries with a null/empty kana spelling are skipped for that spelling.
 * @returns The converted text.
 */
export function ConvertHiraganaAndKatakanaToRomaji(
	sourceText: string,
	mappingTable: ReadonlyArray<{hiragana: string | null; katakana: string | null; romaji: string}> = mappings,
): string {
	let result = sourceText;
	for (const mapping of mappingTable) {
		for (const kana of [mapping.hiragana, mapping.katakana]) {
			// Katakana-only entries have a null hiragana; building a RegExp from null would
			// yield /null/ and rewrite the literal text "null".
			if (!kana) continue;
			// split/join is a literal replace-all: no regex escaping, no "$" replacement patterns.
			result = result.split(kana).join(mapping.romaji);
		}
	}

	const latinLetter = /^[a-z]$/i;
	// Markers are replaced one-for-one, so editing a character array in place keeps indices stable.
	const chars = result.split("");
	for (let i = 0; i < chars.length; i++) {
		const current = chars[i];
		// replace the gemination marker with the next consonant
		if (current === "ッ" || current === "っ") {
			const next = chars[i + 1];
			if (next !== undefined && latinLetter.test(next)) {
				chars[i] = next === "c" ? "t" : next; // "cch" is written "tch"
			}
		}
		// replace the long-vowel marker with the previous vowel
		else if (current === "ー") {
			const previous = chars[i - 1]; // may itself be a marker replaced earlier in this loop
			if (previous !== undefined && latinLetter.test(previous)) {
				chars[i] = previous;
			}
		}
	}
	return chars.join("");
}
// Polyfill for the custom "...".Matches function (installs it on String.prototype).
declare global {
	interface String {
		/** Returns the start index of every occurrence of `str` (overlapping occurrences included). */
		Matches(str: string): {index: number}[];
		/** Returns every match of the global (`g`) regex `regex`, in order. */
		Matches(regex: RegExp): RegExpMatchArray[];
	}
}

/**
 * Implementation of `String.prototype.Matches`.
 *
 * - String needle: finds every occurrence, including overlapping ones ("aaa" contains "aa" at
 *   0 and 1). An empty needle matches once at every position from 0 to the string's length.
 * - Regex needle: returns every match. The caller's regex object is not mutated and its current
 *   `lastIndex` is ignored; the scan always starts at the beginning of the string.
 *
 * @param strOrRegex The substring or global regex to search for.
 * @returns `{index}` records for a string needle, exec-results for a regex needle.
 * @throws Error if a regex without the `g` flag is passed.
 */
function Matches(this: string, str: string): {index: number}[];
function Matches(this: string, regex: RegExp): RegExpMatchArray[];
function Matches(this: string, strOrRegex: string | RegExp): {index: number}[] | RegExpMatchArray[] {
	// Normalizes the receiver to a primitive (it is a boxed String when called from sloppy-mode code).
	const text = String(this);

	if (typeof strOrRegex === "string") {
		const found: {index: number}[] = [];
		let searchFrom = 0;
		while (true) {
			const matchIndex = text.indexOf(strOrRegex, searchFrom);
			// `matchIndex < searchFrom` happens for an empty needle once searchFrom passes the end:
			// indexOf clamps the start position and would report the same index forever.
			if (matchIndex === -1 || matchIndex < searchFrom) break;
			found.push({index: matchIndex});
			searchFrom = matchIndex + 1;
		}
		return found;
	}

	if (!strOrRegex.global)
		throw new Error("Regex must have the 'g' flag added. (otherwise an infinite loop occurs)");

	// Scan with a private copy so the caller's lastIndex is neither honoured nor modified.
	const scanner = new RegExp(strOrRegex.source, strOrRegex.flags);
	const found: RegExpMatchArray[] = [];
	let match: RegExpExecArray | null;
	while ((match = scanner.exec(text)) !== null) {
		found.push(match);
		// A zero-length match does not advance lastIndex; step manually to avoid looping forever.
		if (match[0] === "") scanner.lastIndex++;
	}
	return found;
}
String.prototype.Matches = Matches;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment