Skip to content

Instantly share code, notes, and snippets.

@lancejpollard
Last active July 9, 2024 02:13
Show Gist options
  • Save lancejpollard/e2b75cc8eec01fee84086bfddf2eb21a to your computer and use it in GitHub Desktop.
Save lancejpollard/e2b75cc8eec01fee84086bfddf2eb21a to your computer and use it in GitHub Desktop.
Parse Tibetan Syllables (attempt)
// Lookup tables consumed by `splitSyllable`. Every check against them is an
// exact `Array.prototype.includes` match on one code point at a time.
// NOTE(review): some entries below look like multi-code-point sequences
// (e.g. a base letter followed by a combining mark); since `splitSyllable`
// compares single code points, such entries would never match — confirm.

// Letters accepted at the very start of a cluster, ahead of an optional
// superscript and/or root letter.
const PREFIXES = ['ག', 'ད', 'བ', 'མ', 'འ']

// Letters that may sit between a prefix and the root letter.
const SUPERSCRIPTS = ['ར', 'ལ', 'ས']

// Additional root letters, appended to the end of ROOT_LETTERS below.
const EXTENDED_ROOT_LETTERS = [
  'ཫ', 'ཬ',
  'ཁ༹', 'ག༹', 'ཕ༹', 'བ༹',
  'གྷ', 'ཛྷ',
  'ཊ', 'ཋ', 'ཌ', 'ཌྷ', 'ཎ',
  'དྷ', 'བྷ',
  'ཥ', 'ཀྵ',
]

// Candidates that are returned whole, without being split.
const SPECIAL = ['ༀ']

// Letters that can start (or follow a prefix in) a cluster: the base set
// followed by every entry of EXTENDED_ROOT_LETTERS, in that order.
const ROOT_LETTERS = [
  'ཀ', 'ཁ', 'ག', 'ང',
  'ཅ', 'ཆ', 'ཇ', 'ཉ',
  'ཏ', 'ཐ', 'ད', 'ན',
  'པ', 'ཕ', 'བ', 'མ',
  'ཙ', 'ཚ', 'ཛ', 'ཝ',
  'ཞ', 'ཟ', 'འ', 'ཡ',
  'ར', 'ལ', 'ཤ', 'ས',
  'ཧ', 'ཨ',
  ...EXTENDED_ROOT_LETTERS,
]

// Marks accepted directly after the root part of a cluster.
const SUBSCRIPTS = [
  'ྐ', 'ྑ', 'ྒ', 'ྒྷ',
  'ྔ', 'ྕ', 'ྖ', 'ྗ',
  'ྙ', 'ྚ', 'ྛ', 'ྜ',
  'ྜྷ', 'ྞ', 'ྟ', 'ྠ',
  'ྡ', 'ྡྷ', 'ྣ', 'ྤ',
  'ྥ', 'ྦ', 'ྦྷ', 'ྨ',
  'ྩ', 'ྪ', 'ྫ', 'ྫྷ',
  'ྭ', 'ྮ', 'ྯ', 'ྰ',
  'ྱ', 'ྲ', 'ླ', 'ྴ',
  'ྵ', 'ྶ', 'ྷ', 'ྸ',
  'ྐྵ', 'ྺ', 'ྻ', 'ྼ',
]

// Marks accepted after the root part or after a subscript.
const VOWEL_SIGNS = [
  'ཱ', 'ི', 'ཱི', 'ུ',
  'ཱུ', 'ྲྀ', 'ཷ', 'ླྀ',
  'ཹ', 'ྀ', 'ཱྀ',
]

// Letters accepted after the root part, a subscript or a vowel sign.
const SUFFIXES = ['ག', 'ན', 'བ', 'ད', 'མ', 'འ', 'ར', 'ང', 'ས', 'ལ']

// Letters accepted once, directly after a suffix; they close the cluster.
const POST_SUFFIXES = ['ད', 'ས']

// Code points that end the cluster in progress and are themselves left out
// of the output.
const IGNORED = [
  '༄',
  'ེ', 'ཻ', 'ོ', 'ཽ',
  'ཾ', 'ཿ', 'ྂ', 'ྃ',
  '྄', '྅', '྆', '྇',
  'ྈ', 'ྉ', 'ྊ', 'ྋ',
  'ྌ', 'ྍ', 'ྎ', 'ྏ',
  '\u0f89',
]
// Running counter used to number the diagnostics logged by splitSyllable.
let errorI = 1

/**
 * Splits `candidate` into clusters by walking it one code point at a time
 * through a small state machine:
 *
 *   new -> root -> subscript -> vowel -> suffix -> (post-suffix) -> new
 *
 * A cluster starts with a root letter, or with a prefix optionally followed
 * by a superscript and/or a root letter. It may then take one subscript, one
 * vowel sign, one suffix and one post-suffix, in that order, each optional.
 * A code point that does not fit the current state closes the cluster and is
 * re-read as the start of the next one. Code points listed in `IGNORED`
 * close the current cluster and are dropped.
 *
 * @param candidate Text to split. A value listed in `SPECIAL` is returned
 *   unchanged as a single-element array.
 * @returns The clusters in input order. Returns `[]` for an empty string,
 *   and also when a code point that is neither a prefix nor a root letter
 *   appears where a cluster must start; in that case one diagnostic line
 *   (counter, candidate, hex code points) is written with `console.log` and
 *   any clusters already collected are discarded.
 */
function splitSyllable(candidate: string): string[] {
  if (SPECIAL.includes(candidate)) {
    return [candidate]
  }

  type State = 'new' | 'root' | 'subscript' | 'vowel' | 'suffix'

  // Spreading iterates by code point, so astral characters stay intact.
  const parts = [...candidate]
  const output: string[] = []
  let current = ''
  let state: State = 'new'
  let i = 0

  // Emits the cluster built so far (if any) and returns to the start state.
  const flush = (): void => {
    if (current.length > 0) {
      output.push(current)
    }
    current = ''
    state = 'new'
  }

  while (i < parts.length) {
    const next = parts[i]
    if (next === undefined) {
      break
    }

    if (IGNORED.includes(next)) {
      flush()
      i += 1
      continue
    }

    switch (state) {
      case 'new': {
        if (PREFIXES.includes(next)) {
          const peek = parts[i + 1]
          const peek2 = parts[i + 2]
          if (peek !== undefined && SUPERSCRIPTS.includes(peek)) {
            if (peek2 !== undefined && ROOT_LETTERS.includes(peek2)) {
              // prefix + superscript + root
              current += next + peek + peek2
              i += 3
            } else {
              // prefix + superscript
              current += next + peek
              i += 2
            }
          } else if (peek !== undefined && ROOT_LETTERS.includes(peek)) {
            // prefix + root
            current += next + peek
            i += 2
          } else {
            // A lone prefix letter acts as the root of its own cluster.
            current += next
            i += 1
          }
          state = 'root'
        } else if (ROOT_LETTERS.includes(next)) {
          current += next
          i += 1
          state = 'root'
        } else {
          // Nothing can start a cluster here: report the candidate together
          // with its code points in hex and give up on the whole input.
          const hex: string[] = []
          for (const ch of candidate) {
            const codePoint = ch.codePointAt(0)
            if (codePoint !== undefined) {
              hex.push(codePoint.toString(16).padStart(4, '0'))
            }
          }
          console.log(errorI++, candidate + ' ' + JSON.stringify(hex))
          return []
        }
        break
      }
      case 'root': {
        if (SUBSCRIPTS.includes(next)) {
          current += next
          i += 1
          state = 'subscript'
        } else if (VOWEL_SIGNS.includes(next)) {
          current += next
          i += 1
          state = 'vowel'
        } else if (SUFFIXES.includes(next)) {
          current += next
          i += 1
          state = 'suffix'
        } else {
          // Not part of this cluster: close it and re-read `next` as a start.
          flush()
        }
        break
      }
      case 'subscript': {
        if (VOWEL_SIGNS.includes(next)) {
          current += next
          i += 1
          state = 'vowel'
        } else if (SUFFIXES.includes(next)) {
          current += next
          i += 1
          state = 'suffix'
        } else {
          flush()
        }
        break
      }
      case 'vowel': {
        if (SUFFIXES.includes(next)) {
          current += next
          i += 1
          state = 'suffix'
        } else {
          flush()
        }
        break
      }
      case 'suffix': {
        if (POST_SUFFIXES.includes(next)) {
          // A post-suffix is always the last element of a cluster.
          current += next
          i += 1
        }
        // Either way the cluster ends here; without a post-suffix `next` is
        // left in place and re-read as the start of the next cluster.
        flush()
        break
      }
    }
  }

  flush()
  return output
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment