Last active
December 22, 2015 04:09
-
-
Save brucemcpherson/6415587 to your computer and use it in GitHub Desktop.
pinyin converter custom vba functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'gistThat@mcpher.com :do not modify this line - see ramblings.mcpher.com for details: updated on 02/09/2013 19:06:03 : from manifest:6415598 gist https://gist.github.com/brucemcpherson/6415587/raw/pinyin.vba | |
Option Explicit | |
Private Const NOTONE = 5 | |
Private Const COLORTYPENONE = 0 | |
Private Const COLORTYPEHTML = 1 | |
' v1.0 | |
' | |
' accents a pinyin string with optional html colorizing | |
' @param {string} inputString the string to be converted | |
' @param {number=} optColoringType - 0 = no coloring(default), 1 = return html with coloring options given | |
' @param {string=} optColors - a comma separated string of colors to use for tones 1-5 (5=no tone)- a default will be used if omitted | |
' return {string} the converted string (including colorization if required) | |
' | |
Public Function pinyinToToneMarks(inputString As String, _ | |
Optional optColoringType As Long = COLORTYPENONE, Optional optColors As String = vbNullString) As String | |
pinyinToToneMarks = pinyin(inputString, optColoringType, optColors, False) | |
End Function | |
' | |
' replaces accents in pinyin string with tone numbers and adds optional html colorizing | |
' @param {string} inputString the string to be converted | |
' @param {number} optColoringType - 0 = no coloring(default), 1 = return html with coloring options given | |
' @param {string} optColors - a comma separated string of colors to use for tones 1-5 (5=no tone)- a default will be used if omitted | |
' return {string} the converted string (including colorization if required) | |
' | |
Public Function pinyinToToneNumbers(inputString As String, _ | |
Optional optColoringType As Long = COLORTYPENONE, Optional optColors As String = vbNullString) As String | |
pinyinToToneNumbers = pinyin(inputString, optColoringType, optColors, True) | |
End Function | |
Private Function pinyin(inputString As String, _ | |
Optional optColoringType As Long = COLORTYPENONE, Optional optColors As String = vbNullString, _ | |
Optional optReversePinyin As Boolean = False) | |
Dim colors As Variant | |
If (optColors = vbNullString) Then | |
colors = Array("red", "orange", "green", "blue", "gray") | |
Else | |
colors = Split(optColors, ",") | |
End If | |
If (length(colors) <> 5) Then throw ("you need 5 colors") | |
If (optReversePinyin) Then | |
pinyin = pinyinReverse(inputString, colors, optColoringType) | |
Else | |
pinyin = pinyinRegular(inputString, colors, optColoringType) | |
End If | |
End Function | |
' convert from tone numbers to marked up pinyin | |
Private Function pinyinRegular(originalText As String, colors As Variant, coloringType As Long) As String | |
Dim matches As MatchCollection, match As match, consumedPos As Long, allWords As String, _ | |
r As RegExp, text As String, catchup As String, t As String, tone As Long, _ | |
word As String, dips As Variant, d As Long, k As String, _ | |
vowels As Variant, transform As Variant, i As Long | |
consumedPos = 0 | |
allWords = vbNullString | |
Set r = setUpSyllableRegex() | |
dips = getDips() | |
vowels = getVowels() | |
transform = getTransform() | |
' v hack - v equvalent to ü (I think) | |
text = replaceAll(originalText, "v", "ü") | |
' this regex is a table of all valid syllables | |
Set matches = r.Execute(text) | |
For Each match In matches | |
' this is a catch up separators and other stuff | |
catchup = Mid(text, consumedPos + 1, match.FirstIndex - consumedPos) | |
consumedPos = match.FirstIndex + match.length | |
' lookahead for a tone | |
t = Mid(text, consumedPos + 1, 1) | |
tone = NOTONE | |
' if there's a tone, then we want to remove it from the addbehind buffer (consume it) | |
If (t > "0" And t < CStr(NOTONE)) Then | |
consumedPos = consumedPos + 1 | |
tone = Int(t) | |
End If | |
' if anything other than separators then its an unknown syllable | |
If (Len(catchup) > 0 And replaceAll(catchup, "\W", "XX") = catchup) Then | |
throw ("unknown syllable " & catchup & " before " & match.Value) | |
End If | |
' identify where character subs need to happen in vowel sets | |
word = match.Value | |
For d = LBound(dips) To UBound(dips) | |
k = dips(d)(LBound(dips(d))) | |
If (InStr(1, word, k, vbTextCompare) > 0) Then | |
word = replace(word, k, CStr(dips(d)(UBound(dips(d))))) | |
Exit For | |
End If | |
Next d | |
' now replace them | |
For i = LBound(vowels) To UBound(vowels) | |
word = replace(word, CStr(vowels(i)), CStr(transform(tone - 1)(i))) | |
Next i | |
' colorize | |
If (coloringType = COLORTYPEHTML And tone <> NOTONE And Len(word) > 0) Then | |
word = "<font color=" & colors(tone - 1) & ">" + word + "</font>" | |
End If | |
' add the finished product | |
allWords = allWords & catchup & word | |
Next match | |
' finish it up | |
If (consumedPos < Len(text)) Then allWords = allWords + Mid(text, consumedPos + 1) | |
pinyinRegular = allWords | |
End Function | |
' convert from marked up pinyin to tone numbers | |
Private Function pinyinReverse(text As String, colors As Variant, coloringType As Long) As String | |
Dim r As String, tones As Collection, vowelPos As Collection, flatten As String, _ | |
i As Long, transform As Variant, newText As String, _ | |
matches As MatchCollection, match As match, pat As RegExp, pos As Long, _ | |
v As String, vowels As Variant, re As RegExp, consumedPos As Long, allWords As String, _ | |
catchup As String, t As String, tone As Long, _ | |
word As String, dips As Variant, d As Long, k As String | |
r = vbNullString | |
flatten = vbNullString | |
transform = getTransform() | |
vowels = getVowels() | |
newText = text | |
Set tones = New Collection | |
Set vowelPos = New Collection | |
Set re = setUpSyllableRegex() | |
' create a regex from the transform vowel table | |
For i = LBound(transform) To UBound(transform) | |
If (i <> NOTONE - 1) Then | |
If (r <> vbNullString) Then r = r & "|" | |
r = r & join(transform(i), "|") | |
flatten = flatten & join(transform(i), "") | |
End If | |
Next i | |
' we'll transform the markers to regular vowels and remember their tone numbers | |
Set pat = New RegExp | |
With pat | |
.Global = True | |
.Pattern = r | |
Set matches = .Execute(text) | |
End With | |
' look at each syllable | |
For Each match In matches | |
' should find vowel cluster | |
pos = InStr(1, flatten, match.Value) | |
If (pos < 1) Then throw ("could not find " & match.Value & " equivalent") | |
' this is the substitute vowel | |
v = Mid(vowels((pos - 1) Mod length(transform(0))), 1, 1) | |
newText = Mid(newText, 1, match.FirstIndex) & v & Mid(newText, 1 + match.FirstIndex + Len(v)) | |
''newText = newText.substr(0, match.index) + v + newText.substr(match.index+v.length); | |
' remember where this happened and what the tone was | |
k = "pos" & match.FirstIndex | |
tones.Add 1 + Int((pos - 1) / length(transform(0))), k | |
vowelPos.Add v, k | |
Next match | |
' now we can use the logic for parsing the other direction | |
Set matches = re.Execute(newText) | |
For Each match In matches | |
' this is a catch up separators and other stuff | |
catchup = Mid(text, consumedPos + 1, match.FirstIndex - consumedPos) | |
consumedPos = match.FirstIndex + match.length | |
' figure out the tone | |
tone = NOTONE | |
For i = match.FirstIndex To match.FirstIndex + Len(match.Value) - 1 | |
k = "pos" & i | |
If (exists(tones, k)) Then | |
tone = tones(k) | |
If (vowelPos(k) <> Mid(newText, i + 1, 1)) Then | |
throw ("vowel mismatch at " & match.Value & ":" & i & ":" & vowelPos(k)) | |
End If | |
End If | |
Next i | |
' if there is a tone we need to insert it | |
allWords = allWords & catchup & match.Value | |
If (tone <> NOTONE) Then allWords = allWords & tone | |
Next match | |
' finish it up | |
If (consumedPos < Len(text)) Then allWords = allWords + Mid(text, consumedPos + 1) | |
pinyinReverse = allWords | |
Set tones = Nothing | |
Set vowelPos = Nothing | |
End Function | |
Private Function exists(co As Collection, k As String) As Boolean | |
Dim A As Variant | |
On Error GoTo crapped | |
A = co(k) | |
exists = True | |
Exit Function | |
crapped: | |
exists = False | |
Exit Function | |
End Function | |
Private Sub throw(message) | |
Err.Raise vbObjectError + 22000, "pinyin convertor", message | |
End Sub | |
Private Function cond(test As Boolean, ifTrue As Variant, ifFalse As Variant) As Variant | |
If (test) Then | |
cond = ifTrue | |
Else | |
cond = ifFalse | |
End If | |
End Function | |
Private Function getSylReg() As String | |
getSylReg = "huan(?=gu|ge)|yuan(?=gu|ge)|jian(?=gu|ge)|" & _ | |
"zhuang|chuang|shuang|niang|liang|guang|kuang|huang|zhang|zheng|zhong|zhuai|zhuan|" & _ | |
"chang|cheng|chong|chuai|chuan|shang|sheng|shuai|shuan|jiang|jiong|qiang|qiong|xiang|xiong|" & _ | |
"fan(?=gu|ge)|gan(?=gu|ge)|wan(?=gu|ge)|kan(?=gu|ge)|jin(?=gu|ge)|" & _ | |
"uang|ueng|iang|iong|bang|beng|biao|bian|bing|pang|peng|piao|pian|ping|mang|meng|miao|mian|" & _ | |
"ming|fang|feng|dang|deng|dong|duan|diao|dian|ding|tang|teng|tong|tuan|tiao|tian|ting|nang|" & _ | |
"neng|nong|nuan|niao|nian|ning|lang|leng|long|luan|liao|lian|ling|gang|geng|gong|guai|guan|" & _ | |
"kang|keng|kong|kuai|kuan|hang|heng|hong|huai|huan|zang|zeng|zong|zuan|cang|ceng|cong|cuan|" & _ | |
"sang|seng|song|suan|zhai|zhei|zhao|zhou|zhan|zhen|zhua|zhuo|zhui|zhun|chai|chao|chou|chan|" & _ | |
"chen|chua|chuo|chui|chun|shai|shei|shao|shou|shan|shen|shua|shuo|shui|shun|rang|reng|rong|" & _ | |
"ruan|jiao|jian|jing|juan|qiao|qian|qing|quan|xiao|xian|xing|xuan|wang|weng|yang|ying|yong|" & _ | |
"yuan|" & _ | |
"qu(?=n)|ke(?=n)|" & _ | |
"ang|eng|ong|uai|uan|iao|ian|ing|üan|bai|bei|bao|ban|ben|bie|bin|pai|pei|pao|pou|pan|" & _ | |
"pen|pie|pin|mai|mei|mao|mou|man|men|mie|miu|min|fan|fei|fou|fen|gan|dai|dei|dao|dou|dan|den|" & _ | |
"duo|dui|dun|dia|die|diu|tai|tei|tao|tou|tan|tuo|tui|tun|tie|nai|nei|nao|nou|nan|nen|nuo|nun|" & _ | |
"nie|niu|nin|nüe|lai|lei|lao|lou|lan|luo|lun|lia|lie|liu|lin|lüe|gai|gei|gao|gou|gen|gua|" & _ | |
"guo|gui|gun|kai|kei|kao|kou|kan|ken|kua|kuo|kui|kun|hai|hei|hao|hou|han|hen|hua|huo|hui|hun|" & _ | |
"zai|zei|zao|zou|zan|zen|zuo|zui|zun|cai|cao|cou|can|cen|cuo|cui|cun|sai|sao|sou|san|sen|suo|" & _ | |
"sui|sun|zha|zhe|zhu|zhi|cha|che|chu|chi|sha|she|shu|shi|rao|rou|ran|ren|rua|ruo|rui|run|jia|" & _ | |
"jie|jiu|jin|jue|jun|qia|qie|qiu|qin|que|qun|xia|xie|xiu|xin|xue|xun|ang|eng|wai|wei|wan|wen|" & _ | |
"yao|you|yan|yin|yue|yun|ai|ei|ao|ou|an|en|ua|uo|ui|un|ia|ie|iu|in|üe|ün|ba|bo|bu|bi|pa|po|pu|" & _ | |
"pi|ma|mo|me|mu|mi|fa|fo|fu|da|de|du|di|ta|te|tu|ti|na|ne|nu|ni|nü|la|le|lu|li|lü|ga|ge|gu|ka|" & _ | |
"ke|ku|ha|he|hu|za|ze|zu|zi|ca|ce|cu|ci|sa|se|su|si|re|ru|ri|ji|ju|qi|qu|xi|xu|ai|ei|ao|ou|an|" & _ | |
"en|wu|wa|wo|yi|ya|ye|yu|er|a|o|e|u|i|ü|a|o|e" | |
End Function | |
Private Function getDips() As Variant | |
getDips = Array( _ | |
Array("iao", "ia*o"), Array("uai", "ua*i"), Array("ai", "a*i"), Array("ao", "a*o"), _ | |
Array("ei", "e*i"), Array("ia", "ia*"), Array(" ie", "ie*"), Array("io", "io*"), _ | |
Array("iu", "iu*"), Array("Ai", "A*i"), Array("Ao", "A*o"), Array("Ei", "E*i"), _ | |
Array("ou", "o*u"), Array("ua", "ua*"), Array("ue", "ue*"), Array("ui", "ui*"), _ | |
Array("uo", "uo*"), Array("ve", "üe*"), Array("üe", "üe*"), Array("Ou", "O*u"), _ | |
Array("a", "a*"), Array("e", "e*"), Array("i", "i*"), Array("o", "o*"), _ | |
Array("u", "u*"), Array("ü", "ü*"), Array("v", "v*"), Array("A", "A*"), _ | |
Array("E", "E*"), Array("O", "O*")) | |
End Function | |
Private Function getVowels() As Variant | |
getVowels = Array("a*", "e*", "i*", "o*", "u*", "ü*", "v*", "A*", "E*", "O*") | |
End Function | |
Private Function getTransform() As Variant | |
getTransform = Array( _ | |
Array(ChrW(&H101), ChrW(&H113), ChrW(&H12B), ChrW(&H14D), ChrW(&H16B), ChrW(&H1D6), ChrW(&H1D6), ChrW(&H100), ChrW(&H112), ChrW(&H12A), ChrW(&H14C)), _ | |
Array(ChrW(&HE1), ChrW(&HE9), ChrW(&HED), ChrW(&HF3), ChrW(&HFA), ChrW(&H1D8), ChrW(&H1D8), ChrW(&HC1), ChrW(&HC9), ChrW(&HCD), ChrW(&HD3)), _ | |
Array(ChrW(&H1CE), ChrW(&H11B), ChrW(&H1D0), ChrW(&H1D2), ChrW(&H1D4), ChrW(&H1DA), ChrW(&H1DA), ChrW(&H1CD), ChrW(&H11A), ChrW(&H1CF), ChrW(&H1D1)), _ | |
Array(ChrW(&HE0), ChrW(&HE8), ChrW(&HEC), ChrW(&HF2), ChrW(&HF9), ChrW(&H1DC), ChrW(&H1DC), ChrW(&HC0), ChrW(&HC8), ChrW(&HCC), ChrW(&HD2)), _ | |
Array(ChrW(&H61), ChrW(&H65), ChrW(&H69), ChrW(&H6F), ChrW(&H75), ChrW(&HFC), ChrW(&H76), ChrW(&H41), ChrW(&H45), ChrW(&H49), ChrW(&H4F))) | |
End Function | |
Private Function replaceAll(str As String, find As String, replace As String) As String | |
Dim r As RegExp | |
Set r = New RegExp | |
With r | |
.Global = True | |
.Pattern = find | |
End With | |
replaceAll = r.replace(str, replace) | |
End Function | |
Private Function setUpSyllableRegex() As RegExp | |
Dim r As RegExp | |
Set r = New RegExp | |
With r | |
.IgnoreCase = True | |
.Global = True | |
.Pattern = getSylReg() | |
End With | |
Set setUpSyllableRegex = r | |
End Function | |
Private Function join(A As Variant, d As String) As String | |
Dim s As String, i As Long | |
s = vbNullString | |
For i = LBound(A) To UBound(A) | |
If s <> vbNullString Then s = s & d | |
s = s & A(i) | |
Next i | |
join = s | |
End Function | |
Private Function length(A As Variant) As Long | |
length = UBound(A) - LBound(A) + 1 | |
End Function |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment