Last active
August 29, 2015 14:05
-
-
Save qur2/96ce791d08c23d02f7f1 to your computer and use it in GitHub Desktop.
Javascript handling of combining diacritic characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Lookup table: combining diacritical mark -> { base letter -> precomposed character }.
// Only the (mark, letter) pairs listed here are composed.
//
// TODO: complete the mapping. Marks that still have no entry:
//   combining overline (U+0305), hook above (U+0309), vertical line above (U+030D),
//   double vertical line above (U+030E), double grave accent (U+030F),
//   candrabindu (U+0310), inverted breve (U+0311), turned comma above (U+0312),
//   comma above (U+0313), reversed comma above (U+0314), comma above right (U+0315),
//   grave accent below (U+0316).
var combiningDiacriticMap = {
  // combining grave accent
  '\u0300': {
    'A': '\u00c0', 'a': '\u00e0',
    'E': '\u00c8', 'e': '\u00e8',
    'I': '\u00cc', 'i': '\u00ec',
    'O': '\u00d2', 'o': '\u00f2',
    'U': '\u00d9', 'u': '\u00f9'
  },
  // combining acute accent
  '\u0301': {
    'A': '\u00c1', 'a': '\u00e1',
    'E': '\u00c9', 'e': '\u00e9',
    'I': '\u00cd', 'i': '\u00ed',
    'O': '\u00d3', 'o': '\u00f3',
    'U': '\u00da', 'u': '\u00fa',
    'Y': '\u00dd', 'y': '\u00fd',
    'C': '\u0106', 'c': '\u0107',
    'L': '\u0139', 'l': '\u013a',
    'N': '\u0143', 'n': '\u0144',
    'R': '\u0154', 'r': '\u0155',
    'S': '\u015a', 's': '\u015b',
    'Z': '\u0179', 'z': '\u017a'
  },
  // combining circumflex accent
  '\u0302': {
    'A': '\u00c2', 'a': '\u00e2',
    'E': '\u00ca', 'e': '\u00ea',
    'I': '\u00ce', 'i': '\u00ee',
    'O': '\u00d4', 'o': '\u00f4',
    'U': '\u00db', 'u': '\u00fb',
    'C': '\u0108', 'c': '\u0109',
    'G': '\u011c', 'g': '\u011d',
    'H': '\u0124', 'h': '\u0125',
    'J': '\u0134', 'j': '\u0135',
    'S': '\u015c', 's': '\u015d',
    'W': '\u0174', 'w': '\u0175',
    'Y': '\u0176', 'y': '\u0177'
  },
  // combining tilde
  '\u0303': {
    'A': '\u00c3', 'a': '\u00e3',
    'N': '\u00d1', 'n': '\u00f1',
    'O': '\u00d5', 'o': '\u00f5',
    'I': '\u0128', 'i': '\u0129',
    'U': '\u0168', 'u': '\u0169'
  },
  // combining macron
  '\u0304': {
    'A': '\u0100', 'a': '\u0101',
    'E': '\u0112', 'e': '\u0113',
    'I': '\u012a', 'i': '\u012b',
    'O': '\u014c', 'o': '\u014d',
    'U': '\u016a', 'u': '\u016b'
  },
  // combining breve
  '\u0306': {
    'A': '\u0102', 'a': '\u0103',
    'E': '\u0114', 'e': '\u0115',
    'G': '\u011e', 'g': '\u011f',
    'I': '\u012c', 'i': '\u012d',
    'O': '\u014e', 'o': '\u014f',
    'U': '\u016c', 'u': '\u016d'
  },
  // combining dot above
  '\u0307': {
    'C': '\u010a', 'c': '\u010b',
    'E': '\u0116', 'e': '\u0117',
    'G': '\u0120', 'g': '\u0121',
    // capital only: there is no entry for lowercase 'i' + dot above
    'I': '\u0130',
    'Z': '\u017b', 'z': '\u017c'
  },
  // combining diaeresis
  '\u0308': {
    'A': '\u00c4', 'a': '\u00e4',
    'E': '\u00cb', 'e': '\u00eb',
    'I': '\u00cf', 'i': '\u00ef',
    'O': '\u00d6', 'o': '\u00f6',
    'U': '\u00dc', 'u': '\u00fc',
    'Y': '\u0178', 'y': '\u00ff'
  },
  // combining ring above
  '\u030a': {
    'A': '\u00c5', 'a': '\u00e5',
    'U': '\u016e', 'u': '\u016f'
  },
  // combining double acute accent
  '\u030b': {
    'O': '\u0150', 'o': '\u0151',
    'U': '\u0170', 'u': '\u0171'
  },
  // combining caron
  '\u030c': {
    'C': '\u010c', 'c': '\u010d',
    'D': '\u010e', 'd': '\u010f',
    'E': '\u011a', 'e': '\u011b',
    'L': '\u013d', 'l': '\u013e',
    'N': '\u0147', 'n': '\u0148',
    'R': '\u0158', 'r': '\u0159',
    'S': '\u0160', 's': '\u0161',
    'T': '\u0164', 't': '\u0165',
    'Z': '\u017d', 'z': '\u017e'
  },
  // combining cedilla
  '\u0327': {
    'C': '\u00c7', 'c': '\u00e7',
    'G': '\u0122', 'g': '\u0123',
    'K': '\u0136', 'k': '\u0137',
    'L': '\u013b', 'l': '\u013c',
    'N': '\u0145', 'n': '\u0146',
    'R': '\u0156', 'r': '\u0157',
    'S': '\u015e', 's': '\u015f',
    'T': '\u0162', 't': '\u0163'
  },
  // combining ogonek
  '\u0328': {
    'A': '\u0104', 'a': '\u0105',
    'E': '\u0118', 'e': '\u0119',
    'I': '\u012e', 'i': '\u012f',
    'U': '\u0172', 'u': '\u0173'
  },
  // combining short stroke overlay
  '\u0335': {
    'L': '\u0141', 'l': '\u0142'
  },
  // combining long stroke overlay
  '\u0336': {
    'L': '\u0141', 'l': '\u0142'
  },
  // combining short solidus overlay
  '\u0337': {
    'L': '\u0141', 'l': '\u0142',
    'O': '\u00d8', 'o': '\u00f8'
  },
  // combining long solidus overlay
  '\u0338': {
    'L': '\u0141', 'l': '\u0142',
    'O': '\u00d8', 'o': '\u00f8'
  }
};
/**
 * Looks up the precomposed character for a (base character, combining mark) pair
 * in combiningDiacriticMap.
 *
 * @param {string} chr - Base character.
 * @param {string} diacritic - Combining diacritical mark.
 * @returns {string} The precomposed character, or `chr` unchanged when the
 *   table has no entry for the pair.
 */
function convertCombiningDiacritic(chr, diacritic) {
  var hasOwn = Object.prototype.hasOwnProperty;
  // Own-property checks only: a plain lookup would also find members inherited
  // from Object.prototype (e.g. 'constructor') and return a non-string.
  var letters = hasOwn.call(combiningDiacriticMap, diacritic)
    ? combiningDiacriticMap[diacritic]
    : null;
  if (!letters || !hasOwn.call(letters, chr)) {
    return chr;
  }
  return letters[chr] || chr;
}
/**
 * Converts each (character, combining diacritic) pair in a string into a single
 * precomposed character.
 *
 * The string is scanned one UTF-16 code unit at a time. Whenever the unit that
 * follows the current one lies in U+0300..U+036F (both ends inclusive), the
 * pair is handed to convertCombiningDiacritic and the mark is consumed. If
 * there is no combined entity for the pair, the combining diacritic is dropped
 * and the base character is kept as is.
 *
 * A mark that does not directly follow a base character (a mark at the start
 * of the string, or the second of two stacked marks) is itself handled as a
 * base character.
 *
 * @param {string} str - Text that may contain combining diacritical marks.
 * @returns {string} The text with the recognised pairs composed.
 */
function convertCombiningDiacritics(str) {
  if (!str.length) return str;
  var text = String(str);
  var chars = [];
  var i = 0;
  while (i < text.length) {
    var base = text.charAt(i);
    // charAt yields '' past the end, which never falls inside the range below.
    var next = text.charAt(i + 1);
    if (next >= '\u0300' && next <= '\u036f') {
      chars.push(convertCombiningDiacritic(base, next));
      // Base and mark are both consumed.
      i += 2;
    } else {
      chars.push(base);
      i += 1;
    }
  }
  return chars.join('');
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In essence, it's trying to do what the following Python snippet does:
unicodedata.normalize('NFC', string)
Bottom line: do it server side.