Skip to content

Instantly share code, notes, and snippets.

@qur2
Last active August 29, 2015 14:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save qur2/96ce791d08c23d02f7f1 to your computer and use it in GitHub Desktop.
Save qur2/96ce791d08c23d02f7f1 to your computer and use it in GitHub Desktop.
Javascript handling of combining diacritic characters
/**
 * Lookup table used to fold a (base letter, combining mark) pair into a
 * single precomposed character.
 *
 * Outer keys are combining marks from the Combining Diacritical Marks block;
 * inner keys are base letters and inner values are the precomposed result.
 * Marks that are still commented out below have no table yet.
 *
 * TODO: complete the mapping
 */
var combiningDiacriticMap = {
  // combining grave accent
  '\u0300': {
    'A': '\u00c0', 'a': '\u00e0',
    'E': '\u00c8', 'e': '\u00e8',
    'I': '\u00cc', 'i': '\u00ec',
    'O': '\u00d2', 'o': '\u00f2',
    'U': '\u00d9', 'u': '\u00f9'
  },
  // combining acute accent
  '\u0301': {
    'A': '\u00c1', 'a': '\u00e1',
    'E': '\u00c9', 'e': '\u00e9',
    'I': '\u00cd', 'i': '\u00ed',
    'O': '\u00d3', 'o': '\u00f3',
    'U': '\u00da', 'u': '\u00fa',
    'Y': '\u00dd', 'y': '\u00fd',
    'C': '\u0106', 'c': '\u0107'
  },
  // combining circumflex accent
  '\u0302': {
    'A': '\u00c2', 'a': '\u00e2',
    'E': '\u00ca', 'e': '\u00ea',
    'I': '\u00ce', 'i': '\u00ee',
    'O': '\u00d4', 'o': '\u00f4',
    'U': '\u00db', 'u': '\u00fb',
    'C': '\u0108', 'c': '\u0109',
    'J': '\u0134', 'j': '\u0135'
  },
  // combining tilde
  '\u0303': {
    'A': '\u00c3', 'a': '\u00e3',
    'N': '\u00d1', 'n': '\u00f1',
    'O': '\u00d5', 'o': '\u00f5',
    'I': '\u0128', 'i': '\u0129',
    'U': '\u0168', 'u': '\u0169'
  },
  // combining macron
  // '\u0304': {
  // },
  // combining overline
  // '\u0305': {
  // },
  // combining breve
  // '\u0306': {
  // },
  // combining dot above
  // '\u0307': {
  // },
  // combining diaeresis
  '\u0308': {
    'A': '\u00c4', 'a': '\u00e4',
    'E': '\u00cb', 'e': '\u00eb',
    'I': '\u00cf', 'i': '\u00ef',
    'O': '\u00d6', 'o': '\u00f6',
    'U': '\u00dc', 'u': '\u00fc',
    'Y': '\u0178', 'y': '\u00ff'
  },
  // combining hook above
  // '\u0309': {
  // },
  // combining ring above
  '\u030a': {
    'A': '\u00c5', 'a': '\u00e5',
    'U': '\u016e', 'u': '\u016f'
  },
  // combining double acute accent
  '\u030b': {
    'O': '\u0150', 'o': '\u0151',
    'U': '\u0170', 'u': '\u0171'
  },
  // combining caron
  '\u030c': {
    'C': '\u010c', 'c': '\u010d',
    'D': '\u010e', 'd': '\u010f',
    'E': '\u011a', 'e': '\u011b',
    'L': '\u013d', 'l': '\u013e',
    'N': '\u0147', 'n': '\u0148',
    'S': '\u0160', 's': '\u0161',
    'T': '\u0164', 't': '\u0165',
    'Z': '\u017d', 'z': '\u017e'
  },
  // combining vertical line above
  // '\u030d': {
  // },
  // combining double vertical line above
  // '\u030e': {
  // },
  // combining double grave accent
  // '\u030f': {
  // },
  // combining candrabindu
  // '\u0310': {
  // },
  // combining inverted breve
  // '\u0311': {
  // },
  // combining turned comma above
  // '\u0312': {
  // },
  // combining comma above
  // '\u0313': {
  // },
  // combining reversed comma above
  // '\u0314': {
  // },
  // combining comma above right
  // '\u0315': {
  // },
  // combining grave accent below
  // '\u0316': {
  // },
  // combining ogonek
  '\u0328': {
    'A': '\u0104', 'a': '\u0105',
    'E': '\u0118', 'e': '\u0119',
    'I': '\u012e', 'i': '\u012f',
    'U': '\u0172', 'u': '\u0173'
  },
  // The four overlay marks below map onto the stroked letters as a
  // convenience.
  // combining short stroke overlay
  '\u0335': {
    'L': '\u0141', 'l': '\u0142'
  },
  // combining long stroke overlay
  '\u0336': {
    'L': '\u0141', 'l': '\u0142'
  },
  // combining short solidus overlay
  '\u0337': {
    'L': '\u0141', 'l': '\u0142',
    'O': '\u00d8', 'o': '\u00f8'
  },
  // combining long solidus overlay
  '\u0338': {
    'L': '\u0141', 'l': '\u0142',
    'O': '\u00d8', 'o': '\u00f8'
  }
};
/**
 * Looks up the precomposed form of a base character followed by a combining
 * mark in `combiningDiacriticMap`.
 *
 * @param {string} chr - The base character.
 * @param {string} diacritic - The combining mark that follows it.
 * @returns {string} The precomposed character when the map has a truthy entry
 *   for the pair; otherwise `chr` unchanged.
 */
function convertCombiningDiacritic(chr, diacritic) {
  var table = combiningDiacriticMap[diacritic];
  // No table for this mark: nothing to compose with.
  if (!table) {
    return chr;
  }
  var composed = table[chr];
  if (composed) {
    return composed;
  }
  return chr;
}
/**
 * Converts each (char, combining diacritic) pair in a string to a single
 * precomposed character.
 *
 * The string is scanned one UTF-16 code unit at a time. Every code unit in
 * the Combining Diacritical Marks range U+0300..U+036F (both ends inclusive)
 * that directly follows a character is folded into that character through
 * `convertCombiningDiacritic`. If there is no combined entity for the pair,
 * the combining diacritic character is dropped and the base is kept.
 * Several marks in a row are folded onto the same base one after another.
 * A mark at the very start of the string has no base and is kept as is.
 *
 * NOTE: the built-in `String.prototype.normalize('NFC')` performs standard
 * canonical composition; unlike it, this function discards marks it cannot
 * compose.
 *
 * @param {string} str - The text to process.
 * @returns {string} The text with combining marks folded into their bases.
 */
function convertCombiningDiacritics(str) {
  if (!str.length) return str;
  var chars = [];
  var len = str.length;
  var i = 0;
  while (i < len) {
    var current = str.charAt(i++);
    // Consume every combining mark that directly follows the character.
    while (i < len) {
      var next = str.charAt(i);
      // Inclusive bounds: U+0300 (grave) and U+036F are marks too.
      if (next < '\u0300' || next > '\u036f') break;
      current = convertCombiningDiacritic(current, next);
      i++;
    }
    chars.push(current);
  }
  return chars.join('');
}
@qur2
Copy link
Author

qur2 commented Sep 1, 2014

In essence, this tries to reproduce the following Python snippet: unicodedata.normalize('NFC', string).
Bottom line: do it server-side.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment