Created
October 15, 2019 20:46
-
-
Save pserwylo/457877a4a7bfa52a237cbeba9ad07b53 to your computer and use it in GitHub Desktop.
Java Unicode to ASCII conversion.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.text.Normalizer; | |
class UnicodeToAsciiTruncation {

    /** Largest {@code char} value that is still inside the ASCII range (0-127). */
    private static final char MAX_ASCII = 127;

    /**
     * Does a best-effort job of converting Unicode to meaningful ASCII characters, by normalizing
     * and then discarding every character that does not fall within the ASCII range (0-127).
     *
     * <p>Useful for when you have Unicode data that needs to be sent to a legacy device/service
     * which only accepts ASCII (e.g. some specialised printers).
     *
     * <p>This is best effort only, as clearly only a small subset of Unicode can meaningfully be
     * converted to ASCII. However, for data that is predominantly in Latin-based scripts, this
     * will do a pretty good job at producing readable text.
     *
     * <p>First, the input is normalized to NFKD so that every character which can be expressed
     * as several characters is in its decomposed form. For example, the single character
     * "LATIN CAPITAL LETTER A WITH ACUTE" and the pair "LATIN CAPITAL LETTER A" +
     * "COMBINING ACUTE ACCENT" will both be converted to the two characters
     * "LATIN CAPITAL LETTER A" and "COMBINING ACUTE ACCENT". The normalized string therefore
     * potentially contains a mix of characters that are valid ASCII (e.g.
     * "LATIN CAPITAL LETTER A") and others which are not (e.g. "COMBINING ACUTE ACCENT").
     * Those outside of the ASCII range are discarded, which leaves just
     * "LATIN CAPITAL LETTER A" in the above example.
     *
     * <p>Characters with no ASCII decomposition are dropped entirely, so the result may be
     * shorter than the input, or even empty.
     *
     * @param string the text to convert; must not be {@code null}
     * @return the ASCII-only characters that remain after NFKD normalization, in their
     *         original order
     * @throws NullPointerException if {@code string} is {@code null}
     */
    static String truncateToAscii(String string) {
        String normalised = Normalizer.normalize(string, Normalizer.Form.NFKD);
        // The result can never be longer than the normalized text, so presize the buffer.
        StringBuilder sb = new StringBuilder(normalised.length());
        for (int i = 0; i < normalised.length(); i++) {
            char c = normalised.charAt(i);
            // Surrogate halves are also above 127, so supplementary characters are dropped too.
            if (c <= MAX_ASCII) {
                sb.append(c);
            }
        }
        return sb.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.text.Normalizer; | |
class TestUnicodeToAsciiTruncation { | |
public static void main(String[] args) { | |
String composedUnicode = "\u00C1"; // LATIN CAPITAL LETTER A WITH ACUTE (Á) | |
String decomposedUnicode = "\u0041\u0301"; // LATIN CAPITAL LETTER A + COMBINING ACUTE ACCENT (Á) | |
System.out.println("Composed and decomposed variants"); | |
System.out.println(composedUnicode); | |
System.out.println(decomposedUnicode); | |
System.out.println("NFD"); | |
System.out.println(Normalizer.normalize(composedUnicode, Normalizer.Form.NFD)); | |
System.out.println(Normalizer.normalize(decomposedUnicode, Normalizer.Form.NFD)); | |
System.out.println("NFKD"); | |
System.out.println(Normalizer.normalize(composedUnicode, Normalizer.Form.NFKD)); | |
System.out.println(Normalizer.normalize(decomposedUnicode, Normalizer.Form.NFKD)); | |
System.out.println("NFKD -> ASCII"); | |
System.out.println(truncateToAscii(composedUnicode)); | |
System.out.println(truncateToAscii(decomposedUnicode)); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment