pserwylo/UnicodeToAsciiTruncation.java

## UnicodeToAsciiTruncation.java
import java.text.Normalizer;

class UnicodeToAsciiTruncation {

  /** Largest value that is still inside the ASCII range (0-127). */
  private static final char MAX_ASCII = 127;

  /**
   * Does a best-effort job of converting Unicode text to meaningful ASCII, by normalizing the
   * input and then discarding every character that does not fall within the ASCII range (0-127).
   *
   * <p>Useful when you have Unicode data that needs to be sent to a legacy device/service which
   * only accepts ASCII (e.g. some specialised printers).
   *
   * <p>This is best effort only, as clearly just a small subset of Unicode can meaningfully be
   * converted to ASCII. However, for data that is predominantly in Latin-based scripts, this
   * does a pretty good job of producing readable text.
   *
   * <p>First, the input is normalized to NFKD so that every character which can be decomposed is
   * in its decomposed form. For example, the single character "LATIN CAPITAL LETTER A WITH
   * ACUTE" and the two-character sequence "LATIN CAPITAL LETTER A" + "COMBINING ACUTE ACCENT"
   * both end up as the two characters "LATIN CAPITAL LETTER A" and "COMBINING ACUTE ACCENT".
   * The normalized string therefore contains some characters that are valid ASCII (e.g. "LATIN
   * CAPITAL LETTER A") and others that are not (e.g. "COMBINING ACUTE ACCENT"). Those outside
   * the ASCII range are discarded, leaving just "LATIN CAPITAL LETTER A" in the above example.
   *
   * <p>Characters that have no decomposition into ASCII (for example "LATIN SMALL LETTER SHARP
   * S") are dropped entirely rather than replaced, so the result may be shorter than the input.
   *
   * @param string the text to convert; must not be {@code null}
   * @return the ASCII-only text; empty if nothing in the input maps to ASCII
   * @throws NullPointerException if {@code string} is {@code null} (raised by the normalizer)
   */
  static String truncateToAscii(String string) {
    String normalised = Normalizer.normalize(string, Normalizer.Form.NFKD);
    // The result can never be longer than the normalized text, so size the buffer once.
    StringBuilder ascii = new StringBuilder(normalised.length());
    for (int i = 0; i < normalised.length(); i++) {
      char c = normalised.charAt(i);
      // Works per UTF-16 code unit: surrogate halves are far above 127 and are dropped too.
      if (c <= MAX_ASCII) {
        ascii.append(c);
      }
    }
    return ascii.toString();
  }

}

## UnicodeToAsciiTruncationTest.java
import java.text.Normalizer;

class TestUnicodeToAsciiTruncation {

  /**
   * Small manual demo: prints the same accented letter in composed and decomposed form, then
   * shows what NFD normalization, NFKD normalization and the ASCII truncation make of each.
   *
   * @param args ignored
   */
  public static void main(String[] args) {

    String composedUnicode = "\u00C1";         // LATIN CAPITAL LETTER A WITH ACUTE (Á)
    String decomposedUnicode = "\u0041\u0301"; // LATIN CAPITAL LETTER A + COMBINING ACUTE ACCENT (Á)

    System.out.println("Composed and decomposed variants");
    System.out.println(composedUnicode);
    System.out.println(decomposedUnicode);

    System.out.println("NFD");
    System.out.println(Normalizer.normalize(composedUnicode, Normalizer.Form.NFD));
    System.out.println(Normalizer.normalize(decomposedUnicode, Normalizer.Form.NFD));

    System.out.println("NFKD");
    System.out.println(Normalizer.normalize(composedUnicode, Normalizer.Form.NFKD));
    System.out.println(Normalizer.normalize(decomposedUnicode, Normalizer.Form.NFKD));

    // truncateToAscii is declared in UnicodeToAsciiTruncation, not in this class, so the call
    // has to be qualified; the method must be visible from here (not private) to compile.
    System.out.println("NFKD -> ASCII");
    System.out.println(UnicodeToAsciiTruncation.truncateToAscii(composedUnicode));
    System.out.println(UnicodeToAsciiTruncation.truncateToAscii(decomposedUnicode));

  }

}
	import java.text.Normalizer;

	class UnicodeToAsciiTruncation {

	  /** Largest value that is still inside the ASCII range (0-127). */
	  private static final char MAX_ASCII = 127;

	  /**
	   * Does a best-effort job of converting Unicode text to meaningful ASCII, by normalizing the
	   * input and then discarding every character that does not fall within the ASCII range (0-127).
	   *
	   * <p>Useful when you have Unicode data that needs to be sent to a legacy device/service which
	   * only accepts ASCII (e.g. some specialised printers).
	   *
	   * <p>This is best effort only, as clearly just a small subset of Unicode can meaningfully be
	   * converted to ASCII. However, for data that is predominantly in Latin-based scripts, this
	   * does a pretty good job of producing readable text.
	   *
	   * <p>First, the input is normalized to NFKD so that every character which can be decomposed is
	   * in its decomposed form. For example, the single character "LATIN CAPITAL LETTER A WITH
	   * ACUTE" and the two-character sequence "LATIN CAPITAL LETTER A" + "COMBINING ACUTE ACCENT"
	   * both end up as the two characters "LATIN CAPITAL LETTER A" and "COMBINING ACUTE ACCENT".
	   * The normalized string therefore contains some characters that are valid ASCII (e.g. "LATIN
	   * CAPITAL LETTER A") and others that are not (e.g. "COMBINING ACUTE ACCENT"). Those outside
	   * the ASCII range are discarded, leaving just "LATIN CAPITAL LETTER A" in the above example.
	   *
	   * <p>Characters that have no decomposition into ASCII (for example "LATIN SMALL LETTER SHARP
	   * S") are dropped entirely rather than replaced, so the result may be shorter than the input.
	   *
	   * @param string the text to convert; must not be {@code null}
	   * @return the ASCII-only text; empty if nothing in the input maps to ASCII
	   * @throws NullPointerException if {@code string} is {@code null} (raised by the normalizer)
	   */
	  static String truncateToAscii(String string) {
	    String normalised = Normalizer.normalize(string, Normalizer.Form.NFKD);
	    // The result can never be longer than the normalized text, so size the buffer once.
	    StringBuilder ascii = new StringBuilder(normalised.length());
	    for (int i = 0; i < normalised.length(); i++) {
	      char c = normalised.charAt(i);
	      // Works per UTF-16 code unit: surrogate halves are far above 127 and are dropped too.
	      if (c <= MAX_ASCII) {
	        ascii.append(c);
	      }
	    }
	    return ascii.toString();
	  }

	}
	import java.text.Normalizer;

	class TestUnicodeToAsciiTruncation {

	  /**
	   * Small manual demo: prints the same accented letter in composed and decomposed form, then
	   * shows what NFD normalization, NFKD normalization and the ASCII truncation make of each.
	   *
	   * @param args ignored
	   */
	  public static void main(String[] args) {

	    String composedUnicode = "\u00C1"; // LATIN CAPITAL LETTER A WITH ACUTE (Á)
	    String decomposedUnicode = "\u0041\u0301"; // LATIN CAPITAL LETTER A + COMBINING ACUTE ACCENT (Á)

	    System.out.println("Composed and decomposed variants");
	    System.out.println(composedUnicode);
	    System.out.println(decomposedUnicode);

	    System.out.println("NFD");
	    System.out.println(Normalizer.normalize(composedUnicode, Normalizer.Form.NFD));
	    System.out.println(Normalizer.normalize(decomposedUnicode, Normalizer.Form.NFD));

	    System.out.println("NFKD");
	    System.out.println(Normalizer.normalize(composedUnicode, Normalizer.Form.NFKD));
	    System.out.println(Normalizer.normalize(decomposedUnicode, Normalizer.Form.NFKD));

	    // truncateToAscii is declared in UnicodeToAsciiTruncation, not in this class, so the call
	    // has to be qualified; the method must be visible from here (not private) to compile.
	    System.out.println("NFKD -> ASCII");
	    System.out.println(UnicodeToAsciiTruncation.truncateToAscii(composedUnicode));
	    System.out.println(UnicodeToAsciiTruncation.truncateToAscii(decomposedUnicode));

	  }

	}