Skip to content

Instantly share code, notes, and snippets.

@jabbrwcky
Created March 19, 2012 13:16
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jabbrwcky/2111727 to your computer and use it in GitHub Desktop.
Save jabbrwcky/2111727 to your computer and use it in GitHub Desktop.
StringCleaner: a demonstration of how to use ICU4J to convert practically any UTF-8/Java String to an ASCII-compatible form. Requires ICU4J, TestNG and the FindBugs annotations.
package net.hausherr.util;
import com.ibm.icu.text.Normalizer;
import java.util.HashMap;
import java.util.Map;
/**
 * StringCleaner provides a method for normalizing a string to a generally
 * ASCII-compatible form.
 *
 * <p>
 * The input is first decomposed with ICU4J's compatibility decomposition
 * (NFKD). Combining marks are then dropped and the few letters that do not
 * decompose (such as "Æ" or "ø") are replaced from a lookup table.
 * </p>
 */
public class StringCleaner {

    /** Combining diaeresis (U+0308) that follows a base letter of a decomposed umlaut. */
    private static final char COMBINING_DIAERESIS = '\u0308';

    /**
     * Map containing replacements for corner cases (i.e. characters that are
     * not decomposed by the Normalizer), keyed by UTF-16 code unit. It is only
     * read after construction.
     */
    private static final Map<Integer, Replacement> CHAR_MAP = buildReplacementMap();

    /**
     * Builds a map containing all replacements that are not automatically
     * performed by the normalizer.
     *
     * @return map from character code to the {@link Replacement} holding the
     *         substitutes for both upper- and lowercase mode.
     */
    private static Map<Integer, Replacement> buildReplacementMap() {
        Map<Integer, Replacement> map = new HashMap<Integer, Replacement>();
        map.put(0xc6, new Replacement("AE", "Ae"));
        map.put(0xe6, new Replacement("ae"));
        map.put(0xd0, new Replacement("D"));
        // Capital counterpart of U+0111; without it the letter would pass through as non-ASCII.
        map.put(0x110, new Replacement("D"));
        map.put(0x111, new Replacement("d"));
        map.put(0xd8, new Replacement("O"));
        map.put(0xf8, new Replacement("o"));
        map.put(0x152, new Replacement("OE", "Oe"));
        map.put(0x153, new Replacement("oe"));
        map.put(0x166, new Replacement("T"));
        map.put(0x167, new Replacement("t"));
        return map;
    }

    /**
     * <p>
     * This method takes an input String and replaces all special characters
     * like umlauts, accented letters or other letters with diacritical marks
     * with their basic ASCII equivalents.
     * </p>
     * <p>
     * Example: The String "André" or "Ándre" would be converted to "Andre".
     * </p>
     * <p>
     * The flag <code>replaceAllCapitalLetters</code> controls the replacement
     * behavior of capital special characters that are replaced by two plain
     * ASCII chars, like "Æ" or "Ä".
     * </p>
     * <p>
     * In "lowercase" mode (i.e. <code>replaceAllCapitalLetters=false</code>)
     * "Æ" is converted to "Ae".
     * </p>
     * <p>
     * In "uppercase" mode (<code>replaceAllCapitalLetters=true</code>) the
     * replacement is "AE".
     * </p>
     * <p>
     * Lowercase special characters such as "æ" are converted to "ae" in both
     * modes.
     * </p>
     *
     * @param input String to convert; may be <code>null</code>
     * @param replaceAllCapitalLetters <code>true</code> causes uppercase special chars that are
     *            replaced by more than one character to be replaced by
     *            all-uppercase replacements; <code>false</code> will cause only
     *            the initial character of the replacements to be in uppercase
     *            and all subsequent replacement characters will be in
     *            lowercase.
     * @return Input string reduced to ASCII-safe characters, or
     *         <code>null</code> if <code>input</code> is <code>null</code>.
     */
    public String convertToAscii(String input, boolean replaceAllCapitalLetters) {
        if (input == null) {
            return null;
        }
        /*
         * Let the normalizer allocate the result itself: a fixed-size target
         * buffer can be too small, because a single character may decompose
         * into many more than three chars under NFKD.
         */
        char[] decomposed = Normalizer.normalize(input, Normalizer.NFKD).toCharArray();
        return processSpecialChars(decomposed, 0, decomposed.length, replaceAllCapitalLetters);
    }

    /**
     * Tells whether a character is copied to the output unchanged without any
     * further inspection: ASCII punctuation and symbols (including '@'), the
     * code units between 0x7b and 0xbf, and the multiplication and division
     * signs.
     *
     * @param c character to test
     * @return <code>true</code> if the character is passed through as is
     */
    private static boolean isPassThroughSymbol(char c) {
        return (c > 0x20 && c <= 0x40) // '!' .. '@'
                || (c > 0x5a && c < 0x61) // '[' .. '`'
                || (c > 0x7a && c < 0xc0) // '{' .. U+00BF
                || c == 0xd7 // multiplication sign
                || c == 0xf7; // division sign
    }

    /**
     * Walks over the decomposed characters and builds the cleaned string.
     * Pass-through symbols, digits, control characters, whitespace and plain
     * letters are kept; German umlauts (base letter followed by a combining
     * diaeresis) become "ae"/"oe"/"ue"; "ß" becomes "ss"; letters found in the
     * replacement map are substituted; everything else (in particular
     * combining marks) is dropped.
     *
     * @param target decomposed characters
     * @param offset index of the first character to process
     * @param len number of characters to process
     * @param uppercase <code>true</code> for all-uppercase multi-char replacements
     * @return the cleaned string
     */
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = {"SF_SWITCH_FALLTHROUGH"}, justification = "Intentional fallthrough (JHAUSHER)")
    private String processSpecialChars(char[] target, int offset, int len, boolean uppercase) {
        StringBuilder result = new StringBuilder(len);
        int end = offset + len;
        for (int i = offset; i < end; i++) {
            char c = target[i];
            if (isPassThroughSymbol(c) || Character.isDigit(c) || Character.isISOControl(c)) {
                result.append(c);
            } else if (Character.isWhitespace(c) || Character.isLetter(c)) {
                boolean isUpper = false;
                switch (c) {
                case '\u00df':
                    result.append("ss");
                    break;
                /* Handling of capital and lowercase umlauts */
                case 'A':
                case 'O':
                case 'U':
                    isUpper = true;
                    // intentional fall through to the shared umlaut handling
                case 'a':
                case 'o':
                case 'u':
                    result.append(c);
                    // Look ahead only within the processed range.
                    if (i + 1 < end && target[i + 1] == COMBINING_DIAERESIS) {
                        result.append(isUpper && uppercase ? 'E' : 'e');
                        i++; // the diaeresis has been consumed
                    }
                    break;
                default:
                    Replacement rep = CHAR_MAP.get(Integer.valueOf(c));
                    if (rep != null) {
                        result.append(uppercase ? rep.upper : rep.lower);
                    } else {
                        result.append(c);
                    }
                }
            }
            // Any other character (e.g. a combining mark) is dropped.
        }
        return result.toString();
    }

    /**
     * Combination of replacements for upper- and lowercase mode.
     */
    private static class Replacement {
        /** Replacement used in "uppercase" mode. */
        private final String upper;
        /** Replacement used in "lowercase" mode. */
        private final String lower;

        Replacement(String ucReplacement, String lcReplacement) {
            this.upper = ucReplacement;
            this.lower = lcReplacement;
        }

        Replacement(String caseInsensitiveReplacement) {
            this(caseInsensitiveReplacement, caseInsensitiveReplacement);
        }
    }
}
package net.hausherr.util;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.lang.reflect.Method;
import java.util.Iterator;
import static org.testng.Assert.assertEquals;
/**
 * Unit test for the character replacement performed by StringCleaner.
 */
public class StringCleanerTest {
    /** Instance under test, created once for the whole class. */
    private StringCleaner sut;

    @BeforeClass
    public void setupClass() {
        sut = new StringCleaner();
    }

    @AfterClass
    public void tearDownClass() {
        // nothing to release
    }

    /**
     * Tests replacement of special characters in "lowercase" mode.
     *
     * @param testData
     *            Char/String to test
     * @param expected
     *            expected normalized form
     */
    @Test(dataProvider = "asciiConversion")
    public void testReplaceSpecialCharacters(String testData, String expected) {
        String result = sut.convertToAscii(testData, false);
        assertEquals(result, expected);
    }

    /**
     * Tests replacement of special characters in "uppercase" mode.
     *
     * @param testData
     *            Char/String to test
     * @param expected
     *            expected normalized form
     */
    @Test(dataProvider = "asciiConversion")
    public void testReplaceSpecialCharactersUppercase(String testData, String expected) {
        String result = sut.convertToAscii(testData, true);
        assertEquals(result, expected);
    }

    /**
     * Data provider for the test methods, either for upper- or lowercase mode.
     *
     * Provides data both for upper- and lowercase tests as Iterator over the
     * array of String arrays that holds the "raw" data. The mode is chosen by
     * the name of the requesting test method: a name ending in "Uppercase"
     * selects the uppercase expectations.
     *
     * @param m
     *            actual test method, provided by TestNG
     * @return Iterator over test data.
     */
    @DataProvider(name = "asciiConversion")
    public Iterator<Object[]> dataProvider(Method m) {
        return new TestDataIterator(m.getName().endsWith("Uppercase"));
    }

    /**
     * This array of String Arrays holds the data for all tests.
     *
     * <p>
     * The data contained in each array has the following semantics:
     * </p>
     * <code>{ "EXP(lc)", "EXP(uc)", "TV_1", ... , "TV_n" }</code>
     * <p>
     * Legend:
     * </p>
     * <ul>
     * <li>EXP(lc): Reference value for lowercase tests (expectation)</li>
     * <li>EXP(uc): Reference value for uppercase tests (expectation)</li>
     * <li>TV_1...TV_n: Test String/Character 1...n</li>
     * </ul>
     */
    private static final String[][] testDataSource = {
            //
            /* Sanity checks first */
            { "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" },//
            { "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz" },//
            { "1234567890", "1234567890", "1234567890" },//
            /* Symbols */
            { "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\" },//
            // NOTE(review): the second character of the next row looks like a mis-decoded
            // multiplication sign + inverted exclamation mark - confirm against the original file.
            { "÷ס¢£¤·", "÷ס¢£¤·", "÷ס¢£¤·" }, //
            { " \t\r\n", " \t\r\n", " \t\r\n" },//
            /* Fun starts here */
            { "A", "A", "Ā", "Ă", "Å", "À", "Â" }, //
            { "a", "a", "ā", "ă", "å", "à", "â" }, //
            { "Ae", "AE", "Æ", "Ǽ", "Ä" },//
            { "ae", "ae", "æ", "ǣ", "ä", "ǟ" },//
            { "C", "C", "Ċ", "Ç", "Č" },//
            { "c", "c", "ċ", "ç", "č" },//
            { "D", "D", "Ď", "Ð" },//
            { "d", "d", "ď", "đ" },//
            { "E", "E", "Ê", "Ë", "È", "É", "Ê" }, //
            { "e", "e", "ê", "ë", "è", "é" }, //
            { "G", "G", "Ĝ", "Ğ", "Ġ", "Ģ" },//
            { "g", "g", "ĝ", "ğ", "ġ", "ģ" },//
            { "I", "I", "Ì", "Í", "Î", "Ï", "Ĩ" }, //
            { "i", "i", "ĩ", "ì", "í", "î", "ï" },//
            { "N", "N", "Ñ" }, //
            { "n", "n", "ñ", },//
            { "O", "O", "Ø", "Ò", "Ó", "Ô", "Õ", "Ő", "Ǿ" },//
            { "o", "o", "ø", "ő", "ò", "ó", "ô", "õ", "ǿ" },//
            { "Oe", "OE", "Ö", "Œ" },//
            { "oe", "oe", "ö", "œ" },//
            { "ss", "ss", "ß" },//
            { "Aeffin", "AEffin", "Ä\uFB03n" },//
            // NOTE(review): the test values of the next two rows are plain ASCII here;
            // presumably the IJ ligatures were intended - confirm against the original file.
            { "IJ", "IJ", "IJ" },//
            { "ij", "ij", "ij" },//
            { "U", "U", "Û", "Ù", "Ú", "Ů" },//
            { "u", "u", "û", "ù", "ú", "ů" },//
            { "Ue", "UE", "Ü" },//
            { "ue", "ue", "ü" },//
            { "T", "T", "Ţ", "Ŧ" },//
            { "t", "t", "ţ", "ŧ" },//
            { "Y", "Y", "Ý" }, //
            { "y", "y", "ý", "ÿ" } //
    };

    /**
     * Implementation of an iterator that knows how to iterate over the source
     * test data array for upper- and lowercase mode. Each element is a pair
     * <code>{ testValue, expectedValue }</code>.
     *
     * @author JHAUSHER
     */
    private static final class TestDataIterator implements Iterator<Object[]> {
        /** Index of the first test value within a data row (after both expectations). */
        private static final int FIRST_VALUE_INDEX = 2;

        /** Selects which expectation column is paired with the test values. */
        private final boolean uppercase;
        /** Row currently being read. */
        private int dataIndex = 0;
        /** Column of the next test value within the current row. */
        private int currentIndex = FIRST_VALUE_INDEX;

        public TestDataIterator(boolean uppercase) {
            this.uppercase = uppercase;
            skipRowsWithoutValues();
        }

        /**
         * @return <code>true</code> while there is a row left; the iterator
         *         always rests on a row that still has an unread test value.
         */
        @Override
        public boolean hasNext() {
            return dataIndex < testDataSource.length;
        }

        /**
         * @return the next <code>{ testValue, expectedValue }</code> pair
         * @throws java.util.NoSuchElementException if all test data has been consumed
         */
        @Override
        public Object[] next() {
            if (!hasNext()) {
                throw new java.util.NoSuchElementException("test data exhausted");
            }
            String[] row = testDataSource[dataIndex];
            Object[] result = new Object[2];
            result[0] = row[currentIndex++];
            result[1] = uppercase ? row[1] : row[0];
            if (currentIndex >= row.length) {
                // Row finished: continue with the first test value of the next usable row.
                currentIndex = FIRST_VALUE_INDEX;
                dataIndex++;
                skipRowsWithoutValues();
            }
            return result;
        }

        /**
         * Removal makes no sense for the constant test data.
         *
         * @throws UnsupportedOperationException always
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException("test data is read-only");
        }

        /** Advances past rows that carry expectations but no test values. */
        private void skipRowsWithoutValues() {
            while (dataIndex < testDataSource.length && testDataSource[dataIndex].length <= FIRST_VALUE_INDEX) {
                dataIndex++;
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment