jabbrwcky/StringCleaner.java

## StringCleaner.java
package net.hausherr.util;

import com.ibm.icu.text.Normalizer;

import java.util.HashMap;
import java.util.Map;

/**
 * StringCleaner provides a method for normalizing a string to a generally
 * ASCII-compatible form.
 * <p>
 * The input is decomposed with Unicode NFKD normalization first. Afterwards
 * combining marks are dropped, decomposed umlauts (A/O/U followed by a
 * combining diaeresis) are expanded to "Ae"/"Oe"/"Ue", and a small table of
 * letters that NFKD does not decompose (e.g. "Æ", "Ø", "Ŧ") is applied.
 * </p>
 */
public class StringCleaner {

	/** Combining diaeresis; follows the base letter of a decomposed umlaut. */
	private static final char COMBINING_DIAERESIS = '\u0308';

	/**
	 * Map containing replacements for corner cases (i.e. not decomposed by the
	 * Normalizer), keyed by the character's code point. Only read after
	 * construction.
	 */
	private final Map<Integer, Replacement> charMap = buildReplacementMap();

	/**
	 * Builds a map containing all replacements that are not automatically
	 * performed by the normalizer.
	 *
	 * @return map from code point to a Replacement holding the replacement
	 *         strings for upper- and lowercase mode.
	 */
	private Map<Integer, Replacement> buildReplacementMap() {
		Map<Integer, Replacement> map = new HashMap<Integer, Replacement>();
		map.put(0xc6, new Replacement("AE", "Ae"));
		map.put(0xe6, new Replacement("ae"));
		map.put(0xd0, new Replacement("D"));
		/* lowercase eth, counterpart of 0xd0 */
		map.put(0xf0, new Replacement("d"));
		/* capital D with stroke, counterpart of 0x111 */
		map.put(0x110, new Replacement("D"));
		map.put(0x111, new Replacement("d"));
		map.put(0xd8, new Replacement("O"));
		map.put(0xf8, new Replacement("o"));
		map.put(0x152, new Replacement("OE", "Oe"));
		map.put(0x153, new Replacement("oe"));
		map.put(0x166, new Replacement("T"));
		map.put(0x167, new Replacement("t"));
		return map;
	}

	/**
	 * <p>
	 * This method takes an input String and replaces all special characters
	 * like umlauts, accented letters or other letters with diacritical marks
	 * with their basic ASCII equivalents.
	 * </p>
	 * <p>
	 * Example: The String "André" or "Ándre" would be converted to "Andre".
	 * </p>
	 * <p>
	 * The flag <code>replaceAllCapitalLetters</code> controls the replacement
	 * behavior of uppercase special characters that are replaced by two plain
	 * ASCII chars, like "Æ" or "Ä".
	 * </p>
	 * <p>
	 * In "lowercase" mode (i.e. <code>replaceAllCapitalLetters=false</code>)
	 * both aforementioned examples would be converted to "Ae".
	 * </p>
	 * <p>
	 * In "uppercase" mode (<code>replaceAllCapitalLetters=true</code>) the
	 * replacement would be "AE".
	 * </p>
	 *
	 * @param input                    String to convert, may be <code>null</code>
	 * @param replaceAllCapitalLetters <code>true</code> causes uppercase special chars that are
	 *                                 replaced by more than one character to be replaced by
	 *                                 all-uppercase replacements; <code>false</code> will cause only
	 *                                 the initial character of the replacements to be in uppercase
	 *                                 and all subsequent replacement characters will be in
	 *                                 lowercase.
	 * @return Input string reduced to ASCII-safe characters, or
	 *         <code>null</code> if <code>input</code> is <code>null</code>.
	 */
	public String convertToAscii(String input, boolean replaceAllCapitalLetters) {
		if (input == null) {
			return null;
		}
		/*
		 * Let the normalizer allocate its own result instead of writing into
		 * a fixed "3 x input length" buffer: some code points have NFKD
		 * decompositions much longer than three chars (U+FDFA expands to 18),
		 * which makes the array-based normalize call fail with an
		 * IndexOutOfBoundsException.
		 */
		char[] decomposed = Normalizer.normalize(input, Normalizer.NFKD).toCharArray();
		return processSpecialChars(decomposed, 0, decomposed.length, replaceAllCapitalLetters);
	}

	/**
	 * Walks over the decomposed characters and builds the cleaned string.
	 *
	 * @param target    NFKD-decomposed characters
	 * @param offset    index of the first character to process
	 * @param len       number of characters to process
	 * @param uppercase <code>true</code> for all-uppercase multi-char replacements
	 * @return cleaned string
	 */
	private String processSpecialChars(char[] target, int offset, int len, boolean uppercase) {
		StringBuilder result = new StringBuilder(len);
		int end = offset + len;

		for (int i = offset; i < end; i++) {
			char c = target[i];
			if (isPassThroughSymbol(c) || Character.isDigit(c) || Character.isISOControl(c)) {
				result.append(c);
			} else if (Character.isWhitespace(c) || Character.isLetter(c)) {
				switch (c) {
					case '\u00df':
						result.append("ss");
						break;
					/* Handling of capital and lowercase umlauts */
					case 'A':
					case 'O':
					case 'U':
					case 'a':
					case 'o':
					case 'u':
						result.append(c);
						/* look ahead only inside the processed range */
						if (i + 1 < end && target[i + 1] == COMBINING_DIAERESIS) {
							result.append(uppercase && Character.isUpperCase(c) ? 'E' : 'e');
							/* the diaeresis is consumed together with its base letter */
							i++;
						}
						break;
					default:
						Replacement rep = charMap.get(Integer.valueOf(c));
						if (rep != null) {
							result.append(uppercase ? rep.upper : rep.lower);
						} else {
							result.append(c);
						}
						break;
				}
			}
			/* everything else (combining marks, other symbols) is dropped */
		}

		return result.toString();
	}

	/**
	 * Tells whether a character is one of the symbols that are copied to the
	 * output unchanged.
	 *
	 * @param c character to test
	 * @return <code>true</code> for '!'..'@' (this range also contains the
	 *         ASCII digits), '['..'`', '{'..U+00BF and the multiplication and
	 *         division signs.
	 */
	private static boolean isPassThroughSymbol(char c) {
		/* upper bound is inclusive so that '@' (0x40) is not lost */
		return (c > 0x20 && c <= 0x40)
				|| (c > 0x5a && c < 0x61)
				|| (c > 0x7a && c < 0xc0)
				|| c == 0xd7
				|| c == 0xf7;
	}


	/**
	 * Combination of replacements for upper- and lowercase mode.
	 */
	private static class Replacement {

		/** Replacement used in "uppercase" mode. */
		private final String upper;
		/** Replacement used in "lowercase" mode. */
		private final String lower;

		Replacement(String ucReplacement, String lcReplacement) {
			this.upper = ucReplacement;
			this.lower = lcReplacement;
		}

		Replacement(String caseInsensitiveReplacement) {
			this(caseInsensitiveReplacement, caseInsensitiveReplacement);
		}

	}
}

## StringCleanerTest.java
package net.hausherr.util;

import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.lang.reflect.Method;
import java.util.Iterator;

import static org.testng.Assert.assertEquals;

/**
 * Unit test for the character replacement done by StringCleaner.
 */
public class StringCleanerTest {

	private StringCleaner sut;

	@BeforeClass
	public void setupClass() {
		sut = new StringCleaner();
	}

	@AfterClass
	public void tearDownClass() {
		// nothing to clean up
	}

	/**
	 * Tests replacement of special characters in "lowercase" mode.
	 *
	 * @param testData
	 *            Char/String to test
	 * @param expected
	 *            expected normalized form
	 */
	@Test(dataProvider = "asciiConversion")
	public void testReplaceSpecialCharacters(String testData, String expected) {
		String result = sut.convertToAscii(testData, false);
		assertEquals(result, expected);
	}

	/**
	 * Tests replacement of special characters in "uppercase" mode.
	 *
	 * @param testData
	 *            Char/String to test
	 * @param expected
	 *            expected normalized form
	 */
	@Test(dataProvider = "asciiConversion")
	public void testReplaceSpecialCharactersUppercase(String testData, String expected) {
		String result = sut.convertToAscii(testData, true);
		assertEquals(result, expected);
	}

	/**
	 * Data provider for the test methods, for either upper- or lowercase mode.
	 *
	 * The mode is derived from the name of the requesting test method: a name
	 * ending in "Uppercase" selects the uppercase expectations.
	 *
	 * @param m
	 *            actual test method, provided by TestNG
	 * @return Iterator over test data.
	 */
	@DataProvider(name = "asciiConversion")
	public Iterator<Object[]> dataProvider(Method m) {
		return new TestDataIterator(m.getName().endsWith("Uppercase"));
	}

	/**
	 * This array of String Arrays holds the data for all tests.
	 *
	 * <p>
	 * The data contained in each array has the following semantics:
	 * </p>
	 * <code>{ "EXP(lc)", "EXP(uc)", "TV_1", ... , "TV_n" }</code>
	 * <p>
	 * Legend:
	 * </p>
	 * <ul>
	 * <li>EXP(lc): Reference value for lowercase tests (expectation)</li>
	 * <li>EXP(uc): Reference value for uppercase tests (expectation)</li>
	 * <li>TV_1...TV_n: Test String/Character 1...n</li>
	 * </ul>
	 * <p>
	 * Every row must contain at least one test value, i.e. at least three
	 * entries.
	 * </p>
	 */
	private static final String[][] testDataSource = {
			//
			/* Sanity checks first */
			{ "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" },//
			{ "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz" },//
			{ "1234567890", "1234567890", "1234567890" },//
			/* Symbols */
			{ "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\" },//
			{ "÷×¡¢£¤·", "÷×¡¢£¤·", "÷×¡¢£¤·" }, //
			{ " \t\r\n", " \t\r\n", " \t\r\n" },//
			/* Fun starts here */
			{ "A", "A", "Ā", "Ă", "Å", "À", "Â" }, //
			{ "a", "a", "ā", "ă", "å", "à", "â" }, //
			{ "Ae", "AE", "Æ", "Ǽ", "Ä" },//
			{ "ae", "ae", "æ", "ǣ", "ä", "ǟ" },//
			{ "C", "C", "Ċ", "Ç", "Č" },//
			{ "c", "c", "ċ", "ç", "č" },//
			{ "D", "D", "Ď", "Ð" },//
			{ "d", "d", "ď", "đ" },//
			{ "E", "E", "Ê", "Ë", "È", "É", "Ê" }, //
			{ "e", "e", "ê", "ë", "è", "é" }, //
			{ "G", "G", "Ĝ", "Ğ", "Ġ", "Ģ" },//
			{ "g", "g", "ĝ", "ğ", "ġ", "ģ" },//
			{ "I", "I", "Ì", "Í", "Î", "Ï", "Ĩ" }, //
			{ "i", "i", "ĩ", "ì", "í", "î", "ï" },//
			{ "N", "N", "Ñ" }, //
			{ "n", "n", "ñ", },//
			{ "O", "O", "Ø", "Ò", "Ó", "Ô", "Õ", "Ő", "Ǿ" },//
			{ "o", "o", "ø", "ő", "ò", "ó", "ô", "õ", "ǿ" },//
			{ "Oe", "OE", "Ö", "Œ" },//
			{ "oe", "oe", "ö", "œ" },//
			{ "ss", "ss", "ß" },//
			{ "Aeffin", "AEffin", "Ä\uFB03n" },//
			{ "IJ", "IJ", "Ĳ" },//
			{ "ij", "ij", "ĳ" },//
			{ "U", "U", "Û", "Ù", "Ú", "Ů" },//
			{ "u", "u", "û", "ù", "ú", "ů" },//
			{ "Ue", "UE", "Ü" },//
			{ "ue", "ue", "ü" },//
			{ "T", "T", "Ţ", "Ŧ" },//
			{ "t", "t", "ţ", "ŧ" },//
			{ "Y", "Y", "Ý" }, //
			{ "y", "y", "ý", "ÿ" } //
	};

	/**
	 * Implementation of an iterator that knows how to iterate over the source
	 * test data array for upper- and lowercase mode.
	 *
	 * Each call to next() yields one pair of test value and the expectation
	 * matching the selected mode.
	 *
	 * @author JHAUSHER
	 */
	private static final class TestDataIterator implements Iterator<Object[]> {

		/** Index of the first test value within a data row. */
		private static final int FIRST_TEST_VALUE = 2;

		/** Index of the current row in testDataSource. */
		private int dataIndex = 0;
		/** Index of the next test value within the current row. */
		private int currentIndex = FIRST_TEST_VALUE;

		private final boolean uppercase;

		public TestDataIterator(boolean uppercase) {
			this.uppercase = uppercase;
		}

		public boolean hasNext() {
			/*
			 * next() moves on to the following row as soon as a row is used
			 * up, so the iteration is finished exactly when the row index has
			 * left the array. This does not depend on the size of the last row.
			 */
			return dataIndex < testDataSource.length;
		}

		public Object[] next() {
			if (!hasNext()) {
				throw new java.util.NoSuchElementException("no more test data");
			}
			String[] row = testDataSource[dataIndex];
			Object[] result = new Object[2];
			result[0] = row[currentIndex++];
			result[1] = uppercase ? row[1] : row[0];

			if (currentIndex >= row.length) {
				currentIndex = FIRST_TEST_VALUE;
				dataIndex += 1;
			}

			return result;
		}

		public void remove() {
			throw new UnsupportedOperationException("test data is read-only");
		}
	}

}
	package net.hausherr.util;

	import com.ibm.icu.text.Normalizer;

	import java.util.HashMap;
	import java.util.Map;

	/**
	* StringCleaner provides a method for normalizing a string to generally
	* ASCII-compatible form.
	*/
	public class StringCleaner {

	/**
	* Map containing replacements for corner cases (i.e. not decomposed by the
	* Normalizer)
	*/
	private Map<Integer, Replacement> charMap = buildReplacementMap();

	/**
	* builds a map containing all replacements that are not automatically
	* performed by the normalizer.
	*
	* @return Replacement containing both replacements for upper- and lowercase
	* mode.
	*/
	private Map<Integer, Replacement> buildReplacementMap() {
	Map<Integer, Replacement> map = new HashMap<Integer, Replacement>();
	map.put(0xc6, new Replacement("AE", "Ae"));
	map.put(0xe6, new Replacement("ae"));
	map.put(0xd0, new Replacement("D"));
	map.put(0x111, new Replacement("d"));
	map.put(0xd8, new Replacement("O"));
	map.put(0xf8, new Replacement("o"));
	map.put(0x152, new Replacement("OE", "Oe"));
	map.put(0x153, new Replacement("oe"));
	map.put(0x166, new Replacement("T"));
	map.put(0x167, new Replacement("t"));
	return map;
	}

	/**
	* <p>
	* This method takes an input String and replaces all special characters
	* like umlauts, accented or other letter with diacritical marks with their
	* basic ascii eqivalents.
	* </p>
	* <p>
	* Example: The String "André" or "Ándre" would be converted to "Andre".
	* </p>
	* <p>
	* The flag <code>replaceAllCapitalLetters</code> controls the replacement
	* behavior of special characters that are decomposed into two plain ASCII
	* chars, like "Æ" or "æ".
	* </p>
	* <p>
	* In "lowercase" mode (i.e.<code> replaceAllCapitalLetters=false</code> )
	* both aforementioned examples would be converted to "Ae".
	* </p>
	* <p>
	* In "uppercase" mode (<code>replaceAllCapitalLetters=false</code>) the
	* replacement would be "AE".
	* </p>
	*
	* @param input String to convert
	* @param replaceAllCapitalLetters <code>true</code> causes uppercase special chars that are
	* replaced by more than one character to be replaced by
	* all-uppercase replacements; <code>false</code> will cause only
	* the initial character of the replacements to be in uppercase
	* and all subsequent replacement characters will be in
	* lowercase.
	* @return Input string reduced to ASCII-safe characters.
	*/
	public String convertToAscii(String input, boolean replaceAllCapitalLetters) {
	/*
	* operating on char arrays because java.lang.String seems to perform an
	* automatic recomposition of decomposed characters.
	*/
	String result = null;
	if (null != input) {
	char[] src = input.toCharArray();
	/* save space for exotic UTF characters */
	char[] target = new char[src.length * 3];
	int len = Normalizer.normalize(input.toCharArray(), target, Normalizer.NFKD, 0);
	result = processSpecialChars(target, 0, len, replaceAllCapitalLetters);
	}
	return result;
	}

	@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = {"SF_SWITCH_FALLTHROUGH"}, justification = "Intentional fallthrough (JHAUSHER)")
	private String processSpecialChars(char[] target, int offset, int len, boolean uppercase) {
	StringBuilder result = new StringBuilder();
	boolean skip = false;

	for (int i = 0; i < len; i++) {
	if (skip) {
	skip = false;
	} else {
	char c = target[i];
	if ((c > 0x20 && c < 0x40) \|\| (c > 0x7a && c < 0xc0) \|\| (c > 0x5a && c < 0x61) \|\| (c > 0x79 && c < 0xc0) \|\| c == 0xd7 \|\| c == 0xf7) {
	result.append(c);
	} else if (Character.isDigit(c) \|\| Character.isISOControl(c)) {
	result.append(c);
	} else if (Character.isWhitespace(c) \|\| Character.isLetter(c)) {
	boolean isUpper = false;

	switch (c) {
	case '\u00df':
	result.append("ss");
	break;
	/* Handling of capital and lowercase umlauts */
	case 'A':
	case 'O':
	case 'U':
	isUpper = true;
	case 'a':
	case 'o':
	case 'u':
	result.append(c);
	if (i + 1 < target.length && target[i + 1] == 0x308) {
	result.append(isUpper && uppercase ? 'E' : 'e');
	skip = true;
	}
	break;
	default:
	Replacement rep = charMap.get(Integer.valueOf(c));
	if (rep != null) {
	result.append(uppercase ? rep.upper : rep.lower);
	} else
	result.append(c);
	}
	}
	}
	}

	return result.toString();
	}


	/**
	* Combination of replacements for upper- and lowercase mode.
	*/
	private static class Replacement {

	private final String upper;
	private final String lower;

	Replacement(String ucReplacement, String lcReplacement) {
	this.upper = ucReplacement;
	this.lower = lcReplacement;
	}

	Replacement(String caseInsensitiveReplacement) {
	this(caseInsensitiveReplacement, caseInsensitiveReplacement);
	}

	}
	}
	package net.hausherr.util;

	import org.testng.annotations.AfterClass;
	import org.testng.annotations.BeforeClass;
	import org.testng.annotations.DataProvider;
	import org.testng.annotations.Test;

	import java.lang.reflect.Method;
	import java.util.Iterator;

	import static org.testng.Assert.assertEquals;

	/**
	 * Unit test for the character replacement done by StringCleaner.
	 */
	public class StringCleanerTest {

		private StringCleaner sut;

		@BeforeClass
		public void setupClass() {
			sut = new StringCleaner();
		}

		@AfterClass
		public void tearDownClass() {
			// nothing to clean up
		}

		/**
		 * Tests replacement of special characters in "lowercase" mode.
		 *
		 * @param testData
		 *            Char/String to test
		 * @param expected
		 *            expected normalized form
		 */
		@Test(dataProvider = "asciiConversion")
		public void testReplaceSpecialCharacters(String testData, String expected) {
			String result = sut.convertToAscii(testData, false);
			assertEquals(result, expected);
		}

		/**
		 * Tests replacement of special characters in "uppercase" mode.
		 *
		 * @param testData
		 *            Char/String to test
		 * @param expected
		 *            expected normalized form
		 */
		@Test(dataProvider = "asciiConversion")
		public void testReplaceSpecialCharactersUppercase(String testData, String expected) {
			String result = sut.convertToAscii(testData, true);
			assertEquals(result, expected);
		}

		/**
		 * Data provider for the test methods, for either upper- or lowercase mode.
		 *
		 * The mode is derived from the name of the requesting test method: a name
		 * ending in "Uppercase" selects the uppercase expectations.
		 *
		 * @param m
		 *            actual test method, provided by TestNG
		 * @return Iterator over test data.
		 */
		@DataProvider(name = "asciiConversion")
		public Iterator<Object[]> dataProvider(Method m) {
			return new TestDataIterator(m.getName().endsWith("Uppercase"));
		}

		/**
		 * This array of String Arrays holds the data for all tests.
		 *
		 * <p>
		 * The data contained in each array has the following semantics:
		 * </p>
		 * <code>{ "EXP(lc)", "EXP(uc)", "TV_1", ... , "TV_n" }</code>
		 * <p>
		 * Legend:
		 * </p>
		 * <ul>
		 * <li>EXP(lc): Reference value for lowercase tests (expectation)</li>
		 * <li>EXP(uc): Reference value for uppercase tests (expectation)</li>
		 * <li>TV_1...TV_n: Test String/Character 1...n</li>
		 * </ul>
		 * <p>
		 * Every row must contain at least one test value, i.e. at least three
		 * entries.
		 * </p>
		 */
		private static final String[][] testDataSource = {
				//
				/* Sanity checks first */
				{ "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ" },//
				{ "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz" },//
				{ "1234567890", "1234567890", "1234567890" },//
				/* Symbols */
				{ "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\", "!\"§$%&/()=?'\\" },//
				{ "÷×¡¢£¤·", "÷×¡¢£¤·", "÷×¡¢£¤·" }, //
				{ " \t\r\n", " \t\r\n", " \t\r\n" },//
				/* Fun starts here */
				{ "A", "A", "Ā", "Ă", "Å", "À", "Â" }, //
				{ "a", "a", "ā", "ă", "å", "à", "â" }, //
				{ "Ae", "AE", "Æ", "Ǽ", "Ä" },//
				{ "ae", "ae", "æ", "ǣ", "ä", "ǟ" },//
				{ "C", "C", "Ċ", "Ç", "Č" },//
				{ "c", "c", "ċ", "ç", "č" },//
				{ "D", "D", "Ď", "Ð" },//
				{ "d", "d", "ď", "đ" },//
				{ "E", "E", "Ê", "Ë", "È", "É", "Ê" }, //
				{ "e", "e", "ê", "ë", "è", "é" }, //
				{ "G", "G", "Ĝ", "Ğ", "Ġ", "Ģ" },//
				{ "g", "g", "ĝ", "ğ", "ġ", "ģ" },//
				{ "I", "I", "Ì", "Í", "Î", "Ï", "Ĩ" }, //
				{ "i", "i", "ĩ", "ì", "í", "î", "ï" },//
				{ "N", "N", "Ñ" }, //
				{ "n", "n", "ñ", },//
				{ "O", "O", "Ø", "Ò", "Ó", "Ô", "Õ", "Ő", "Ǿ" },//
				{ "o", "o", "ø", "ő", "ò", "ó", "ô", "õ", "ǿ" },//
				{ "Oe", "OE", "Ö", "Œ" },//
				{ "oe", "oe", "ö", "œ" },//
				{ "ss", "ss", "ß" },//
				{ "Aeffin", "AEffin", "Ä\uFB03n" },//
				{ "IJ", "IJ", "Ĳ" },//
				{ "ij", "ij", "ĳ" },//
				{ "U", "U", "Û", "Ù", "Ú", "Ů" },//
				{ "u", "u", "û", "ù", "ú", "ů" },//
				{ "Ue", "UE", "Ü" },//
				{ "ue", "ue", "ü" },//
				{ "T", "T", "Ţ", "Ŧ" },//
				{ "t", "t", "ţ", "ŧ" },//
				{ "Y", "Y", "Ý" }, //
				{ "y", "y", "ý", "ÿ" } //
		};

		/**
		 * Implementation of an iterator that knows how to iterate over the source
		 * test data array for upper- and lowercase mode.
		 *
		 * Each call to next() yields one pair of test value and the expectation
		 * matching the selected mode.
		 *
		 * @author JHAUSHER
		 */
		private static final class TestDataIterator implements Iterator<Object[]> {

			/** Index of the first test value within a data row. */
			private static final int FIRST_TEST_VALUE = 2;

			/** Index of the current row in testDataSource. */
			private int dataIndex = 0;
			/** Index of the next test value within the current row. */
			private int currentIndex = FIRST_TEST_VALUE;

			private final boolean uppercase;

			public TestDataIterator(boolean uppercase) {
				this.uppercase = uppercase;
			}

			public boolean hasNext() {
				/*
				 * next() moves on to the following row as soon as a row is used
				 * up, so the iteration is finished exactly when the row index has
				 * left the array. This does not depend on the size of the last row.
				 */
				return dataIndex < testDataSource.length;
			}

			public Object[] next() {
				if (!hasNext()) {
					throw new java.util.NoSuchElementException("no more test data");
				}
				String[] row = testDataSource[dataIndex];
				Object[] result = new Object[2];
				result[0] = row[currentIndex++];
				result[1] = uppercase ? row[1] : row[0];

				if (currentIndex >= row.length) {
					currentIndex = FIRST_TEST_VALUE;
					dataIndex += 1;
				}

				return result;
			}

			public void remove() {
				throw new UnsupportedOperationException("test data is read-only");
			}
		}

	}