Created
September 12, 2017 15:02
-
-
Save moelholm/0929a3fecde9e37421b0d451d92bc448 to your computer and use it in GitHub Desktop.
Converting a string into Windows-1252 preserving as much information as possible
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
private static final Charset WINDOWS_1252_CHARSET = Charset.forName("Windows-1252"); | |
// Can the code below be made more efficient, robust, or secure?
@Test | |
public void test() { | |
// Given | |
String inputString = "ĢoogleŰberButÅ"; | |
// When | |
String result = convertToWindows1252(inputString); | |
// Then | |
assertEquals("GoogleUberButÅ", result); | |
// Or put in another way: | |
// I want to retain Å's accent but not accents for Chars that are outside Windows-1252. | |
// ..and I will rather remove accents (to obtain valid Chars) than throwing them completely away. | |
// ..ie. oogleberButÅ wouldn't be optimal | |
// The receiver of "result" must get the data in proper Windows-1252 | |
} | |
private static String convertToWindows1252(String inputString) { | |
CharsetEncoder encoder = WINDOWS_1252_CHARSET.newEncoder(); | |
return inputString.chars() | |
.mapToObj(i -> (char) i) | |
.map(c -> convertToWindows1252(encoder, c)) | |
.filter(Objects::nonNull) | |
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) | |
.toString(); | |
} | |
private static Character convertToWindows1252(CharsetEncoder encoder, char character) { | |
if (encoder.canEncode(character)) { | |
return character; | |
} | |
// Check if we can reduce 'character' to an accent-stripped valid Windows-1252 character... | |
// It will be split into 2 characters if we can... | |
String decomposed = Normalizer.normalize(String.valueOf(character), Normalizer.Form.NFD); | |
return decomposed.length() == 2 ? decomposed.charAt(0) : null; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
how can I remove "�ÿþ" from my String "�ÿþHello, world�ÿþ"?