Created
October 26, 2020 14:54
-
-
Save ozzi-/9af51a68a741016eac913658da95b433 to your computer and use it in GitHub Desktop.
Java method to fix double-encoded UTF-8 strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static void main(String[] args) { | |
String input = "werewräüèö"; | |
String result = fixDoubleUTF8Encoding(input); | |
System.out.println(result); // werewräüèö | |
input = "üäöé"; | |
result = fixDoubleUTF8Encoding(input); | |
System.out.println(result); // üäöé | |
} | |
private static String fixDoubleUTF8Encoding(String s) { | |
// interpret the string as UTF_8 | |
byte[] bytes = s.getBytes(StandardCharsets.UTF_8); | |
// now check if the bytes contain 0x83 0xC2, meaning double encoded garbage | |
if(isDoubleEncoded(bytes)) { | |
// if so, lets fix the string by assuming it is ASCII extended and recode it once | |
s = new String(s.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.UTF_8); | |
} | |
return s; | |
} | |
private static boolean isDoubleEncoded(byte[] bytes) { | |
for (int i = 0; i < bytes.length; i++) { | |
if(bytes[i] == -125 && i+1 < bytes.length && bytes[i+1] == -62) { | |
return true; | |
} | |
} | |
return false; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment