Skip to content

Instantly share code, notes, and snippets.

@ozzi-
Created October 26, 2020 14:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ozzi-/9af51a68a741016eac913658da95b433 to your computer and use it in GitHub Desktop.
Save ozzi-/9af51a68a741016eac913658da95b433 to your computer and use it in GitHub Desktop.
Java method to fix double-encoded UTF-8 strings
/**
 * Small demonstration of {@code fixDoubleUTF8Encoding}: one sample that is
 * actually double encoded and one that is already correct.
 *
 * @param args unused
 */
public static void main(String[] args) {
    // The text "werewräüèö" after its UTF-8 bytes were mis-decoded as ISO-8859-1.
    // Written with Unicode escapes so the sample really is double encoded and
    // does not depend on the encoding of this source file.
    String input = "werewr\u00C3\u00A4\u00C3\u00BC\u00C3\u00A8\u00C3\u00B6";
    String result = fixDoubleUTF8Encoding(input);
    System.out.println(result); // werewräüèö

    // Already correct text: must come back unchanged.
    input = "üäöé";
    result = fixDoubleUTF8Encoding(input);
    System.out.println(result); // üäöé
}
/**
 * Repairs a string whose UTF-8 bytes were mistakenly decoded as ISO-8859-1
 * (so that encoding it as UTF-8 again yields "double encoded" bytes).
 *
 * <p>The repair is attempted only when {@code isDoubleEncoded} reports the
 * tell-tale byte pair in the UTF-8 form of the string. It is abandoned, and
 * the input returned untouched, when the repair could not be lossless:
 * <ul>
 *   <li>the string contains a character that ISO-8859-1 cannot represent
 *       (it would otherwise be silently replaced by {@code '?'});</li>
 *   <li>the ISO-8859-1 bytes are not well-formed UTF-8 (a lenient decode
 *       would otherwise inject replacement characters).</li>
 * </ul>
 *
 * @param s the possibly double-encoded string; may be {@code null}
 * @return the repaired string, or {@code s} itself when no safe repair applies
 */
private static String fixDoubleUTF8Encoding(String s) {
    if (s == null) {
        return null;
    }
    // Encode the string to UTF-8 so the detector can look at the raw bytes.
    byte[] utf8Bytes = s.getBytes(StandardCharsets.UTF_8);
    if (!isDoubleEncoded(utf8Bytes)) {
        return s;
    }
    // Mapping chars back to single bytes is only lossless for chars up to U+00FF.
    if (!StandardCharsets.ISO_8859_1.newEncoder().canEncode(s)) {
        return s;
    }
    byte[] originalBytes = s.getBytes(StandardCharsets.ISO_8859_1);
    try {
        // A fresh decoder reports malformed input instead of substituting,
        // so a failed repair is detected rather than producing garbage.
        return StandardCharsets.UTF_8.newDecoder()
                .decode(java.nio.ByteBuffer.wrap(originalBytes))
                .toString();
    } catch (java.nio.charset.CharacterCodingException notUtf8) {
        // The bytes are not valid UTF-8: the heuristic misfired, keep the input.
        return s;
    }
}
/**
 * Reports whether the byte pair {@code 0x83 0xC2} occurs anywhere in the array.
 *
 * <p>That pair shows up when a character whose UTF-8 lead byte is
 * {@code 0xC3} has been UTF-8 encoded a second time after an ISO-8859-1
 * mis-decode, which is what the caller treats as evidence of double encoding.
 *
 * @param bytes the bytes to scan; must not be {@code null}
 * @return {@code true} if {@code 0x83} is immediately followed by {@code 0xC2}
 */
private static boolean isDoubleEncoded(byte[] bytes) {
    final byte marker = (byte) 0x83;   // -125 as a signed byte
    final byte follower = (byte) 0xC2; // -62 as a signed byte
    final int lastPairStart = bytes.length - 1;
    int pos = 0;
    while (pos < lastPairStart) {
        if (bytes[pos] == marker && bytes[pos + 1] == follower) {
            return true;
        }
        pos++;
    }
    return false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment