Skip to content

Instantly share code, notes, and snippets.

@timja
Created May 7, 2022 22:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timja/21d655840f5d617df07a271a7db0a48f to your computer and use it in GitHub Desktop.
Save timja/21d655840f5d617df07a271a7db0a48f to your computer and use it in GitHub Desktop.
UTF-8 / ISO-8859-1 spelunking
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;
/**
 * Scans the current working directory tree for {@code .properties} files and reports on
 * standard error how every file that is not pure US-ASCII decodes as UTF-8 and as ISO-8859-1.
 *
 * <p>Files located below a directory named {@code target} are skipped. Pure-ASCII files
 * produce no output.
 */
public class Main {

    /** Name of the directory whose contents are excluded from the scan. */
    private static final String EXCLUDED_DIRECTORY = "target";

    /**
     * Walks the tree rooted at the current working directory and classifies every matching file.
     *
     * @param args ignored
     * @throws IOException if the root directory cannot be opened for traversal
     * @throws UncheckedIOException if a directory cannot be listed or a file cannot be read
     *     during the walk
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Starting");
        // Files.find keeps directory handles open until the stream is closed, so it has to be
        // managed with try-with-resources.
        try (Stream<Path> candidates = Files.find(Paths.get(""), Integer.MAX_VALUE,
                (path, attributes) -> !attributes.isDirectory()
                        && path.toString().endsWith(".properties")
                        && !isBelowExcludedDirectory(path))) {
            candidates.forEach(Main::process);
        }
    }

    /**
     * Tells whether any ancestor directory of {@code path} is named {@code target}.
     *
     * <p>Name elements are compared instead of searching the string for {@code "/target/"}:
     * paths produced by walking from the empty path have no leading separator, so a top-level
     * {@code target/foo.properties} would slip through a substring check, and the separator is
     * platform dependent.
     */
    private static boolean isBelowExcludedDirectory(Path path) {
        Path parent = path.getParent();
        if (parent == null) {
            return false;
        }
        for (Path element : parent) {
            if (EXCLUDED_DIRECTORY.equals(element.toString())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads {@code path} fully and, unless its content is pure US-ASCII, prints one
     * classification line to standard error.
     *
     * @throws UncheckedIOException if the file cannot be read
     */
    private static void process(Path path) {
        String fileName = path.toString();
        byte[] contents;
        try {
            contents = Files.readAllBytes(path);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read " + fileName, e);
        }
        if (isEncoded(contents, StandardCharsets.US_ASCII)) {
            // Pure ASCII decodes identically in both charsets; nothing to report.
            return;
        }
        boolean isUtf8 = isEncoded(contents, StandardCharsets.UTF_8);
        // ISO-8859-1 assigns a character to every byte value, so this check is expected to
        // succeed for any input; cases 2 and 4 are kept only for completeness.
        boolean isIso88591 = isEncoded(contents, StandardCharsets.ISO_8859_1);
        if (isUtf8 && isIso88591) {
            System.err.println("Case 1: " + fileName + " is valid UTF-8 and valid ISO-8859-1");
        } else if (isUtf8) {
            System.err.println("Case 2: " + fileName + " is valid UTF-8 but not valid ISO-8859-1");
        } else if (isIso88591) {
            System.err.println("Case 3: " + fileName + " is not valid UTF-8 but is valid ISO-8859-1");
        } else {
            System.err.println("Case 4: " + fileName + " is neither valid ASCII nor valid UTF-8 nor valid ISO-8859-1");
        }
    }

    /**
     * Checks whether {@code bytes} decode without error in {@code charset}.
     *
     * @param bytes raw content to test
     * @param charset charset to decode with
     * @return {@code true} if decoding reports neither malformed input nor unmappable characters
     */
    private static boolean isEncoded(byte[] bytes, Charset charset) {
        // REPORT makes the decoder throw instead of silently substituting a replacement char.
        CharsetDecoder decoder = charset.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        try {
            decoder.decode(ByteBuffer.wrap(bytes));
            return true;
        } catch (CharacterCodingException e) {
            return false;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment