Skip to content

Instantly share code, notes, and snippets.

@PavelZaytsev
Created November 24, 2019 21:50
Show Gist options
  • Save PavelZaytsev/0aafa21203a27b5f2c8ee8815056214f to your computer and use it in GitHub Desktop.
Save PavelZaytsev/0aafa21203a27b5f2c8ee8815056214f to your computer and use it in GitHub Desktop.
package bitmanipulation;
import java.util.function.Predicate;
/**
 * Structural UTF-8 validator for data supplied as an array of ints, one byte value per element.
 *
 * <p>The check is purely structural: every lead byte must announce a 2-, 3- or 4-byte sequence
 * and be followed by the matching number of continuation bytes ({@code 10xxxxxx}). Overlong
 * encodings, surrogate code points and values above U+10FFFF are not rejected.
 *
 * <p>Each element is expected to hold a single byte value (0-255); higher bits are not masked
 * off, so larger values never match any of the bit patterns below.
 */
public class Utf8Validation {

    /** Kind of multi-byte sequence, named after the total number of bytes it occupies. */
    enum Order {
        TWO_BYTE,
        THREE_BYTE,
        FOUR_BYTE
    }

    /** Number of bits in one byte of input. */
    static final int totalBits = 8;

    /** Prefix of a continuation byte: binary {@code 10}. */
    static final int consecutiveByteMask = 1 << 1;

    /** Prefix of the lead byte of a 2-byte sequence: binary {@code 110}. */
    static final int twoByteMask = consecutiveByteMask | (1 << 2);

    /** Prefix of the lead byte of a 3-byte sequence: binary {@code 1110}. */
    static final int threeByteMask = twoByteMask | (1 << 3);

    /** Prefix of the lead byte of a 4-byte sequence: binary {@code 11110}. */
    static final int fourByteMask = threeByteMask | (1 << 4);

    /** Matches {@code 110xxxxx}: drop the 5 payload bits and compare the 3-bit prefix. */
    static final Predicate<Integer> isTwoByteChar =
            input -> (input >> (totalBits - 3)) == twoByteMask;

    /** Matches {@code 1110xxxx}: drop the 4 payload bits and compare the 4-bit prefix. */
    static final Predicate<Integer> isThreeByteChar =
            input -> (input >> (totalBits - 4)) == threeByteMask;

    /** Matches {@code 11110xxx}: drop the 3 payload bits and compare the 5-bit prefix. */
    static final Predicate<Integer> isFourByteChar =
            input -> (input >> (totalBits - 5)) == fourByteMask;

    /** Matches {@code 10xxxxxx}: drop the 6 payload bits and compare the 2-bit prefix. */
    static final Predicate<Integer> isConsecutiveByteChar =
            input -> (input >> (totalBits - 2)) == consecutiveByteMask;

    /**
     * Tells whether bit 7 of {@code input} is set, i.e. the value is not a 1-byte character.
     *
     * <p>Note that this is also true for continuation bytes; the caller rejects those because
     * they match none of the lead-byte predicates.
     *
     * @param input the byte value to inspect
     * @return {@code true} if bit 7 is 1
     */
    static boolean isNByteChar(int input) {
        return ((input >> 7) & 1) == 1;
    }

    /**
     * Checks that the lead byte at {@code pos} is followed by the number of continuation bytes
     * required by {@code order}.
     *
     * @param pos index of the lead byte in {@code array}
     * @param array the input data
     * @param order which kind of multi-byte sequence the lead byte announces
     * @return {@code true} if all required continuation bytes exist and match {@code 10xxxxxx};
     *     {@code false} if the sequence is truncated or a following byte has another prefix
     * @throws IllegalStateException if {@code order} is not one of the known constants
     */
    static boolean consecutiveBytesSet(int pos, int[] array, Order order) {
        int lastPos = pos + continuationCount(order);
        // lastPos is an index that will be read below, so it must be strictly inside the
        // array. Treating lastPos == array.length as in range made truncated input such as
        // {197} fail with ArrayIndexOutOfBoundsException instead of returning false.
        if (lastPos >= array.length) {
            return false;
        }
        for (int i = pos + 1; i <= lastPos; i++) {
            if (!isConsecutiveByteChar.test(array[i])) {
                return false;
            }
        }
        return true;
    }

    /** Returns how many continuation bytes follow the lead byte of the given sequence kind. */
    private static int continuationCount(Order order) {
        if (order == Order.TWO_BYTE) {
            return 1;
        }
        if (order == Order.THREE_BYTE) {
            return 2;
        }
        if (order == Order.FOUR_BYTE) {
            return 3;
        }
        throw new IllegalStateException("Order is invalid.");
    }

    /**
     * Walks the array one character at a time and verifies the structure of each sequence.
     *
     * @param array the input data, one byte value per element
     * @return {@code true} if every character is structurally well formed (an empty array is
     *     valid); {@code false} on the first malformed or truncated sequence
     */
    static boolean validUtf8Helper(int[] array) {
        int currentPos = 0;
        while (currentPos < array.length) {
            int currentInteger = array[currentPos];
            if (!isNByteChar(currentInteger)) {
                // MSB clear: a 1-byte character.
                currentPos += 1;
                continue;
            }

            Order order;
            int width;
            if (isTwoByteChar.test(currentInteger)) {
                order = Order.TWO_BYTE;
                width = 2;
            } else if (isThreeByteChar.test(currentInteger)) {
                order = Order.THREE_BYTE;
                width = 3;
            } else if (isFourByteChar.test(currentInteger)) {
                order = Order.FOUR_BYTE;
                width = 4;
            } else {
                // MSB set but no valid lead prefix: a stray continuation byte or 11111xxx.
                return false;
            }

            if (!consecutiveBytesSet(currentPos, array, order)) {
                return false;
            }
            // Skip the lead byte together with its continuation bytes.
            currentPos += width;
        }
        return true;
    }

    /**
     * Public entry point: validates that {@code data} is a structurally valid UTF-8 byte stream.
     *
     * @param data the input data, one byte value per element
     * @return {@code true} if the data is structurally valid UTF-8
     */
    public static boolean validUtf8(int[] data) {
        return validUtf8Helper(data);
    }

    /** Small demo: validates a 2-byte character followed by a 1-byte character. */
    public static void main(String[] args) {
        System.out.println(validUtf8(new int[] {197, 130, 1}));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment