Created
November 24, 2019 21:50
-
-
Save PavelZaytsev/0aafa21203a27b5f2c8ee8815056214f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package bitmanipulation; | |
import java.util.function.Predicate; | |
/**
 * Checks whether an array of integers, each holding one byte, forms a
 * sequence of well-formed 1- to 4-byte UTF-8 characters.
 *
 * <p>Only the structural bit patterns are checked: a lead byte of the form
 * {@code 0xxxxxxx}, {@code 110xxxxx}, {@code 1110xxxx} or {@code 11110xxx},
 * followed by the matching number of {@code 10xxxxxx} continuation bytes.
 *
 * <p>NOTE(review): elements are not masked to their low 8 bits, so the
 * checks assume every element is in the range 0..255 - confirm with callers.
 */
public class Utf8Validation {

    /** Width class of a multi-byte character, named after its total byte count. */
    enum Order {
        TWO_BYTE,
        THREE_BYTE,
        FOUR_BYTE
    }

    // 8 bits in total.
    static final int totalBits = 8;

    // Continuation byte prefix: binary 10.
    static final int consecutiveByteMask = 1 << 1;

    // Two-byte lead prefix: binary 110.
    static final int twoByteMask = consecutiveByteMask | (1 << 2);

    // Three-byte lead prefix: binary 1110.
    static final int threeByteMask = twoByteMask | (1 << 3);

    // Four-byte lead prefix: binary 11110.
    static final int fourByteMask = threeByteMask | (1 << 4);

    // Prefix is 3 bits wide: drop the low 5 bits and compare.
    static final Predicate<Integer> isTwoByteChar =
            input -> (input >> (totalBits - 3)) == twoByteMask;

    // Prefix is 4 bits wide: drop the low 4 bits and compare.
    static final Predicate<Integer> isThreeByteChar =
            input -> (input >> (totalBits - 4)) == threeByteMask;

    // Prefix is 5 bits wide: drop the low 3 bits and compare.
    static final Predicate<Integer> isFourByteChar =
            input -> (input >> (totalBits - 5)) == fourByteMask;

    // Prefix is 2 bits wide: drop the low 6 bits and compare.
    static final Predicate<Integer> isConsecutiveByteChar =
            input -> (input >> (totalBits - 2)) == consecutiveByteMask;

    /**
     * Tells whether bit 7 of {@code input} is set, i.e. the value is not a
     * single-byte ({@code 0xxxxxxx}) character.
     *
     * @param input the value to inspect
     * @return {@code true} if bit 7 is 1
     */
    static boolean isNByteChar(int input) {
        return ((input >> 7) & 1) == 1;
    }

    /**
     * Checks that the lead byte at {@code pos} is followed by the number of
     * continuation bytes required by {@code order} (1, 2 or 3).
     *
     * @param pos   index of the lead byte in {@code array}
     * @param array the values being validated
     * @param order width class of the character that starts at {@code pos}
     * @return {@code true} if every required continuation byte exists and has
     *         the {@code 10xxxxxx} prefix; {@code false} if the array ends too
     *         early or any of those bytes has a different prefix
     * @throws IllegalStateException if {@code order} is not one of the known
     *                               constants (including {@code null})
     */
    static boolean consecutiveBytesSet(int pos, int[] array, Order order) {
        final int lastPos;
        if (order == Order.TWO_BYTE) {
            lastPos = pos + 1;
        } else if (order == Order.THREE_BYTE) {
            lastPos = pos + 2;
        } else if (order == Order.FOUR_BYTE) {
            lastPos = pos + 3;
        } else {
            throw new IllegalStateException("Order is invalid.");
        }

        // lastPos is an index that gets read below, so it must be strictly
        // less than the length; a truncated sequence is simply invalid.
        if (lastPos >= array.length) {
            return false;
        }

        for (int i = pos + 1; i <= lastPos; i++) {
            if (!isConsecutiveByteChar.test(array[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Walks {@code array} from left to right, consuming one character at a
     * time and rejecting the input at the first malformed character.
     *
     * @param array the values to validate, one byte per element
     * @return {@code true} if the whole array is consumed without finding a
     *         malformed character (an empty array yields {@code true})
     */
    static boolean validUtf8Helper(int[] array) {
        int currentPos = 0;
        while (currentPos < array.length) {
            int currentInteger = array[currentPos];

            if (!isNByteChar(currentInteger)) {
                // Single-byte character.
                currentPos += 1;
                continue;
            }

            final Order order;
            final int width;
            if (isTwoByteChar.test(currentInteger)) {
                order = Order.TWO_BYTE;
                width = 2;
            } else if (isThreeByteChar.test(currentInteger)) {
                order = Order.THREE_BYTE;
                width = 3;
            } else if (isFourByteChar.test(currentInteger)) {
                order = Order.FOUR_BYTE;
                width = 4;
            } else {
                // High bit set but no recognised lead prefix, e.g. a stray
                // continuation byte or a 11111xxx value.
                return false;
            }

            if (!consecutiveBytesSet(currentPos, array, order)) {
                return false;
            }
            // Skip the lead byte together with its continuation bytes.
            currentPos += width;
        }
        return true;
    }

    /**
     * Validates that {@code data} is a sequence of structurally well-formed
     * UTF-8 characters.
     *
     * @param data the values to validate, one byte per element
     * @return {@code true} if every character in {@code data} is well formed
     */
    public static boolean validUtf8(int[] data) {
        return validUtf8Helper(data);
    }

    /** Prints the validation result for a small sample input. */
    public static void main(String[] args) {
        System.out.println(validUtf8(new int[] {197, 130, 1}));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment