Skip to content

Instantly share code, notes, and snippets.

@astaxie
Created July 18, 2012 15:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save astaxie/3137091 to your computer and use it in GitHub Desktop.
Save astaxie/3137091 to your computer and use it in GitHub Desktop.
golang IsTextUTF8
// IsTextUTF8 reports whether inputStream is well-formed UTF-8 that contains
// at least one multi-byte (non-ASCII) sequence.
//
// Input consisting solely of ASCII bytes (including empty or nil input) yields
// false: although ASCII is valid UTF-8, such a stream is regarded as being in
// the default encoding rather than as UTF-8 text.
//
// Validation is strict. In addition to checking that every lead byte is
// followed by the right number of 10xxxxxx continuation bytes, it rejects
// overlong encodings (e.g. C0 80), UTF-16 surrogates (ED A0..BF xx), code
// points above U+10FFFF, and the lead bytes F5..FF, none of which may appear
// in well-formed UTF-8.
func IsTextUTF8(inputStream []byte) bool {
	sawMultiByte := false

	for i := 0; i < len(inputStream); {
		lead := inputStream[i]
		if lead < 0x80 {
			// ASCII, 0x00-0x7F: a complete one-byte sequence.
			i++
			continue
		}

		// size is the total length of the sequence started by lead.
		// lo and hi bound the first continuation byte; they are narrower
		// than 80..BF for the lead bytes where the full range would admit
		// overlong forms, surrogates, or values beyond U+10FFFF.
		var size int
		lo, hi := byte(0x80), byte(0xBF)
		switch {
		case lead >= 0xC2 && lead <= 0xDF:
			size = 2
		case lead == 0xE0:
			size, lo = 3, 0xA0 // below A0 would be an overlong 3-byte form
		case lead == 0xED:
			size, hi = 3, 0x9F // above 9F would encode U+D800..U+DFFF
		case lead >= 0xE1 && lead <= 0xEF:
			size = 3
		case lead == 0xF0:
			size, lo = 4, 0x90 // below 90 would be an overlong 4-byte form
		case lead >= 0xF1 && lead <= 0xF3:
			size = 4
		case lead == 0xF4:
			size, hi = 4, 0x8F // above 8F would exceed U+10FFFF
		default:
			// Stray continuation byte (80..BF), overlong 2-byte lead
			// (C0, C1), or a byte that never occurs in UTF-8 (F5..FF).
			return false
		}

		if len(inputStream)-i < size {
			// The sequence is truncated by the end of the input.
			return false
		}
		if c := inputStream[i+1]; c < lo || c > hi {
			return false
		}
		// Any remaining continuation bytes must have the form 10xxxxxx.
		for _, c := range inputStream[i+2 : i+size] {
			if c&0xC0 != 0x80 {
				return false
			}
		}

		sawMultiByte = true
		i += size
	}

	return sawMultiByte
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment