Skip to content

Instantly share code, notes, and snippets.

@ShonFrazier
Created December 11, 2013 20:48
Show Gist options
  • Save ShonFrazier/7918192 to your computer and use it in GitHub Desktop.
Save ShonFrazier/7918192 to your computer and use it in GitHub Desktop.
Some utility code for examining strings for UTF-8-ness
/// High-bit patterns of a UTF-8 byte, named after the sequence length each
/// pattern introduces.
enum UTF8SequenceLength {
    /// 0xxxxxxx - single-byte (ASCII) sequence.
    Bytes1   = 0x00,
    /// 110xxxxx - lead byte of a two-byte sequence.
    Bytes2   = 0xC0,
    /// 1110xxxx - lead byte of a three-byte sequence.
    Bytes3   = 0xE0,
    /// 11110xxx - lead byte of a four-byte sequence.
    Bytes4   = 0xF0,
    /// 10xxxxxx - continuation byte; not valid at the start of a sequence.
    BytesErr = 0x80,
};
/// Scanner state: whether a multi-byte sequence is currently being read.
enum UTF8ParsingMode {
    /// Not inside a multi-byte sequence.
    UTF8ParsingModeNone = 0,
    /// A multi-byte sequence has been started (value 1).
    UTF8ParsingModeParsing
};
/// Mask that keeps only the high four bits of a byte.
#define UTF8SequenceMask (0xF0)

// Let both enums be spelled without the `enum` keyword.
typedef enum UTF8ParsingMode UTF8ParsingMode;
typedef enum UTF8SequenceLength UTF8SequenceLength;
@implementation NSMutableString (AppendBinaryString)

/// Appends the eight bits of `c` to the receiver as the characters '1' and
/// '0', most significant bit first (e.g. 0x41 appends "01000001").
///
/// @param c The byte to render. Only its bit pattern matters; the sign of
///          `char` on the target platform does not affect the output.
- (void)appendBinaryStringForValue:(char)c {
    // Work on an unsigned copy and walk a mask instead of shifting the value:
    // left-shifting a negative signed value (any byte with the top bit set,
    // where `char` is signed) is undefined behavior in C.
    const unsigned char bits = (unsigned char)c;
    for (unsigned char mask = 0x80; mask != 0; mask >>= 1) {
        [self appendString:(bits & mask) ? @"1" : @"0"];
    }
}

@end
/// Returns how many bytes (1-4) the UTF-8 sequence that starts with
/// `leadByte` occupies, or 0 when `leadByte` cannot start a sequence
/// (a 10xxxxxx continuation byte or an 11111xxx byte).
static NSUInteger UTF8DumpSequenceLengthForLeadByte(unsigned char leadByte) {
    // Every form has a different number of fixed prefix bits, so each one is
    // tested with its own mask. A single 0xF0 mask misreads two-byte lead
    // bytes 0xD0-0xDF (110*1*xxxx) as single-byte characters.
    if ((leadByte & 0x80) == 0x00) return 1;  // 0xxxxxxx
    if ((leadByte & 0xE0) == 0xC0) return 2;  // 110xxxxx
    if ((leadByte & 0xF0) == 0xE0) return 3;  // 1110xxxx
    if ((leadByte & 0xF8) == 0xF0) return 4;  // 11110xxx
    return 0;
}

@implementation NSString (UTF8Dump)

/// Builds a human-readable dump of the receiver's UTF-8 encoding.
///
/// Each encoded character produces one line: the character itself, a space,
/// then every byte of its UTF-8 sequence as eight binary digits, bytes
/// separated by single spaces, terminated by "\n".
///
/// @return The dump, or nil when the receiver cannot be converted to UTF-8.
- (NSString *)dumpUTF8Decode {
    NSData *data = [self dataUsingEncoding:NSUTF8StringEncoding];
    if (!data) return nil;

    NSMutableString *result = [NSMutableString string];
    const unsigned char *bytes = data.bytes;
    const NSUInteger byteLength = data.length;
    const NSUInteger unitCount = self.length;

    NSUInteger stringIndex = 0;     // next UTF-16 unit of the receiver to label a line with
    NSUInteger remainingBytes = 0;  // bytes still expected in the current sequence

    for (NSUInteger i = 0; i < byteLength; i++) {
        const unsigned char byte = bytes[i];

        if (remainingBytes == 0) {
            // Start of a new sequence: classify the lead byte first, then
            // emit the matching character as the line label.
            NSUInteger sequenceLength = UTF8DumpSequenceLengthForLeadByte(byte);
            if (sequenceLength == 0) {
                // Stray byte: flag it and dump it on a line of its own so the
                // counter below never wraps around.
                [result appendString:@" err "];
                sequenceLength = 1;
            }
            else {
                // A four-byte sequence encodes a code point outside the BMP,
                // which NSString stores as a surrogate pair (two UTF-16 units).
                NSUInteger unitsForCharacter = (sequenceLength == 4) ? 2 : 1;
                // Defensive clamp so a mismatch can never index past the end.
                unitsForCharacter = MIN(unitsForCharacter, unitCount - stringIndex);
                NSRange characterRange = NSMakeRange(stringIndex, unitsForCharacter);
                [result appendFormat:@"%@ ", [self substringWithRange:characterRange]];
                stringIndex += unitsForCharacter;
            }
            remainingBytes = sequenceLength;
        }

        [result appendBinaryStringForValue:(char)byte];
        remainingBytes--;
        [result appendString:(remainingBytes == 0) ? @"\n" : @" "];
    }
    return result;
}

@end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment