Skip to content

Instantly share code, notes, and snippets.

@oleganza
Created January 16, 2011 13:18
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save oleganza/781772 to your computer and use it in GitHub Desktop.
Save oleganza/781772 to your computer and use it in GitHub Desktop.
#import "NSData+OADataHelpers.h"
// Clang provides __has_feature; compilers that lack it (e.g. GCC) have no ARC either,
// so treating every feature as absent is correct there.
#ifndef __has_feature
#define __has_feature(x) 0
#endif

// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER. Spelled out as bytes so the result
// does not depend on the encoding this source file happens to be saved in.
static const unsigned char OAUTF8ReplacementCharacter[] = { 0xEF, 0xBF, 0xBD };

// Examines the UTF-8 sequence that starts at bytes[0].
//
// bytes          - pointer to the candidate lead byte; must not be NULL.
// available      - number of readable bytes starting at `bytes`; must be >= 1.
// outWellFormed  - set to YES when a complete, well-formed sequence was found,
//                  NO otherwise.
//
// Returns the number of bytes the caller should advance by (always >= 1):
// the full sequence length when well-formed, otherwise the length of the broken
// prefix (lead byte plus any continuation bytes that were still acceptable).
// The first byte that does not fit the sequence is never consumed, so the caller
// resynchronises on it instead of losing it.
//
// Well-formed ranges (Unicode Standard, Table 3-7):
//   00..7F
//   C2..DF  80..BF
//   E0      A0..BF  80..BF          (80..9F would be an overlong form)
//   E1..EC  80..BF  80..BF
//   ED      80..9F  80..BF          (A0..BF would encode a UTF-16 surrogate)
//   EE..EF  80..BF  80..BF
//   F0      90..BF  80..BF  80..BF  (80..8F would be an overlong form)
//   F1..F3  80..BF  80..BF  80..BF
//   F4      80..8F  80..BF  80..BF  (90..BF would exceed U+10FFFF)
// Everything else (lone continuation bytes 80..BF, C0, C1, F5..FF — including the
// obsolete 5- and 6-byte lead bytes) is ill-formed.
static NSUInteger OAUTF8ScanSequence(const unsigned char *bytes, NSUInteger available, BOOL *outWellFormed)
{
    const unsigned char lead = bytes[0];
    NSUInteger expectedLength = 0;
    unsigned char secondMin = 0x80;
    unsigned char secondMax = 0xBF;

    if (lead <= 0x7F)
    {
        *outWellFormed = YES;
        return 1;
    }
    else if (lead >= 0xC2 && lead <= 0xDF)
    {
        expectedLength = 2;
    }
    else if (lead >= 0xE0 && lead <= 0xEF)
    {
        expectedLength = 3;
        if (lead == 0xE0) secondMin = 0xA0;
        if (lead == 0xED) secondMax = 0x9F;
    }
    else if (lead >= 0xF0 && lead <= 0xF4)
    {
        expectedLength = 4;
        if (lead == 0xF0) secondMin = 0x90;
        if (lead == 0xF4) secondMax = 0x8F;
    }
    else
    {
        *outWellFormed = NO;
        return 1;
    }

    NSUInteger scanned = 1;
    while (scanned < expectedLength && scanned < available)
    {
        const unsigned char trail = bytes[scanned];
        // Only the second byte has a lead-dependent range; later ones are plain 10xxxxxx.
        const unsigned char lowest  = (scanned == 1) ? secondMin : 0x80;
        const unsigned char highest = (scanned == 1) ? secondMax : 0xBF;
        if (trail < lowest || trail > highest)
        {
            break;
        }
        scanned++;
    }

    // Falling short means either an unexpected byte or the data ended mid-sequence.
    *outWellFormed = (scanned == expectedLength);
    return scanned;
}

@implementation NSData (OADataHelpers)

// Decodes the receiver as UTF-8 text after passing it through -dataByHealingUTF8Stream,
// so ill-formed input yields a string containing U+FFFD characters rather than a failed
// decode. Returns an autoreleased string under manual reference counting.
- (NSString*) UTF8String
{
    NSString *string = [[NSString alloc] initWithData:[self dataByHealingUTF8Stream]
                                             encoding:NSUTF8StringEncoding];
#if !__has_feature(objc_arc)
    [string autorelease];
#endif
    return string;
}

// Replaces all broken sequences by the U+FFFD replacement character and returns
// NSData with valid UTF-8 bytes.
//
// - Well-formed sequences are copied through unchanged.
// - Each ill-formed piece (a stray continuation byte, an invalid lead byte, an overlong
//   form, an encoded surrogate, a code point above U+10FFFF, or a sequence cut short by
//   an unexpected byte or by the end of the data) is replaced by one U+FFFD.
// - The byte that interrupts a broken sequence is not discarded; it is examined again
//   as the start of the next sequence, so valid text following garbage is preserved.
//
// Returns an empty data object when the receiver is empty.
- (NSData*) dataByHealingUTF8Stream
{
    const unsigned char *bytes = (const unsigned char *)[self bytes];
    const NSUInteger length = [self length];
    NSMutableData *resultData = [NSMutableData dataWithCapacity:length];

    // [runStart, index) is the pending stretch of well-formed bytes not yet copied.
    // Copying whole runs straight from the source avoids an intermediate buffer.
    NSUInteger runStart = 0;
    NSUInteger index = 0;

    while (index < length)
    {
        BOOL wellFormed = NO;
        const NSUInteger scanned = OAUTF8ScanSequence(bytes + index, length - index, &wellFormed);

        if (!wellFormed)
        {
            if (index > runStart)
            {
                [resultData appendBytes:bytes + runStart length:index - runStart];
            }
            [resultData appendBytes:OAUTF8ReplacementCharacter
                             length:sizeof(OAUTF8ReplacementCharacter)];
            runStart = index + scanned;
        }

        index += scanned;
    }

    if (length > runStart)
    {
        [resultData appendBytes:bytes + runStart length:length - runStart];
    }

    return resultData;
}

@end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment