Skip to content

Instantly share code, notes, and snippets.

@hikui
Last active November 5, 2017 13:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save hikui/9408758 to your computer and use it in GitHub Desktop.
Save hikui/9408758 to your computer and use it in GitHub Desktop.
如果在GB18030编码的文字中,有几个不合法的字节。本类可以将这些不合法的字节变成“?”,防止转NSString时出现nil。
//
// NSData+HG_DataHealing.h
// HGDataHealing
//
// Created by 缪 和光 on 14-3-7.
// Copyright (c) 2014年 Hokuang. All rights reserved.
//
#import <Foundation/Foundation.h>
@interface NSData (HG_DataHealing)
- (NSString *)GB18030String;
@end
//
// NSData+HG_DataHealing.m
// HGDataHealing
//
// Created by 缪 和光 on 14-3-7.
// Copyright (c) 2014年 Hokuang. All rights reserved.
//
#import "NSData+HG_DataHealing.h"
@implementation NSData (HG_DataHealing)
- (NSString *)GB18030String {
NSStringEncoding enc = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000);
NSString *str = [[NSString alloc]initWithData:self encoding:enc];
if (!str) {
str = [[NSString alloc]initWithData:[self dataByHealingGB18030Stream] encoding:enc];
}
return str;
}
- (NSData *)dataByHealingGB18030Stream {
NSUInteger length = [self length];
if (length == 0) {
return self;
}
static NSString * replacementCharacter = @"?";
NSStringEncoding enc = CFStringConvertEncodingToNSStringEncoding(kCFStringEncodingGB_18030_2000);
NSData *replacementCharacterData = [replacementCharacter dataUsingEncoding:enc];
NSMutableData *resultData = [NSMutableData dataWithCapacity:self.length];
const Byte *bytes = [self bytes];
static const NSUInteger bufferMaxSize = 1024;
Byte buffer[bufferMaxSize];
NSUInteger bufferIndex = 0;
NSUInteger byteIndex = 0;
BOOL invalidByte = NO;
#define FlushBuffer() if (bufferIndex > 0) { \
[resultData appendBytes:buffer length:bufferIndex]; \
bufferIndex = 0; \
}
#define CheckBuffer() if ((bufferIndex+5) >= bufferMaxSize) { \
[resultData appendBytes:buffer length:bufferIndex]; \
bufferIndex = 0; \
}
while (byteIndex < length) {
Byte byte = bytes[byteIndex];
//检查第一位
if (byte >= 0 && byte <= (Byte)0x7f) {
//单字节文字
CheckBuffer();
buffer[bufferIndex++] = byte;
} else if (byte >= (Byte)0x81 && byte <= (Byte)0xfe){
//可能是双字节,可能是四字节
if (byteIndex + 1 >= length) {
//这是最后一个字节了,但是这个字节表明后面应该还有1或3个字节,那么这个字节一定是错误字节
FlushBuffer();
return resultData;
}
Byte byte2 = bytes[++byteIndex];
if (byte2 >= (Byte)0x40 && byte <= (Byte)0xfe && byte != (Byte)0x7f) {
//是双字节,并且可能合法
Byte tuple[] = {byte, byte2};
CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 2, kCFStringEncodingGB_18030_2000, false);
if (cfstr) {
CFRelease(cfstr);
CheckBuffer();
buffer[bufferIndex++] = byte;
buffer[bufferIndex++] = byte2;
} else {
//这个双字节字符不合法,但byte2可能是下一字符的第一字节
byteIndex -= 1;
invalidByte = YES;
}
} else if (byte2 >= (Byte)0x30 && byte2 <= (Byte)0x39) {
//可能是四字节
if (byteIndex + 2 >= length) {
FlushBuffer();
return resultData;
}
Byte byte3 = bytes[++byteIndex];
if (byte3 >= (Byte)0x81 && byte3 <= (Byte)0xfe) {
// 第三位合法,判断第四位
Byte byte4 = bytes[++byteIndex];
if (byte4 >= (Byte)0x30 && byte4 <= (Byte)0x39) {
//第四位可能合法
Byte tuple[] = {byte, byte2, byte3, byte4};
CFStringRef cfstr = CFStringCreateWithBytes(kCFAllocatorDefault, tuple, 4, kCFStringEncodingGB_18030_2000, false);
if (cfstr) {
CFRelease(cfstr);
CheckBuffer();
buffer[bufferIndex++] = byte;
buffer[bufferIndex++] = byte2;
buffer[bufferIndex++] = byte3;
buffer[bufferIndex++] = byte4;
} else {
//这个四字节字符不合法,但是byte2可能是下一个合法字符的第一字节,回退3位
//并且将byte1,byte2用?替代
byteIndex -= 3;
invalidByte = YES;
}
} else {
//第四字节不合法
byteIndex -= 3;
invalidByte = YES;
}
} else {
// 第三字节不合法
byteIndex -= 2;
invalidByte = YES;
}
} else {
// 第二字节不是合法的第二位,但可能是下一个合法的第一位,所以回退一个byte
invalidByte = YES;
byteIndex -= 1;
}
if (invalidByte) {
invalidByte = NO;
FlushBuffer();
[resultData appendData:replacementCharacterData];
}
}
byteIndex++;
}
FlushBuffer();
return resultData;
}
@end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment