Created
February 27, 2018 10:28
-
-
Save AmatsuZero/b95f1332f5d4bc7035bef3b1ce18c339 to your computer and use it in GitHub Desktop.
Normalize HTML String
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import <Foundation/Foundation.h> | |
/// Utilities for NSStrings containing HTML.
@interface NSString (GTMNSStringHTMLAdditions)

/// Returns a string in which the characters that need escaping for HTML have
/// been replaced by their named entities.
///
/// For example, '&' becomes '&amp;'. Only the characters from table A.2.2 of
/// http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters are
/// escaped, which is what you want for a Unicode-encoded web page. If you
/// have an ASCII or non-encoded web page, use
/// -gtm_stringByEscapingForAsciiHTML instead, which encodes all characters.
///
/// This call is only safe once: escaping an already-escaped string escapes
/// the '&' of every entity a second time.
///
/// @return The escaped string. An empty receiver is returned as-is.
- (NSString *)gtm_stringByEscapingForHTML;

/// Returns a string in which the characters that need escaping for HTML have
/// been replaced so that the result contains only ASCII.
///
/// For example, '&' becomes '&amp;'. Characters that have a named entity
/// (an &keyword; mapping) use it; every other non-ASCII character is
/// converted to the appropriate numeric &#xxx; reference. If your web page is
/// Unicode encoded (UTF-16 or UTF-8) use -gtm_stringByEscapingForHTML
/// instead, as it is faster and produces less bloated and more readable HTML
/// (as long as you are using a Unicode-compliant HTML reader).
///
/// This call is only safe once: escaping an already-escaped string escapes
/// the '&' of every entity a second time.
///
/// @return The escaped string. An empty receiver is returned as-is.
- (NSString *)gtm_stringByEscapingForAsciiHTML;

/// Returns a string in which HTML escape sequences have been replaced by the
/// characters they stand for.
///
/// For example, '&amp;' becomes '&'.
/// Handles the numeric &#32; (decimal) and &#x32; (hexadecimal) forms as
/// well. Sequences that are not recognized are left untouched.
///
/// @return The unescaped string. A receiver without any '&' is returned
///         as-is.
- (NSString *)gtm_stringByUnescapingFromHTML;

@end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// NSString+GTMNSStringHTMLAdditions.m | |
// MockingBotSketchPlugin | |
// | |
// Created by modao on 2018/2/27. | |
// Copyright © 2018年 MockingBot. All rights reserved. | |
// | |
#import "NSString+GTMNSStringHTMLAdditions.h" | |
// One row of an escape table: pairs a UTF-16 code unit with the HTML entity
// that represents it.
typedef struct {
  // The complete entity text, including the leading '&' and trailing ';'.
  // ARC does not manage object pointers stored in plain C structs, hence
  // __unsafe_unretained; the tables in this file only store @"" literals.
  __unsafe_unretained NSString *escapeSequence;
  // The UTF-16 code unit the entity stands for.
  unichar uchar;
} HTMLEscapeMap;
// Named entities for every character that has one in XHTML 1.0.
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
//
// Each escapeSequence must be the full entity text ("&name;"), never the
// literal character it stands for: the escaping code appends the sequence
// verbatim and the unescaping code compares input against it.
//
// Ordered by uchar lowest to highest for bsearching; keep it sorted when
// adding entries.
static HTMLEscapeMap gAsciiHTMLEscapeMap[] = {
  // A.2.2. Special characters
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // A.2.1. Latin-1 characters
  { @"&nbsp;", 160 },
  { @"&iexcl;", 161 },
  { @"&cent;", 162 },
  { @"&pound;", 163 },
  { @"&curren;", 164 },
  { @"&yen;", 165 },
  { @"&brvbar;", 166 },
  { @"&sect;", 167 },
  { @"&uml;", 168 },
  { @"&copy;", 169 },
  { @"&ordf;", 170 },
  { @"&laquo;", 171 },
  { @"&not;", 172 },
  { @"&shy;", 173 },
  { @"&reg;", 174 },
  { @"&macr;", 175 },
  { @"&deg;", 176 },
  { @"&plusmn;", 177 },
  { @"&sup2;", 178 },
  { @"&sup3;", 179 },
  { @"&acute;", 180 },
  { @"&micro;", 181 },
  { @"&para;", 182 },
  { @"&middot;", 183 },
  { @"&cedil;", 184 },
  { @"&sup1;", 185 },
  { @"&ordm;", 186 },
  { @"&raquo;", 187 },
  { @"&frac14;", 188 },
  { @"&frac12;", 189 },
  { @"&frac34;", 190 },
  { @"&iquest;", 191 },
  { @"&Agrave;", 192 },
  { @"&Aacute;", 193 },
  { @"&Acirc;", 194 },
  { @"&Atilde;", 195 },
  { @"&Auml;", 196 },
  { @"&Aring;", 197 },
  { @"&AElig;", 198 },
  { @"&Ccedil;", 199 },
  { @"&Egrave;", 200 },
  { @"&Eacute;", 201 },
  { @"&Ecirc;", 202 },
  { @"&Euml;", 203 },
  { @"&Igrave;", 204 },
  { @"&Iacute;", 205 },
  { @"&Icirc;", 206 },
  { @"&Iuml;", 207 },
  { @"&ETH;", 208 },
  { @"&Ntilde;", 209 },
  { @"&Ograve;", 210 },
  { @"&Oacute;", 211 },
  { @"&Ocirc;", 212 },
  { @"&Otilde;", 213 },
  { @"&Ouml;", 214 },
  { @"&times;", 215 },
  { @"&Oslash;", 216 },
  { @"&Ugrave;", 217 },
  { @"&Uacute;", 218 },
  { @"&Ucirc;", 219 },
  { @"&Uuml;", 220 },
  { @"&Yacute;", 221 },
  { @"&THORN;", 222 },
  { @"&szlig;", 223 },
  { @"&agrave;", 224 },
  { @"&aacute;", 225 },
  { @"&acirc;", 226 },
  { @"&atilde;", 227 },
  { @"&auml;", 228 },
  { @"&aring;", 229 },
  { @"&aelig;", 230 },
  { @"&ccedil;", 231 },
  { @"&egrave;", 232 },
  { @"&eacute;", 233 },
  { @"&ecirc;", 234 },
  { @"&euml;", 235 },
  { @"&igrave;", 236 },
  { @"&iacute;", 237 },
  { @"&icirc;", 238 },
  { @"&iuml;", 239 },
  { @"&eth;", 240 },
  { @"&ntilde;", 241 },
  { @"&ograve;", 242 },
  { @"&oacute;", 243 },
  { @"&ocirc;", 244 },
  { @"&otilde;", 245 },
  { @"&ouml;", 246 },
  { @"&divide;", 247 },
  { @"&oslash;", 248 },
  { @"&ugrave;", 249 },
  { @"&uacute;", 250 },
  { @"&ucirc;", 251 },
  { @"&uuml;", 252 },
  { @"&yacute;", 253 },
  { @"&thorn;", 254 },
  { @"&yuml;", 255 },

  // A.2.2. Special characters cont'd
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // A.2.3. Symbols
  { @"&fnof;", 402 },

  // A.2.2. Special characters cont'd
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // A.2.3. Symbols cont'd
  { @"&Alpha;", 913 },
  { @"&Beta;", 914 },
  { @"&Gamma;", 915 },
  { @"&Delta;", 916 },
  { @"&Epsilon;", 917 },
  { @"&Zeta;", 918 },
  { @"&Eta;", 919 },
  { @"&Theta;", 920 },
  { @"&Iota;", 921 },
  { @"&Kappa;", 922 },
  { @"&Lambda;", 923 },
  { @"&Mu;", 924 },
  { @"&Nu;", 925 },
  { @"&Xi;", 926 },
  { @"&Omicron;", 927 },
  { @"&Pi;", 928 },
  { @"&Rho;", 929 },
  { @"&Sigma;", 931 },
  { @"&Tau;", 932 },
  { @"&Upsilon;", 933 },
  { @"&Phi;", 934 },
  { @"&Chi;", 935 },
  { @"&Psi;", 936 },
  { @"&Omega;", 937 },
  { @"&alpha;", 945 },
  { @"&beta;", 946 },
  { @"&gamma;", 947 },
  { @"&delta;", 948 },
  { @"&epsilon;", 949 },
  { @"&zeta;", 950 },
  { @"&eta;", 951 },
  { @"&theta;", 952 },
  { @"&iota;", 953 },
  { @"&kappa;", 954 },
  { @"&lambda;", 955 },
  { @"&mu;", 956 },
  { @"&nu;", 957 },
  { @"&xi;", 958 },
  { @"&omicron;", 959 },
  { @"&pi;", 960 },
  { @"&rho;", 961 },
  { @"&sigmaf;", 962 },
  { @"&sigma;", 963 },
  { @"&tau;", 964 },
  { @"&upsilon;", 965 },
  { @"&phi;", 966 },
  { @"&chi;", 967 },
  { @"&psi;", 968 },
  { @"&omega;", 969 },
  { @"&thetasym;", 977 },
  { @"&upsih;", 978 },
  { @"&piv;", 982 },

  // A.2.2. Special characters cont'd
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },

  // A.2.3. Symbols cont'd
  { @"&bull;", 8226 },
  { @"&hellip;", 8230 },

  // A.2.2. Special characters cont'd
  { @"&permil;", 8240 },

  // A.2.3. Symbols cont'd
  { @"&prime;", 8242 },
  { @"&Prime;", 8243 },

  // A.2.2. Special characters cont'd
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },

  // A.2.3. Symbols cont'd
  { @"&oline;", 8254 },
  { @"&frasl;", 8260 },

  // A.2.2. Special characters cont'd
  { @"&euro;", 8364 },

  // A.2.3. Symbols cont'd
  { @"&image;", 8465 },
  { @"&weierp;", 8472 },
  { @"&real;", 8476 },
  { @"&trade;", 8482 },
  { @"&alefsym;", 8501 },
  { @"&larr;", 8592 },
  { @"&uarr;", 8593 },
  { @"&rarr;", 8594 },
  { @"&darr;", 8595 },
  { @"&harr;", 8596 },
  { @"&crarr;", 8629 },
  { @"&lArr;", 8656 },
  { @"&uArr;", 8657 },
  { @"&rArr;", 8658 },
  { @"&dArr;", 8659 },
  { @"&hArr;", 8660 },
  { @"&forall;", 8704 },
  { @"&part;", 8706 },
  { @"&exist;", 8707 },
  { @"&empty;", 8709 },
  { @"&nabla;", 8711 },
  { @"&isin;", 8712 },
  { @"&notin;", 8713 },
  { @"&ni;", 8715 },
  { @"&prod;", 8719 },
  { @"&sum;", 8721 },
  { @"&minus;", 8722 },
  { @"&lowast;", 8727 },
  { @"&radic;", 8730 },
  { @"&prop;", 8733 },
  { @"&infin;", 8734 },
  { @"&ang;", 8736 },
  { @"&and;", 8743 },
  { @"&or;", 8744 },
  { @"&cap;", 8745 },
  { @"&cup;", 8746 },
  { @"&int;", 8747 },
  { @"&there4;", 8756 },
  { @"&sim;", 8764 },
  { @"&cong;", 8773 },
  { @"&asymp;", 8776 },
  { @"&ne;", 8800 },
  { @"&equiv;", 8801 },
  { @"&le;", 8804 },
  { @"&ge;", 8805 },
  { @"&sub;", 8834 },
  { @"&sup;", 8835 },
  { @"&nsub;", 8836 },
  { @"&sube;", 8838 },
  { @"&supe;", 8839 },
  { @"&oplus;", 8853 },
  { @"&otimes;", 8855 },
  { @"&perp;", 8869 },
  { @"&sdot;", 8901 },
  { @"&lceil;", 8968 },
  { @"&rceil;", 8969 },
  { @"&lfloor;", 8970 },
  { @"&rfloor;", 8971 },
  { @"&lang;", 9001 },
  { @"&rang;", 9002 },
  { @"&loz;", 9674 },
  { @"&spades;", 9824 },
  { @"&clubs;", 9827 },
  { @"&hearts;", 9829 },
  { @"&diams;", 9830 }
};
// Named entities for the characters that must be escaped even on a
// Unicode-encoded page.
// Taken from http://www.w3.org/TR/xhtml1/dtds.html#a_dtd_Special_characters
// This is table A.2.2 Special Characters
//
// Each escapeSequence must be the full entity text ("&name;"), never the
// literal character it stands for. Ordered by uchar lowest to highest for
// bsearching; keep it sorted when adding entries.
static HTMLEscapeMap gUnicodeHTMLEscapeMap[] = {
  // C0 Controls and Basic Latin
  { @"&quot;", 34 },
  { @"&amp;", 38 },
  { @"&apos;", 39 },
  { @"&lt;", 60 },
  { @"&gt;", 62 },

  // Latin Extended-A
  { @"&OElig;", 338 },
  { @"&oelig;", 339 },
  { @"&Scaron;", 352 },
  { @"&scaron;", 353 },
  { @"&Yuml;", 376 },

  // Spacing Modifier Letters
  { @"&circ;", 710 },
  { @"&tilde;", 732 },

  // General Punctuation
  { @"&ensp;", 8194 },
  { @"&emsp;", 8195 },
  { @"&thinsp;", 8201 },
  { @"&zwnj;", 8204 },
  { @"&zwj;", 8205 },
  { @"&lrm;", 8206 },
  { @"&rlm;", 8207 },
  { @"&ndash;", 8211 },
  { @"&mdash;", 8212 },
  { @"&lsquo;", 8216 },
  { @"&rsquo;", 8217 },
  { @"&sbquo;", 8218 },
  { @"&ldquo;", 8220 },
  { @"&rdquo;", 8221 },
  { @"&bdquo;", 8222 },
  { @"&dagger;", 8224 },
  { @"&Dagger;", 8225 },
  { @"&permil;", 8240 },
  { @"&lsaquo;", 8249 },
  { @"&rsaquo;", 8250 },
  { @"&euro;", 8364 },
};
// bsearch() comparator for the escape tables above.
//
// ucharVoid points at the unichar being looked up; mapVoid points at the
// HTMLEscapeMap entry it is compared against. Returns 1 when the character
// sorts after the entry, -1 when it sorts before it, and 0 on a match.
static int EscapeMapCompare(const void *ucharVoid, const void *mapVoid) {
  const unichar key = *(const unichar *)ucharVoid;
  const unichar candidate = ((const HTMLEscapeMap *)mapVoid)->uchar;
  if (key == candidate) {
    return 0;
  }
  return (key > candidate) ? 1 : -1;
}
// Returns a string holding the single Unicode code point |value|, or nil when
// a numeric character reference with that value is not unescaped.
//
// Values 1..0xFFFE become one UTF-16 code unit (0 and 0xFFFF stay rejected,
// as they always were). Values 0x10000..0x10FFFF become a surrogate pair.
static NSString *GTMStringForNumericReference(UTF32Char value) {
  if (value == 0) {
    return nil;
  }
  if (value < USHRT_MAX) {
    unichar uchar = (unichar)value;
    return [NSString stringWithCharacters:&uchar length:1];
  }
  if (value >= 0x10000 && value <= 0x10FFFF) {
    unichar pair[2];
    if (CFStringGetSurrogatePairForLongCharacter(value, pair)) {
      return [NSString stringWithCharacters:pair length:2];
    }
  }
  return nil;
}

@implementation NSString (GTMNSStringHTMLAdditions)

// Shared worker for the two public escaping methods.
//
// table           - escape table sorted ascending by uchar (it is bsearch'd).
// size            - size of |table| in BYTES (i.e. sizeof(table)), not the
//                   number of entries.
// escapeUnicode   - when YES, every character above 127 that has no entry in
//                   |table| is written as a decimal &#NNN; reference.
//
// Returns the escaped string, the receiver itself when it is empty, or nil
// if a working buffer could not be allocated.
- (NSString *)gtm_stringByEscapingHTMLUsingTable:(HTMLEscapeMap *)table
                                          ofSize:(NSUInteger)size
                                 escapingUnicode:(BOOL)escapeUnicode {
  NSUInteger length = [self length];
  if (!length) {
    return self;
  }

  NSMutableString *finalString = [NSMutableString string];

  // Scratch space that collects runs of characters needing no escaping so
  // they can be appended in one call. It must be created with a real length:
  // -dataWithCapacity: only hints at a size and leaves the length at zero, so
  // writing through -mutableBytes would be out of bounds.
  NSMutableData *runData = [NSMutableData dataWithLength:length * sizeof(unichar)];

  // Owner of the fallback character buffer; declared at method scope so the
  // bytes |buffer| points into stay alive for the whole loop.
  NSMutableData *characterData = nil;

  // Fast path: borrow the string's own UTF-16 storage when it exposes it.
  const unichar *buffer = CFStringGetCharactersPtr((__bridge CFStringRef)self);
  if (!buffer) {
    characterData = [NSMutableData dataWithLength:length * sizeof(unichar)];
    if (!characterData) {
      // COV_NF_START - Memory fail case
      return nil;
      // COV_NF_END
    }
    [self getCharacters:[characterData mutableBytes] range:NSMakeRange(0, length)];
    buffer = [characterData bytes];
  }

  if (!buffer || !runData) {
    // COV_NF_START
    return nil;
    // COV_NF_END
  }

  unichar *run = (unichar *)[runData mutableBytes];
  NSUInteger runLength = 0;
  const size_t entryCount = size / sizeof(HTMLEscapeMap);

  for (NSUInteger i = 0; i < length; ++i) {
    const unichar current = buffer[i];
    HTMLEscapeMap *entry = bsearch(&current, table, entryCount,
                                   sizeof(HTMLEscapeMap), EscapeMapCompare);
    const BOOL needsNumericReference = !entry && escapeUnicode && current > 127;

    if (!entry && !needsNumericReference) {
      // Plain character: extend the pending run.
      run[runLength++] = current;
      continue;
    }

    // Flush the pending run before emitting the escape sequence.
    if (runLength) {
      CFStringAppendCharacters((__bridge CFMutableStringRef)finalString, run, runLength);
      runLength = 0;
    }

    if (entry) {
      [finalString appendString:entry->escapeSequence];
      continue;
    }

    // A character outside the BMP is stored as two UTF-16 code units. A
    // numeric reference must name the code point, not each surrogate, so
    // combine a valid pair and consume both units. A lone surrogate is
    // written as-is, matching the previous behavior.
    UTF32Char codePoint = current;
    if (CFStringIsSurrogateHighCharacter(current) &&
        i + 1 < length &&
        CFStringIsSurrogateLowCharacter(buffer[i + 1])) {
      codePoint = CFStringGetLongCharacterForSurrogatePair(current, buffer[i + 1]);
      ++i;
    }
    [finalString appendFormat:@"&#%u;", (unsigned int)codePoint];
  }

  if (runLength) {
    CFStringAppendCharacters((__bridge CFMutableStringRef)finalString, run, runLength);
  }
  return finalString;
}

// Escapes only the characters of gUnicodeHTMLEscapeMap; everything else,
// including non-ASCII text, passes through unchanged.
- (NSString *)gtm_stringByEscapingForHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gUnicodeHTMLEscapeMap
                                           ofSize:sizeof(gUnicodeHTMLEscapeMap)
                                  escapingUnicode:NO];
}

// Escapes the characters of gAsciiHTMLEscapeMap by name and every other
// character above 127 by numeric reference.
- (NSString *)gtm_stringByEscapingForAsciiHTML {
  return [self gtm_stringByEscapingHTMLUsingTable:gAsciiHTMLEscapeMap
                                           ofSize:sizeof(gAsciiHTMLEscapeMap)
                                  escapingUnicode:YES];
}

// Replaces named (&amp;), decimal (&#123;) and hexadecimal (&#xa3;) escape
// sequences with the characters they stand for. Unrecognized sequences are
// left untouched. Returns the receiver itself when it contains no '&'.
- (NSString *)gtm_stringByUnescapingFromHTML {
  NSRange range = NSMakeRange(0, [self length]);
  NSRange subrange = [self rangeOfString:@"&" options:NSBackwardsSearch range:range];

  // if no ampersands, we've got a quick way out
  if (subrange.length == 0) return self;

  NSMutableString *finalString = [NSMutableString stringWithString:self];
  const NSUInteger entryCount = sizeof(gAsciiHTMLEscapeMap) / sizeof(HTMLEscapeMap);

  // The string is walked from its end towards its start. Replacements only
  // ever touch text at or after the current '&', so ranges computed on the
  // immutable receiver remain valid in |finalString| for everything that is
  // still to be visited.
  do {
    // Look for the terminating ';' between this '&' and the previous one
    // visited (or the end of the string on the first pass).
    NSRange semiColonRange = NSMakeRange(subrange.location,
                                         NSMaxRange(range) - subrange.location);
    semiColonRange = [self rangeOfString:@";" options:0 range:semiColonRange];
    range = NSMakeRange(0, subrange.location);

    // if we don't find a semicolon in the range, we don't have a sequence
    if (semiColonRange.location == NSNotFound) {
      continue;
    }

    NSRange escapeRange = NSMakeRange(subrange.location,
                                      semiColonRange.location - subrange.location + 1);
    NSString *escapeString = [self substringWithRange:escapeRange];
    NSUInteger length = [escapeString length];

    // a sequence must be longer than 3 (&lt;) and less than 11 (&thetasym;)
    if (length <= 3 || length >= 11) {
      continue;
    }

    NSString *replacement = nil;
    if ([escapeString characterAtIndex:1] == '#') {
      unichar char2 = [escapeString characterAtIndex:2];
      if (char2 == 'x' || char2 == 'X') {
        // Hex escape sequences: &#xa3;
        NSString *hexSequence = [escapeString substringWithRange:NSMakeRange(3, length - 4)];
        NSScanner *scanner = [NSScanner scannerWithString:hexSequence];
        unsigned value = 0;
        // The scan-location check rejects sequences with trailing junk.
        if ([scanner scanHexInt:&value] && [scanner scanLocation] == length - 4) {
          replacement = GTMStringForNumericReference(value);
        }
      } else {
        // Decimal sequences: &#123;
        NSString *numberSequence = [escapeString substringWithRange:NSMakeRange(2, length - 3)];
        NSScanner *scanner = [NSScanner scannerWithString:numberSequence];
        int value = 0;
        if ([scanner scanInt:&value] &&
            value > 0 &&
            [scanner scanLocation] == length - 3) {
          replacement = GTMStringForNumericReference((UTF32Char)value);
        }
      }
    } else {
      // "standard" (named) sequences
      for (NSUInteger i = 0; i < entryCount; ++i) {
        if ([escapeString isEqualToString:gAsciiHTMLEscapeMap[i].escapeSequence]) {
          replacement = [NSString stringWithCharacters:&gAsciiHTMLEscapeMap[i].uchar
                                                length:1];
          break;
        }
      }
    }

    if (replacement) {
      [finalString replaceCharactersInRange:escapeRange withString:replacement];
    }
  } while ((subrange = [self rangeOfString:@"&"
                                   options:NSBackwardsSearch
                                     range:range]).length != 0);

  return finalString;
}

@end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment