Skip to content

Instantly share code, notes, and snippets.

@mwaterfall
Last active May 27, 2024 04:36
Show Gist options
  • Save mwaterfall/25b4a6a06dc3309d9555 to your computer and use it in GitHub Desktop.
Save mwaterfall/25b4a6a06dc3309d9555 to your computer and use it in GitHub Desktop.
Decoding HTML Entities in Swift
// Very slightly adapted from http://stackoverflow.com/a/30141700/106244
// 99.99% Credit to Martin R!
/// Maps each XML/HTML named character entity reference to the character it
/// denotes. Keys are the complete reference as it appears in markup,
/// including the leading `&` and the trailing `;` (for example `"&lt;"`).
///
/// From http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
private let characterEntities : [String: Character] = [
    // XML predefined entities:
    "&quot;" : "\"",
    "&amp;" : "&",
    "&apos;" : "'",
    "&lt;" : "<",
    "&gt;" : ">",
    // HTML character entity references:
    "&nbsp;" : "\u{00A0}",
    "&iexcl;" : "\u{00A1}",
    "&cent;" : "\u{00A2}",
    "&pound;" : "\u{00A3}",
    "&curren;" : "\u{00A4}",
    "&yen;" : "\u{00A5}",
    "&brvbar;" : "\u{00A6}",
    "&sect;" : "\u{00A7}",
    "&uml;" : "\u{00A8}",
    "&copy;" : "\u{00A9}",
    "&ordf;" : "\u{00AA}",
    "&laquo;" : "\u{00AB}",
    "&not;" : "\u{00AC}",
    "&shy;" : "\u{00AD}",
    "&reg;" : "\u{00AE}",
    "&macr;" : "\u{00AF}",
    "&deg;" : "\u{00B0}",
    "&plusmn;" : "\u{00B1}",
    "&sup2;" : "\u{00B2}",
    "&sup3;" : "\u{00B3}",
    "&acute;" : "\u{00B4}",
    "&micro;" : "\u{00B5}",
    "&para;" : "\u{00B6}",
    "&middot;" : "\u{00B7}",
    "&cedil;" : "\u{00B8}",
    "&sup1;" : "\u{00B9}",
    "&ordm;" : "\u{00BA}",
    "&raquo;" : "\u{00BB}",
    "&frac14;" : "\u{00BC}",
    "&frac12;" : "\u{00BD}",
    "&frac34;" : "\u{00BE}",
    "&iquest;" : "\u{00BF}",
    "&Agrave;" : "\u{00C0}",
    "&Aacute;" : "\u{00C1}",
    "&Acirc;" : "\u{00C2}",
    "&Atilde;" : "\u{00C3}",
    "&Auml;" : "\u{00C4}",
    "&Aring;" : "\u{00C5}",
    "&AElig;" : "\u{00C6}",
    "&Ccedil;" : "\u{00C7}",
    "&Egrave;" : "\u{00C8}",
    "&Eacute;" : "\u{00C9}",
    "&Ecirc;" : "\u{00CA}",
    "&Euml;" : "\u{00CB}",
    "&Igrave;" : "\u{00CC}",
    "&Iacute;" : "\u{00CD}",
    "&Icirc;" : "\u{00CE}",
    "&Iuml;" : "\u{00CF}",
    "&ETH;" : "\u{00D0}",
    "&Ntilde;" : "\u{00D1}",
    "&Ograve;" : "\u{00D2}",
    "&Oacute;" : "\u{00D3}",
    "&Ocirc;" : "\u{00D4}",
    "&Otilde;" : "\u{00D5}",
    "&Ouml;" : "\u{00D6}",
    "&times;" : "\u{00D7}",
    "&Oslash;" : "\u{00D8}",
    "&Ugrave;" : "\u{00D9}",
    "&Uacute;" : "\u{00DA}",
    "&Ucirc;" : "\u{00DB}",
    "&Uuml;" : "\u{00DC}",
    "&Yacute;" : "\u{00DD}",
    "&THORN;" : "\u{00DE}",
    "&szlig;" : "\u{00DF}",
    "&agrave;" : "\u{00E0}",
    "&aacute;" : "\u{00E1}",
    "&acirc;" : "\u{00E2}",
    "&atilde;" : "\u{00E3}",
    "&auml;" : "\u{00E4}",
    "&aring;" : "\u{00E5}",
    "&aelig;" : "\u{00E6}",
    "&ccedil;" : "\u{00E7}",
    "&egrave;" : "\u{00E8}",
    "&eacute;" : "\u{00E9}",
    "&ecirc;" : "\u{00EA}",
    "&euml;" : "\u{00EB}",
    "&igrave;" : "\u{00EC}",
    "&iacute;" : "\u{00ED}",
    "&icirc;" : "\u{00EE}",
    "&iuml;" : "\u{00EF}",
    "&eth;" : "\u{00F0}",
    "&ntilde;" : "\u{00F1}",
    "&ograve;" : "\u{00F2}",
    "&oacute;" : "\u{00F3}",
    "&ocirc;" : "\u{00F4}",
    "&otilde;" : "\u{00F5}",
    "&ouml;" : "\u{00F6}",
    "&divide;" : "\u{00F7}",
    "&oslash;" : "\u{00F8}",
    "&ugrave;" : "\u{00F9}",
    "&uacute;" : "\u{00FA}",
    "&ucirc;" : "\u{00FB}",
    "&uuml;" : "\u{00FC}",
    "&yacute;" : "\u{00FD}",
    "&thorn;" : "\u{00FE}",
    "&yuml;" : "\u{00FF}",
    "&OElig;" : "\u{0152}",
    "&oelig;" : "\u{0153}",
    "&Scaron;" : "\u{0160}",
    "&scaron;" : "\u{0161}",
    "&Yuml;" : "\u{0178}",
    "&fnof;" : "\u{0192}",
    "&circ;" : "\u{02C6}",
    "&tilde;" : "\u{02DC}",
    "&Alpha;" : "\u{0391}",
    "&Beta;" : "\u{0392}",
    "&Gamma;" : "\u{0393}",
    "&Delta;" : "\u{0394}",
    "&Epsilon;" : "\u{0395}",
    "&Zeta;" : "\u{0396}",
    "&Eta;" : "\u{0397}",
    "&Theta;" : "\u{0398}",
    "&Iota;" : "\u{0399}",
    "&Kappa;" : "\u{039A}",
    "&Lambda;" : "\u{039B}",
    "&Mu;" : "\u{039C}",
    "&Nu;" : "\u{039D}",
    "&Xi;" : "\u{039E}",
    "&Omicron;" : "\u{039F}",
    "&Pi;" : "\u{03A0}",
    "&Rho;" : "\u{03A1}",
    "&Sigma;" : "\u{03A3}",
    "&Tau;" : "\u{03A4}",
    "&Upsilon;" : "\u{03A5}",
    "&Phi;" : "\u{03A6}",
    "&Chi;" : "\u{03A7}",
    "&Psi;" : "\u{03A8}",
    "&Omega;" : "\u{03A9}",
    "&alpha;" : "\u{03B1}",
    "&beta;" : "\u{03B2}",
    "&gamma;" : "\u{03B3}",
    "&delta;" : "\u{03B4}",
    "&epsilon;" : "\u{03B5}",
    "&zeta;" : "\u{03B6}",
    "&eta;" : "\u{03B7}",
    "&theta;" : "\u{03B8}",
    "&iota;" : "\u{03B9}",
    "&kappa;" : "\u{03BA}",
    "&lambda;" : "\u{03BB}",
    "&mu;" : "\u{03BC}",
    "&nu;" : "\u{03BD}",
    "&xi;" : "\u{03BE}",
    "&omicron;" : "\u{03BF}",
    "&pi;" : "\u{03C0}",
    "&rho;" : "\u{03C1}",
    "&sigmaf;" : "\u{03C2}",
    "&sigma;" : "\u{03C3}",
    "&tau;" : "\u{03C4}",
    "&upsilon;" : "\u{03C5}",
    "&phi;" : "\u{03C6}",
    "&chi;" : "\u{03C7}",
    "&psi;" : "\u{03C8}",
    "&omega;" : "\u{03C9}",
    "&thetasym;" : "\u{03D1}",
    "&upsih;" : "\u{03D2}",
    "&piv;" : "\u{03D6}",
    "&ensp;" : "\u{2002}",
    "&emsp;" : "\u{2003}",
    "&thinsp;" : "\u{2009}",
    "&zwnj;" : "\u{200C}",
    "&zwj;" : "\u{200D}",
    "&lrm;" : "\u{200E}",
    "&rlm;" : "\u{200F}",
    "&ndash;" : "\u{2013}",
    "&mdash;" : "\u{2014}",
    "&lsquo;" : "\u{2018}",
    "&rsquo;" : "\u{2019}",
    "&sbquo;" : "\u{201A}",
    "&ldquo;" : "\u{201C}",
    "&rdquo;" : "\u{201D}",
    "&bdquo;" : "\u{201E}",
    "&dagger;" : "\u{2020}",
    "&Dagger;" : "\u{2021}",
    "&bull;" : "\u{2022}",
    "&hellip;" : "\u{2026}",
    "&permil;" : "\u{2030}",
    "&prime;" : "\u{2032}",
    "&Prime;" : "\u{2033}",
    "&lsaquo;" : "\u{2039}",
    "&rsaquo;" : "\u{203A}",
    "&oline;" : "\u{203E}",
    "&frasl;" : "\u{2044}",
    "&euro;" : "\u{20AC}",
    "&image;" : "\u{2111}",
    "&weierp;" : "\u{2118}",
    "&real;" : "\u{211C}",
    "&trade;" : "\u{2122}",
    "&alefsym;" : "\u{2135}",
    "&larr;" : "\u{2190}",
    "&uarr;" : "\u{2191}",
    "&rarr;" : "\u{2192}",
    "&darr;" : "\u{2193}",
    "&harr;" : "\u{2194}",
    "&crarr;" : "\u{21B5}",
    "&lArr;" : "\u{21D0}",
    "&uArr;" : "\u{21D1}",
    "&rArr;" : "\u{21D2}",
    "&dArr;" : "\u{21D3}",
    "&hArr;" : "\u{21D4}",
    "&forall;" : "\u{2200}",
    "&part;" : "\u{2202}",
    "&exist;" : "\u{2203}",
    "&empty;" : "\u{2205}",
    "&nabla;" : "\u{2207}",
    "&isin;" : "\u{2208}",
    "&notin;" : "\u{2209}",
    "&ni;" : "\u{220B}",
    "&prod;" : "\u{220F}",
    "&sum;" : "\u{2211}",
    "&minus;" : "\u{2212}",
    "&lowast;" : "\u{2217}",
    "&radic;" : "\u{221A}",
    "&prop;" : "\u{221D}",
    "&infin;" : "\u{221E}",
    "&ang;" : "\u{2220}",
    "&and;" : "\u{2227}",
    "&or;" : "\u{2228}",
    "&cap;" : "\u{2229}",
    "&cup;" : "\u{222A}",
    "&int;" : "\u{222B}",
    "&there4;" : "\u{2234}",
    "&sim;" : "\u{223C}",
    "&cong;" : "\u{2245}",
    "&asymp;" : "\u{2248}",
    "&ne;" : "\u{2260}",
    "&equiv;" : "\u{2261}",
    "&le;" : "\u{2264}",
    "&ge;" : "\u{2265}",
    "&sub;" : "\u{2282}",
    "&sup;" : "\u{2283}",
    "&nsub;" : "\u{2284}",
    "&sube;" : "\u{2286}",
    "&supe;" : "\u{2287}",
    "&oplus;" : "\u{2295}",
    "&otimes;" : "\u{2297}",
    "&perp;" : "\u{22A5}",
    "&sdot;" : "\u{22C5}",
    "&lceil;" : "\u{2308}",
    "&rceil;" : "\u{2309}",
    "&lfloor;" : "\u{230A}",
    "&rfloor;" : "\u{230B}",
    "&lang;" : "\u{2329}",
    "&rang;" : "\u{232A}",
    "&loz;" : "\u{25CA}",
    "&spades;" : "\u{2660}",
    "&clubs;" : "\u{2663}",
    "&hearts;" : "\u{2665}",
    "&diams;" : "\u{2666}",
]
extension String {
    /// A copy of the string in which every HTML character entity reference
    /// is replaced with the character it denotes.
    var stringByDecodingHTMLEntities: String {
        return decodeHTMLEntities().decodedString
    }

    /// Decodes the HTML character entity references in the string and
    /// reports where each replacement happened.
    ///
    /// Named references are looked up in `characterEntities`; `&#NN;` and
    /// `&#xHH;` references are converted numerically. Text that looks like a
    /// reference but cannot be decoded is copied through unchanged.
    ///
    /// - Returns: `decodedString` is the decoded text. `replacementOffsets`
    ///   holds one element per replacement, in order of occurrence: `index`
    ///   is the position in the original string just past the reference's
    ///   `;`, and `offset` is the change in character count caused by the
    ///   replacement (`1 - length of the reference`). Callers can use these
    ///   to adjust attributes attached to substrings of the original string.
    func decodeHTMLEntities() -> (decodedString: String, replacementOffsets: [(index: String.Index, offset: Int)]) {

        // ===== Utility functions =====

        // Offsets recorded for each replacement, in order of occurrence.
        var replacementOffsets: [(index: String.Index, offset: Int)] = []

        // Convert the digits to the corresponding Unicode character, e.g.
        //     decodeNumeric("64", base: 10)   --> "@"
        //     decodeNumeric("20ac", base: 16) --> "€"
        // Returns `nil` for empty or malformed digits, for values that do
        // not fit in `UInt32`, and for code points that are not Unicode
        // scalar values (instead of trapping or producing NUL).
        func decodeNumeric(_ digits: Substring, base: Int) -> Character? {
            guard let code = UInt32(digits, radix: base),
                  let scalar = Unicode.Scalar(code) else { return nil }
            return Character(scalar)
        }

        // Decode a complete reference ("&" ... ";") to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity: Substring) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip the "&#x" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip the "&#" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
            } else {
                return characterEntities[String(entity)]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampIndex = self[position...].firstIndex(of: "&") {
            result.append(contentsOf: self[position ..< ampIndex])
            position = ampIndex

            // Look for the ';' that closes this reference, but stop at the
            // next '&': a reference cannot contain one, so reaching it means
            // the current '&' is literal text.
            let afterAmp = index(after: ampIndex)
            guard let stop = self[afterAmp...].firstIndex(where: { $0 == ";" || $0 == "&" }) else {
                // Neither ';' nor '&' remains; the tail is copied below.
                break
            }
            guard self[stop] == ";" else {
                // Literal '&': copy up to the next '&' and rescan from there
                // so that a following reference is still decoded.
                result.append(contentsOf: self[position ..< stop])
                position = stop
                continue
            }

            let entityEnd = index(after: stop)
            let entity = self[position ..< entityEnd]
            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
                // Record offset: one character replaces `entity.count`.
                replacementOffsets.append((index: entityEnd, offset: 1 - entity.count))
            } else {
                // Invalid entity, copy verbatim:
                result.append(contentsOf: entity)
            }
            position = entityEnd
        }

        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])

        return (decodedString: result, replacementOffsets: replacementOffsets)
    }
}
@nyxee
Copy link

nyxee commented Jun 5, 2017

extension String {
    /// Returns a copy of the string with a small, fixed set of HTML/XML
    /// entities (`&quot;`, `&apos;`, `&lt;`, `&gt;`, `&deg;`, `&amp;`)
    /// replaced by the characters they denote.
    ///
    /// Every other entity is left untouched.
    func htmlDecoded()->String {

        guard !isEmpty else { return self }

        // from https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
        //
        // An ordered list rather than a dictionary: dictionary iteration
        // order is unspecified, and `&amp;` must be replaced last. Otherwise
        // the `&` it produces could combine with the following text and be
        // decoded a second time (e.g. "&amp;lt;" must become "&lt;", not "<").
        let entities: [(name: String, value: String)] = [
            ("&quot;", "\""),
            ("&apos;", "'"),
            ("&lt;", "<"),
            ("&gt;", ">"),
            ("&deg;", "\u{00B0}"),   // DEGREE SIGN, not U+00BA (masculine ordinal)
            ("&amp;", "&"),
        ]

        var newStr = self
        for (name, value) in entities {
            newStr = newStr.replacingOccurrences(of: name, with: value)
        }
        return newStr
    }
}

// Demo: show a forecast string before and after its `&deg;` entities are decoded.
let input = "Mostly dry. Warm (max 28&deg;C on Thu morning, min 13&deg;C on Wed night). Wind will be generally light."
let output = input.htmlDecoded()
print(input)
print(output)

Some manual work is needed to extend the above for Swift 3.

@nyxee
Copy link

nyxee commented Jun 5, 2017

@marbetschar any links on how to use the example you linked to?

@nathanfjohnson
Copy link

Swift 4 anyone? https://gist.github.com/nathanfjohnson/380b9f24c991a8970144e13ddd044d21 Many changes and updates to the original approach.

@RishabhSRS
Copy link

For Swift 3:

extension String {

/// Returns a new string made by replacing in the `String`
/// all HTML character entity references with the corresponding
/// character.
///
/// Named references are looked up in `characterEntities`, which is not
/// defined in this snippet. References that cannot be decoded are copied
/// to the result unchanged.
var stringByDecodingHTMLEntities : String {

    // ===== Utility functions =====

    // Convert the number in the string to the corresponding
    // Unicode character, e.g.
    //    decodeNumeric("64", base: 10)   --> "@"
    //    decodeNumeric("20ac", base: 16) --> "€"
    // Returns `nil` when either conversion in the `guard` fails.
    func decodeNumeric(_ string : String, base : Int) -> Character? {
        guard let code = UInt32(string, radix: base),
            let uniScalar = UnicodeScalar(code) else { return nil }
        return Character(uniScalar)
    }

    // Decode the HTML character entity to the corresponding
    // Unicode character, return `nil` for invalid input.
    //     decode("&#64;")    --> "@"
    //     decode("&#x20ac;") --> "€"
    //     decode("&lt;")     --> "<"
    //     decode("&foo;")    --> nil
    // `entity` is the full reference, from '&' through ';'.
    func decode(_ entity : String) -> Character? {

        if entity.hasPrefix("&#x") || entity.hasPrefix("&#X"){
            // Hexadecimal: skip the 3-character prefix and drop the trailing ';'.
            return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 3) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 16)
        } else if entity.hasPrefix("&#") {
            // Decimal: skip the 2-character prefix and drop the trailing ';'.
            return decodeNumeric(entity.substring(with: entity.index(entity.startIndex, offsetBy: 2) ..< entity.index(entity.endIndex, offsetBy: -1)), base: 10)
        } else {
            // Named reference: the key is the whole "&name;" text.
            return characterEntities[entity]
        }
    }

    // ===== Method starts here =====

    // NOTE(review): the slices `self[...]` below are passed where a `String`
    // is expected (`decode(_:)`, `append(_:)`), which matches the
    // "for swift 3" label on this snippet — presumably they need wrapping in
    // `String(...)` on toolchains where slicing yields `Substring`; confirm.

    var result = ""
    var position = startIndex

    // Find the next '&' and copy the characters preceding it to `result`:
    while let ampRange = self.range(of: "&", range: position ..< endIndex) {
        result.append(self[position ..< ampRange.lowerBound])
        position = ampRange.lowerBound

        // Find the next ';' and copy everything from '&' to ';' into `entity`
        // NOTE(review): this search does not stop at a later '&'. For text
        // such as "a & b &lt; c" the span "& b &lt;" is taken as one
        // (unknown) entity and copied verbatim, so the embedded "&lt;" is
        // not decoded.
        if let semiRange = self.range(of: ";", range: position ..< endIndex) {
            let entity = self[position ..< semiRange.upperBound]
            position = semiRange.upperBound

            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
            } else {
                // Invalid entity, copy verbatim:
                result.append(entity)
            }
        } else {
            // No matching ';'.
            break
        }
    }
    // Copy remaining characters to `result`:
    result.append(self[position ..< endIndex])
    return result
}

}

@witekbobrowski
Copy link

Swift 5+

extension String {
    /// A copy of the string in which every HTML character entity reference
    /// is replaced with the character it denotes.
    ///
    /// Named references are looked up in `characterEntities`; `&#NN;` and
    /// `&#xHH;` references are converted numerically. Text that looks like a
    /// reference but cannot be decoded is copied through unchanged.
    var stringByDecodingHTMLEntities : String {

        // ===== Utility functions =====

        // Convert the digits to the corresponding Unicode character, e.g.
        //    decodeNumeric("64", base: 10)   --> "@"
        //    decodeNumeric("20ac", base: 16) --> "€"
        // Returns `nil` for empty or malformed digits, for values that do
        // not fit in `UInt32`, and for code points that are not Unicode
        // scalar values.
        func decodeNumeric(_ digits : Substring, base : Int) -> Character? {
            guard let code = UInt32(digits, radix: base),
                  let uniScalar = Unicode.Scalar(code) else { return nil }
            return Character(uniScalar)
        }

        // Decode a complete reference ("&" ... ";") to the corresponding
        // Unicode character, return `nil` for invalid input.
        //     decode("&#64;")    --> "@"
        //     decode("&#x20ac;") --> "€"
        //     decode("&lt;")     --> "<"
        //     decode("&foo;")    --> nil
        func decode(_ entity : Substring) -> Character? {
            if entity.hasPrefix("&#x") || entity.hasPrefix("&#X") {
                // Strip the "&#x" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(3).dropLast(), base: 16)
            } else if entity.hasPrefix("&#") {
                // Strip the "&#" prefix and the trailing ";".
                return decodeNumeric(entity.dropFirst(2).dropLast(), base: 10)
            } else {
                return characterEntities[String(entity)]
            }
        }

        // ===== Method starts here =====

        var result = ""
        var position = startIndex

        // Find the next '&' and copy the characters preceding it to `result`:
        while let ampIndex = self[position...].firstIndex(of: "&") {
            result.append(contentsOf: self[position ..< ampIndex])
            position = ampIndex

            // Look for the ';' that closes this reference, but stop at the
            // next '&': a reference cannot contain one, so reaching it means
            // the current '&' is literal text. Without this, "R&D &lt; 5"
            // would treat "&D &lt;" as one unknown entity and never decode
            // the "&lt;".
            let afterAmp = index(after: ampIndex)
            guard let stop = self[afterAmp...].firstIndex(where: { $0 == ";" || $0 == "&" }) else {
                // Neither ';' nor '&' remains; the tail is copied below.
                break
            }
            guard self[stop] == ";" else {
                // Literal '&': copy up to the next '&' and rescan from there.
                result.append(contentsOf: self[position ..< stop])
                position = stop
                continue
            }

            let entityEnd = index(after: stop)
            let entity = self[position ..< entityEnd]
            position = entityEnd

            if let decoded = decode(entity) {
                // Replace by decoded character:
                result.append(decoded)
            } else {
                // Invalid entity, copy verbatim:
                result.append(contentsOf: entity)
            }
        }
        // Copy remaining characters to `result`:
        result.append(contentsOf: self[position...])
        return result
    }
}

@bdkjones
Copy link

bdkjones commented May 27, 2024

Swift 5.7+

With the Regex Literal support introduced in Swift 5.7, this process has gotten much simpler:

// Sample input containing named HTML entities.
let test = "Insisting on &gt; splitting on&nbsp;the first &lt; date"
// Receives the decoded text.
var cleanText = ""

// Pattern for a named entity such as `&lt;` or `&nbsp;`:
// '&', two or more ASCII letters or digits, then ';'.
let regex = /&[A-Za-z0-9]{2,};/

// Start of the part of `test` that has not been copied to `cleanText` yet.
var lastIndex = test.startIndex

let ranges = test.ranges(of: regex)
for match in ranges {
    // Copy the plain text between the previous position and this match.
    cleanText.append(contentsOf: test[lastIndex ..< match.lowerBound])

    guard let replacement = characterEntities[String(test[match])] else {
        // Not a known entity: leave it pending so that it is copied
        // verbatim together with the text that follows it.
        lastIndex = match.lowerBound
        continue
    }

    cleanText.append(replacement)
    lastIndex = match.upperBound
}

// Copy whatever follows the last handled match.
cleanText.append(contentsOf: test[lastIndex...])

Warning

I haven't really tested all the edge cases in the above snippet and I typed it on GitHub, so it may require tweaking. Still, it's way more streamlined than the older approach and the performance is phenomenal.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment