Skip to content

Instantly share code, notes, and snippets.

@jaredsinclair
Created December 18, 2017 05:21
Show Gist options
  • Save jaredsinclair/268457c96867a0ac93540d6105a8afa1 to your computer and use it in GitHub Desktop.
Save jaredsinclair/268457c96867a0ac93540d6105a8afa1 to your computer and use it in GitHub Desktop.
Convert HTML string to plain text (Swift version of MWFeedParser's utility)
/// Quick-n-dirty translation of MWFeedParser's algorithm from Objective-C to Swift.
/// - SeeAlso: https://github.com/mwaterfall/MWFeedParser/blob/master/Classes/NSString%2BHTML.m
public extension NSString {

    /// Returns the receiver with HTML markup removed and whitespace collapsed.
    ///
    /// The string is walked with a `Scanner`:
    /// - `<!-- ... -->` comments and everything from `<script` up to and including
    ///   `</script>` are dropped entirely.
    /// - Every other tag is dropped. A closing tag is replaced by a single space unless
    ///   its name is one of the inline tags listed in `inlineTagNames`.
    /// - Each run of whitespace / line-break characters is replaced by a single space,
    ///   except at the very start of the output or when the run ends the input.
    ///
    /// HTML entities (e.g. `&amp;`) are NOT decoded by this method.
    ///
    /// - Returns: The plain-text rendering of the receiver.
    func byConvertingHTMLToPlainText() -> String {
        // Space, tab, LF, CR, NEL (U+0085), form feed (U+000C), line separator (U+2028)
        // and paragraph separator (U+2029). These must be written as Unicode scalar
        // escapes: interpolating the integer literals (`\(0x0085)`) would insert their
        // decimal digits ("133", "12", ...) and make the digits 1, 2, 3 and 8 count as
        // whitespace.
        let whitespaceCharacters = " \t\n\r\u{0085}\u{000C}\u{2028}\u{2029}"
        let stopCharacters = CharacterSet(charactersIn: "<" + whitespaceCharacters)
        let newLineAndWhitespaceCharacters = CharacterSet(charactersIn: whitespaceCharacters)
        let tagNameCharacters = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

        // Closing tags with these (lowercased) names do not get replaced by a space.
        let inlineTagNames: Set<String> = [
            "a", "b", "i", "q", "span", "em", "strong", "cite", "abbr", "acronym", "label"
        ]

        let result = NSMutableString(capacity: length)
        let scanner = Scanner(string: self as String)
        // Whitespace is handled explicitly below, so the scanner must not skip it.
        scanner.charactersToBeSkipped = nil
        scanner.caseSensitive = true

        var str: NSString? = nil
        var tagName: NSString? = nil

        repeat {
            // Copy plain text up to the start of a tag or whitespace.
            if scanner.scanUpToCharacters(from: stopCharacters, into: &str), let s = str {
                result.append(s as String)
                str = nil
            }

            // Check if we've stopped at a tag/comment or at whitespace.
            if scanner.scanString("<", into: nil) {
                if scanner.scanString("!--", into: nil) {
                    // Comment: skip through the terminator.
                    scanner.scanUpTo("-->", into: nil)
                    scanner.scanString("-->", into: nil)
                } else if scanner.scanString("script", into: nil) {
                    // Script tag: its content is not escaped, so skip through the closing tag.
                    scanner.scanUpTo("</script>", into: nil)
                    scanner.scanString("</script>", into: nil)
                } else {
                    // Any other tag: remove it, replacing a closing tag with a space
                    // unless it closes an inline element.
                    if scanner.scanString("/", into: nil) {
                        tagName = nil
                        var isInlineTag = false
                        if scanner.scanCharacters(from: tagNameCharacters, into: &tagName), let t = tagName {
                            isInlineTag = inlineTagNames.contains(t.lowercased)
                        }
                        // Never emit a leading space, nor one when the input is exhausted.
                        if !isInlineTag && result.length > 0 && !scanner.isAtEnd {
                            result.append(" ")
                        }
                    }
                    // Scan past the remainder of the tag.
                    scanner.scanUpTo(">", into: nil)
                    scanner.scanString(">", into: nil)
                }
            } else {
                // Stopped at whitespace: collapse the whole run into a single space.
                if scanner.scanCharacters(from: newLineAndWhitespaceCharacters, into: nil) {
                    if result.length > 0 && !scanner.isAtEnd {
                        result.append(" ") // Don't append a space to the beginning or end of the result
                    }
                }
            }
        } while !scanner.isAtEnd

        // Decoding HTML entities isn't included in this gist, but is often important:
        // let retString = (result as String).stringByDecodingHTMLEntities
        return result as String
    }
}
@cedricbahirwe
Copy link

Helpful, but still needs improvement

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment