// Lemmatize strings
/// A single word taken from a piece of text, paired with the tag reported for it.
///
/// `lemmatize(_:)` builds one of these per enumerated token.
public struct wordToken {
    /// The token's text.
    let word: String

    /// The stem (lemma) recorded for `word`, or `nil` when none was supplied.
    let wordStem: String?

    // No hand-written initializer is needed: the compiler-synthesized memberwise
    // `init(word:wordStem:)` has the same labels, parameter order and internal
    // access level, and stores both values unchanged.
}
/// Splits `text` into word tokens and pairs each one with its lemma tag.
///
/// The input is lowercased before tagging, so every returned `word` is lowercase.
/// Whitespace, punctuation and "other" tokens are omitted from the result.
///
/// - Parameter text: The text to analyse. It is tagged with the schemes available
///   for English (`"en"`).
/// - Returns: One `wordToken` per enumerated token, in enumeration order.
func lemmatize(_ text: String) -> [wordToken] {
    let text = text.lowercased()

    // NSLinguisticTagger reports and consumes NSRange values, which are measured in
    // UTF-16 code units. Bridge once and take the length from the NSString view:
    // counting Characters (grapheme clusters) under-counts for emoji and other
    // multi-unit characters, which would cut the end of the text off.
    let nsText = text as NSString
    let fullRange = NSRange(location: 0, length: nsText.length)

    let options: NSLinguisticTagger.Options = [.omitWhitespace, .omitPunctuation, .omitOther]
    let tagger = NSLinguisticTagger(tagSchemes: NSLinguisticTagger.availableTagSchemes(forLanguage: "en"),
                                    options: Int(options.rawValue))
    tagger.string = text

    var tokens: [wordToken] = []
    // NOTE(review): `NSLinguisticTagSchemeLemma` is the Swift 3 spelling of the lemma
    // scheme; it is kept so the block stays on the toolchain the rest of the file targets.
    tagger.enumerateTags(in: fullRange, scheme: NSLinguisticTagSchemeLemma, options: options) { tag, tokenRange, _, _ in
        let word = nsText.substring(with: tokenRange)
        tokens.append(wordToken(word: word, wordStem: tag))
    }
    return tokens
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment