Skip to content

Instantly share code, notes, and snippets.

@mrdavey
Created June 23, 2017 14:55
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrdavey/132e23f43f98d45eaf5b986b314a2d84 to your computer and use it in GitHub Desktop.
Save mrdavey/132e23f43f98d45eaf5b986b314a2d84 to your computer and use it in GitHub Desktop.
//: Natural Language Processing in iOS 11, with the help of this post: https://medium.com/swiftworld/swift-world-whats-new-in-ios-11-natural-language-processing-2a16b7422334
// Requires Xcode 9 or later: the unit-based NSLinguisticTagger APIs used below were introduced with the iOS 11 SDK.
import UIKit
/// Sample passage that every tagging demo in this playground analyses.
let text: String = "Silicon Valley is a nickname for the southern portion of the San Francisco Bay Area, in the northern part of the U.S. state of California. In 2014, tech companies Google, Yahoo!, Facebook, Apple, and others, released corporate transparency reports that offered detailed employee breakdowns. Let's go running or walking to the shops."

/// The whole of `text` expressed as an `NSRange` (UTF-16 code units), which is
/// the range type the tagger's enumeration API takes.
let range: NSRange = NSRange(text.startIndex..<text.endIndex, in: text)

/// Enumeration options shared by the demos: skip punctuation and whitespace tokens.
let options: NSLinguisticTagger.Options = [
    .omitPunctuation,
    .omitWhitespace,
]
//
// Language Identification
//
// Identify the dominant language in a text
//
//let textZh = "WWDC 2017 已经结束了。"
//let taggerLanguage = NSLinguisticTagger(tagSchemes: [.language], options: 0)
//taggerLanguage.string = textZh
//
//if let language = taggerLanguage.dominantLanguage {
// print(language)
//} else {
// print("can't get dominant language")
//}
//
// Tokenization
//
// Tokenization is the process of demarcating and possibly classifying sections of a string of input characters.
// The resulting tokens are then passed on to some other form of processing. The process can be considered a
// sub-task of parsing input.
//
//let taggerToken = NSLinguisticTagger(tagSchemes: [.tokenType], options: 0)
//taggerToken.string = text
//taggerToken.enumerateTags(in: range, unit: .word, scheme: .tokenType, options: options) { tag, tokenRange, stop in
// let token = (text as NSString).substring(with: tokenRange)
// print("\(tag!.rawValue): \(token)")
//}
//
// Lemmatization
//
// Lemmatisation (or lemmatization) in linguistics is the process of grouping together the inflected forms of a word
// so they can be analysed as a single item, identified by the word's lemma, or dictionary form.
// E.g. 'walk', 'walked', 'walks', 'walking' all have a lemma of 'walk'
//
//let taggerLemma = NSLinguisticTagger(tagSchemes: [.lemma], options: 0)
//taggerLemma.string = text
//
//taggerLemma.enumerateTags(in: range, unit: .word, scheme: .lemma, options: options) { tag, tokenRange, stop in
// if let lemma = tag?.rawValue { print(lemma) }
//}
//
// Name Type
//
// Named-entity recognition (NER) (also known as entity identification, entity chunking and entity extraction) is a subtask
// of information extraction that seeks to locate and classify named entities in text into pre-defined categories such as
// the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc.
//
//let taggerNameType = NSLinguisticTagger(tagSchemes: [.nameType], options: 0)
//taggerNameType.string = text
//let tags: [NSLinguisticTag] = [.personalName, .placeName, .organizationName]
//
//taggerNameType.enumerateTags(in: range, unit: .word, scheme: .nameType, options: options) { tag, tokenRange, stop in
// if let tag = tag, tags.contains(tag) {
// let name = (text as NSString).substring(with: tokenRange)
// print(name)
// }
//}
//
// Lexical Class
//
// In grammar, a lexical category (also word class, lexical class, or in traditional grammar part of speech) is a linguistic
// category of words (or more precisely lexical items), which is generally defined by the syntactic or morphological behaviour
// of the lexical item in question. Common linguistic categories include noun and verb, among others.
//
// Tags each word of `text` with its lexical class (part of speech) and prints
// one "<class>: <word>" line per token. `options` (declared at the top of the
// file) tells the enumeration to leave out punctuation and whitespace tokens.
let taggerLexical = NSLinguisticTagger(tagSchemes: [.lexicalClass], options: 0)
taggerLexical.string = text
taggerLexical.enumerateTags(in: range, unit: .word, scheme: .lexicalClass, options: options) { tag, tokenRange, _ in
    // `tag` is optional in the closure signature; skip a token that arrives
    // without one instead of crashing on a force-unwrap.
    guard let tag = tag else { return }
    // `tokenRange` is an NSRange, so slice through NSString to stay in UTF-16 units.
    let word = (text as NSString).substring(with: tokenRange)
    print("\(tag.rawValue): \(word)")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment