Skip to content

Instantly share code, notes, and snippets.

@profh
Created November 9, 2021 21:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save profh/479d63b1dfcb5a002c19b74daf323676 to your computer and use it in GitHub Desktop.
Save profh/479d63b1dfcb5a002c19b74daf323676 to your computer and use it in GitHub Desktop.
Contents of playground for NLP example (parts of speech and language recognition)
// Playground using NSLinguisticTagger to analyze and tag a block of text
// https://developer.apple.com/documentation/foundation/nslinguistictagger/identifying_parts_of_speech
import Foundation
// Sample text: an article excerpt on Democratic candidate preferences (Nov 2018)
let text = "The Democratic frontrunner, according to Politico's poll: Joe Biden, former Vice President and Senator from Delaware, who managed to grab just over a quarter (26%) of the Democrats' vote for who they'd most like to see facing off against Trump in two years for control of the White House. The runner-up is Vermont Senator Bernie Sanders, who ran a close primary campaign against Hillary Clinton in 2016, managing to get about a fifth of the votes (19%). The third-place candidate is Rep. Beto O’Rourke from Texas, who built national name-recognition through his losing Senate bid last week, with 8 percent. Following O’Rourke are three senators, all thought to be likely candidates: Sens. Elizabeth Warren (Mass.) at 5 percent, Kamala Harris (Calif.) at 4 percent and Cory Booker (N.J.) at 3 percent."

// The first sentence is everything up to and including the first period.
// If the text had no period, fall back to its last character so the whole text
// is analyzed instead of crashing on a force-unwrap (text is a non-empty literal).
let endOfSentence = text.firstIndex(of: ".") ?? text.index(before: text.endIndex)
let sentence = text[...endOfSentence]

// Set up a tagger that labels tokens with their lexical class
let tagger = NSLinguisticTagger(tagSchemes: [.lexicalClass], options: 0)
// The tagger analyzes the full text; `range` below narrows what gets enumerated
tagger.string = text

// Range to analyze, expressed in UTF-16 units as NSRange requires.
// Derived from the sentence bounds rather than a hand-counted length, so it
// stays in sync with `sentence` if `text` is ever edited.
let range = NSRange(text.startIndex...endOfSentence, in: text)
// let range = NSRange(text.startIndex..., in: text) // the entire block

// Tagger options (pretty typical choices): skip punctuation and whitespace
// tokens, and join multi-word names into a single token
let options: NSLinguisticTagger.Options = [.omitPunctuation, .omitWhitespace, .joinNames]

// The lexical classes of interest (the switch below handles exactly these three)
// other tag options: https://developer.apple.com/documentation/foundation/nslinguistictag
let tags: [NSLinguisticTag] = [.noun, .verb, .adjective]

// Result buckets, one per lexical class
var nouns: [String] = []
var verbs: [String] = []
var adjs: [String] = []

// The meat of the operation: walk the word tokens in `range` and bucket each by tag
tagger.enumerateTags(in: range, unit: .word, scheme: .lexicalClass, options: options) { tag, tokenRange, _ in
    // just in case there is no tag...
    guard let tag = tag else { return }
    // tokenRange is an NSRange, so slice through NSString
    let token = (text as NSString).substring(with: tokenRange)
    // depending on the tag, add to the appropriate results array
    switch tag {
    case .noun: nouns.append(token)
    case .verb: verbs.append(token)
    case .adjective: adjs.append(token)
    default: break
    }
}
// Display the results: one titled section per word class, with a dashed
// rule between consecutive sections
for (position, section) in [("NOUNS:", nouns), ("VERBS:", verbs), ("ADJECTIVES:", adjs)].enumerated() {
    if position > 0 {
        print("--------")
    }
    print(section.0)
    section.1.forEach { print($0) }
}
//** Adding language detection... **//
print("--------")
// Verify that the dominant language of the tagger's current string is English.
// `dominantLanguage` is optional, so fall back to "und" (the BCP-47 tag for
// an undetermined language) rather than force-unwrapping and crashing on nil.
let language = tagger.dominantLanguage
print("The language is \(language ?? "und")")
// The language check as a reusable function
/// Prints the dominant language detected for `text`.
///
/// Side effect: replaces the string held by the shared `tagger`, so that
/// tagger analyzes `text` from this call onward.
///
/// - Parameter text: The text whose language should be identified.
func determineLanguage(for text: String) {
    tagger.string = text
    // `dominantLanguage` is optional; report "und" (undetermined) instead of
    // crashing on a force-unwrap when no language comes back
    let language = tagger.dominantLanguage ?? "und"
    print("The language is \(language)")
}
// Sample phrases for the language detector, each with its English meaning
let frenchQuote = "La science n'a pas de patrie." // "Science has no homeland" -- Pasteur
let germanQuote = "Das ist nicht mein Bier." // "That's not my beer"
let italianQuote = "L’amore è cieco" // "Love is blind"
let spanishQuote = "El amor todo lo puede." // "Love will find a way"
let klingonQuote = "Heghlu'meH QaQ jajvam" // "It is a good day to die"
let loremQuote = "Lorem ipsum dolor sit amet"
let gibberish = "asdf plmjus qawsedrf"

// Run the detector over every sample in order; the trailing notes record
// what the author observed for the trickier inputs
for sample in [
    frenchQuote,
    germanQuote,
    italianQuote,
    spanishQuote,
    klingonQuote, // thinks it's Croatian, not Klingon
    loremQuote, // thinks it's Romanian
    gibberish, // that's English too...
    "x", // this is undefined
] {
    determineLanguage(for: sample)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment