profh/nlp_parts_of_speech.swift

## nlp_parts_of_speech.swift
// Playground using NSLinguisticTagger to analyze and tag a block of text
// https://developer.apple.com/documentation/foundation/nslinguistictagger/identifying_parts_of_speech

import Foundation

// Sample input for the tagger: a multi-sentence passage from an article on
// Democratic candidate preferences (Nov 2018). Note that it contains
// abbreviations ending in periods ("Rep.", "Sens.", "N.J.") and mixes straight
// (') and typographic (’) apostrophes.
let text = "The Democratic frontrunner, according to Politico's poll: Joe Biden, former Vice President and Senator from Delaware, who managed to grab just over a quarter (26%) of the Democrats' vote for who they'd most like to see facing off against Trump in two years for control of the White House. The runner-up is Vermont Senator Bernie Sanders, who ran a close primary campaign against Hillary Clinton in 2016, managing to get about a fifth of the votes (19%). The third-place candidate is Rep. Beto O’Rourke from Texas, who built national name-recognition through his losing Senate bid last week, with 8 percent. Following O’Rourke are three senators, all thought to be likely candidates: Sens. Elizabeth Warren (Mass.) at 5 percent, Kamala Harris (Calif.) at 4 percent and Cory Booker (N.J.) at 3 percent."

// Get the first sentence of the text: everything up to and including the
// first period. The sample text is a fixed literal that contains a period,
// so the force-unwrap cannot fail here.
let endOfSentence = text.firstIndex(of: ".")!
let sentence = text[...endOfSentence]


// Set up a tagger that only needs the lexical-class (part-of-speech) scheme
let tagger = NSLinguisticTagger(tagSchemes: [NSLinguisticTagScheme.lexicalClass], options: 0)

// Hand the whole text to the tagger; `range` below limits what is enumerated
tagger.string = text

// Range of the text to be analyzed: just the first sentence.
// NSRange counts UTF-16 code units, so it is derived from the sentence's own
// string indices rather than from a length counted by hand in another tool;
// it stays correct if the text is edited. (`sentence` is a Substring of
// `text`, so its indices are valid in `text`.)
let range = NSRange(sentence.startIndex..<sentence.endIndex, in: text)
// let range = NSMakeRange(0, text.utf16.count)  // the entire block


// Tagger options: skip punctuation and whitespace tokens, and report
// multi-word names as a single token
let options: NSLinguisticTagger.Options = [.omitPunctuation, .omitWhitespace, .joinNames]

// The lexical classes of interest
// (full list: https://developer.apple.com/documentation/foundation/nslinguistictag)
let tags: [NSLinguisticTag] = [.noun, .verb, .adjective]

// One result bucket per lexical class of interest
var nouns = [String]()
var verbs = [String]()
var adjs = [String]()


// The core step: visit every word in `range` and file it under its lexical class
tagger.enumerateTags(in: range, unit: .word, scheme: .lexicalClass, options: options) { lexicalClass, wordRange, _ in
  // Nothing to record when the tagger could not assign a tag
  guard let lexicalClass = lexicalClass else { return }

  // `wordRange` is an NSRange, so slice through NSString
  let word = (text as NSString).substring(with: wordRange)

  // Nouns, verbs and adjectives are collected; every other class is ignored
  if lexicalClass == .noun {
    nouns.append(word)
  } else if lexicalClass == .verb {
    verbs.append(word)
  } else if lexicalClass == .adjective {
    adjs.append(word)
  }
}

// Print each result list under its heading, with a dashed line between lists
for (position, section) in [("NOUNS:", nouns), ("VERBS:", verbs), ("ADJECTIVES:", adjs)].enumerated() {
  // The separator goes between sections, so not before the first one
  if position > 0 {
    print("--------")
  }
  print(section.0)
  section.1.forEach { print($0) }
}

//** Adding language detection...  **//
print("--------")
// Verify that the dominant language of the tagger's current string is English

let language = tagger.dominantLanguage
// `dominantLanguage` is optional: fall back to "und" (the BCP-47 tag for an
// undetermined language) instead of force-unwrapping and crashing on nil
print("The language is \(language ?? "und")")

/// Prints the dominant language the tagger detects for `text`.
///
/// Reusable form of the language check. Note that it replaces the string held
/// by the shared `tagger`, so whatever the tagger was analyzing before is gone.
///
/// - Parameter text: The string whose language should be identified.
func determineLanguage(for text: String) {
  tagger.string = text
  // `dominantLanguage` is optional: print "und" (the BCP-47 tag for an
  // undetermined language) instead of force-unwrapping and crashing on nil
  let language = tagger.dominantLanguage ?? "und"
  print("The language is \(language)")
}

// Sample phrases for language detection. The trailing notes on the last few
// record what the original author saw the tagger report for them.
let frenchQuote  = "La science n'a pas de patrie."  // Science has no homeland -- Pasteur
let germanQuote  = "Das ist nicht mein Bier." // That's not my beer
let italianQuote = "L’amore è cieco" // Love is blind
let spanishQuote = "El amor todo lo puede." // Love will find a way
let klingonQuote = "Heghlu'meH QaQ jajvam" // It is a good day to die; reported as Croatian, not Klingon
let loremQuote   = "Lorem ipsum dolor sit amet" // reported as Romanian
let gibberish    = "asdf plmjus qawsedrf" // reported as English too...

// Run the detector over every sample in order, finishing with a single
// letter, for which the result is undefined
for sample in [frenchQuote, germanQuote, italianQuote, spanishQuote, klingonQuote, loremQuote, gibberish, "x"] {
  determineLanguage(for: sample)
}
	// Playground using NSLinguisticTagger to analyze and tag a block of text
	// https://developer.apple.com/documentation/foundation/nslinguistictagger/identifying_parts_of_speech

	import Foundation

	// Sample input for the tagger: a multi-sentence passage from an article on
	// Democratic candidate preferences (Nov 2018). Note that it contains
	// abbreviations ending in periods ("Rep.", "Sens.", "N.J.") and mixes straight
	// (') and typographic (’) apostrophes.
	let text = "The Democratic frontrunner, according to Politico's poll: Joe Biden, former Vice President and Senator from Delaware, who managed to grab just over a quarter (26%) of the Democrats' vote for who they'd most like to see facing off against Trump in two years for control of the White House. The runner-up is Vermont Senator Bernie Sanders, who ran a close primary campaign against Hillary Clinton in 2016, managing to get about a fifth of the votes (19%). The third-place candidate is Rep. Beto O’Rourke from Texas, who built national name-recognition through his losing Senate bid last week, with 8 percent. Following O’Rourke are three senators, all thought to be likely candidates: Sens. Elizabeth Warren (Mass.) at 5 percent, Kamala Harris (Calif.) at 4 percent and Cory Booker (N.J.) at 3 percent."

	// Get the first sentence of the text: everything up to and including the
	// first period. The sample text is a fixed literal that contains a period,
	// so the force-unwrap cannot fail here.
	let endOfSentence = text.firstIndex(of: ".")!
	let sentence = text[...endOfSentence]


	// Set up a tagger that only needs the lexical-class (part-of-speech) scheme
	let tagger = NSLinguisticTagger(tagSchemes: [NSLinguisticTagScheme.lexicalClass], options: 0)

	// Hand the whole text to the tagger; `range` below limits what is enumerated
	tagger.string = text

	// Range of the text to be analyzed: just the first sentence.
	// NSRange counts UTF-16 code units, so it is derived from the sentence's own
	// string indices rather than from a length counted by hand in another tool;
	// it stays correct if the text is edited. (`sentence` is a Substring of
	// `text`, so its indices are valid in `text`.)
	let range = NSRange(sentence.startIndex..<sentence.endIndex, in: text)
	// let range = NSMakeRange(0, text.utf16.count) // the entire block


	// Tagger options: skip punctuation and whitespace tokens, and report
	// multi-word names as a single token
	let options: NSLinguisticTagger.Options = [.omitPunctuation, .omitWhitespace, .joinNames]

	// The lexical classes of interest
	// (full list: https://developer.apple.com/documentation/foundation/nslinguistictag)
	let tags: [NSLinguisticTag] = [.noun, .verb, .adjective]

	// One result bucket per lexical class of interest
	var nouns = [String]()
	var verbs = [String]()
	var adjs = [String]()


	// The core step: visit every word in `range` and file it under its lexical class
	tagger.enumerateTags(in: range, unit: .word, scheme: .lexicalClass, options: options) { lexicalClass, wordRange, _ in
		// Nothing to record when the tagger could not assign a tag
		guard let lexicalClass = lexicalClass else { return }

		// `wordRange` is an NSRange, so slice through NSString
		let word = (text as NSString).substring(with: wordRange)

		// Nouns, verbs and adjectives are collected; every other class is ignored
		if lexicalClass == .noun {
			nouns.append(word)
		} else if lexicalClass == .verb {
			verbs.append(word)
		} else if lexicalClass == .adjective {
			adjs.append(word)
		}
	}

	// Print each result list under its heading, with a dashed line between lists
	for (position, section) in [("NOUNS:", nouns), ("VERBS:", verbs), ("ADJECTIVES:", adjs)].enumerated() {
		// The separator goes between sections, so not before the first one
		if position > 0 {
			print("--------")
		}
		print(section.0)
		section.1.forEach { print($0) }
	}

	// Adding language detection... //
	print("--------")
	// Verify that the dominant language of the tagger's current string is English

	let language = tagger.dominantLanguage
	// `dominantLanguage` is optional: fall back to "und" (the BCP-47 tag for an
	// undetermined language) instead of force-unwrapping and crashing on nil
	print("The language is \(language ?? "und")")

	/// Prints the dominant language the tagger detects for `text`.
	///
	/// Reusable form of the language check. Note that it replaces the string held
	/// by the shared `tagger`, so whatever the tagger was analyzing before is gone.
	///
	/// - Parameter text: The string whose language should be identified.
	func determineLanguage(for text: String) {
		tagger.string = text
		// `dominantLanguage` is optional: print "und" (the BCP-47 tag for an
		// undetermined language) instead of force-unwrapping and crashing on nil
		let language = tagger.dominantLanguage ?? "und"
		print("The language is \(language)")
	}

	// Sample phrases for language detection. The trailing notes on the last few
	// record what the original author saw the tagger report for them.
	let frenchQuote = "La science n'a pas de patrie." // Science has no homeland -- Pasteur
	let germanQuote = "Das ist nicht mein Bier." // That's not my beer
	let italianQuote = "L’amore è cieco" // Love is blind
	let spanishQuote = "El amor todo lo puede." // Love will find a way
	let klingonQuote = "Heghlu'meH QaQ jajvam" // It is a good day to die; reported as Croatian, not Klingon
	let loremQuote = "Lorem ipsum dolor sit amet" // reported as Romanian
	let gibberish = "asdf plmjus qawsedrf" // reported as English too...

	// Run the detector over every sample in order, finishing with a single
	// letter, for which the result is undefined
	for sample in [frenchQuote, germanQuote, italianQuote, spanishQuote, klingonQuote, loremQuote, gibberish, "x"] {
		determineLanguage(for: sample)
	}