// Skip to content
//
// Instantly share code, notes, and snippets.
//
// Embed
// What would you like to do?
import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher}
import com.johnsnowlabs.nlp.annotators.{Normalizer, Stemmer, Tokenizer}
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.feature.NGram
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{StopWordsRemover, IDF, HashingTF, CountVectorizer, Word2Vec}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions._
// Entry stage: reads the column named by `textColumnName` (defined outside
// this snippet) and writes the `document` annotation column.
val documentAssembler = new DocumentAssembler()
  .setInputCol(textColumnName)
  .setOutputCol("document")
// SentenceDetector stage: consumes `document`, produces `sentence`.
val sentenceDetector = new SentenceDetector()
  .setInputCols(Array("document"))
  .setOutputCol("sentence")
// Tokenizer stage: consumes `document` (not `sentence`) and produces `token`.
val token = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")
// Normalizer stage: consumes `token`, produces `normalized`.
// `normalized` is the column the Finisher later converts to plain strings.
val normalizer = new Normalizer()
  .setInputCols(Array("token"))
  .setOutputCol("normalized")
// Stemmer stage: consumes `normalized`, produces `stem`.
// No later stage in this snippet reads `stem`; it only appears in the output.
val stemmer = new Stemmer()
  .setInputCols(Array("normalized"))
  .setOutputCol("stem")
// Reader options handed to the POS tagger's corpus loader.
val posOptions = Map("format" -> "text")
// Part-of-speech tagger that is trained when the pipeline is fitted:
// 5 iterations, consuming `sentence` and `token`, producing `pos`.
// The "_" delimiter is presumably the word/tag separator in the corpus files
// (TODO confirm against the data under the HDFS path).
val posTagger = new PerceptronApproach()
  .setNIterations(5)
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("pos")
  .setCorpus(
    path = "hdfs:///input/nlp/pos-tagger/masc_tagged/data/*",
    delimiter = "_",
    readAs = "SPARK_DATASET",
    options = posOptions
  )
// Finisher stage: turns the `normalized` annotations into the
// `tokens_array` column as an array of strings, and keeps the annotation
// columns in the result (cleanAnnotations = false).
val token_finisher = new Finisher()
  .setInputCols("normalized")
  .setOutputCols("tokens_array")
  .setOutputAsArray(true)
  .setCleanAnnotations(false)
// Assemble the stages in execution order; each stage reads columns that an
// earlier stage in this list produces.
val pipeline = new Pipeline()
  .setStages(
    Array(
      documentAssembler,
      sentenceDetector,
      token,
      normalizer,
      stemmer,
      posTagger,
      token_finisher
    )
  )
// Fit the pipeline on `test` and time the call with Benchmark.time.
// Per the original note, `test` holds 52414 rows (Wikipedia page titles).
val model = Benchmark.time("Time to train model") {
  pipeline.fit(test)
}
/*
Time to train model: 1364.868964391sec
model: org.apache.spark.ml.PipelineModel = pipeline_8df5ba357611
*/
// Apply the fitted model to `training` and time the call with Benchmark.time.
// NOTE(review): no action is triggered inside the timed block, and Spark
// DataFrame transformations are lazy, so the reported time most likely covers
// only building the query plan, not the annotation work itself - confirm by
// timing an action such as `count` on the result.
val pipeLineDF = Benchmark.time("Time for prediction") {
  model.transform(training)
}
/*
Time for prediction: 0.136970197sec
pipeLineDF: org.apache.spark.sql.DataFrame = [id: string, title: string ... 7 more fields]
*/
// Print the schema of the transformed DataFrame (recorded output follows).
pipeLineDF.printSchema()
/*
root
|-- id: string (nullable = true)
|-- title: string (nullable = true)
|-- document: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
|-- sentence: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
|-- token: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
|-- normalized: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
|-- stem: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
|-- pos: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
|-- tokens_array: array (nullable = true)
| |-- element: string (containsNull = true)
*/
// Count the rows of the transformed DataFrame (this triggers a Spark action).
pipeLineDF.count()
// res58: Long = 5208697
// Display 100 rows of `title` next to the POS tagger's `result` values,
// without truncating cell contents.
pipeLineDF
  .select("title", "pos.result")
  .show(numRows = 100, truncate = false)
/*
+-------------------------------------------------------------------+--------------------------------------+
|title |result |
+-------------------------------------------------------------------+--------------------------------------+
|The Sneetches and Other Stories |[DT, NNP, CC, JJ, NNS] |
|Seocho-dong |[NNP] |
|Pelargonium sidoides |[NNP, NNS] |
|El Gran Combo de Puerto Rico |[NNP, NNP, NNP, IN, NNP, NNP] |
|Love Is the Plan the Plan Is Death |[NN, VBZ, DT, NNP, DT, NNP, VBZ, NN] |
|Pratima Kumari |[NNP, NNP] |
|List of tropical and subtropical moist broadleaf forests ecoregions|[NN, IN, JJ, CC, JJ, NN, NN, NNS, NNS]|
|2003–04 Segunda División |[CD, NNP, NNP] |
|Lifetouch |[NNP] |
|Metrostar |[NNP] |
|CANPASS |[NNP] |
|Fallen Angel (UK TV series) |[NNP, NNP, (, NNP, NNP, NN, )] |
|Kuni-kyō |[NNP] |
|Barham Salih |[NNP, NNP] |
|Chokher Bali (film) |[NNP, NNP, (, NN, )] |
|Durio dulcis |[NNP, NN] |
|Florentine painting |[NNP, NN] |
|Zoidogamy |[NNP] |
|PO postcode area |[NNP, NN, NN] |
|Eveleigh, New South Wales |[NNP, ,, NNP, NNP, NNP] |
|Android Nim |[NNP, NNP] |
|Kyle Dunnigan |[NNP, NNP] |
|Jawad Bashir |[NNP, NNP] |
|Continental O-190 |[NNP, NNP] |
|List of 3D graphics libraries |[NN, IN, CD, NNS, NNS] |
|British Universities and Colleges Sport |[JJ, NNP, CC, NNP, NNP] |
|Horns of Hattin |[NNP, IN, NNP] |
|Systemic risk |[NNP, NN] |
|Ho Ching |[NNP, NNP] |
|Blake's Lock |[NNP, POS, NNP] |
|Vincent Buckley |[NNP, NNP] |
|Steve Bozek |[NNP, NNP] |
|The Bird and the Worm |[DT, NNP, CC, DT, NNP] |
|MP3Gain |[NNP] |
|Lost City of the Jungle |[NNP, NNP, IN, DT, NNP] |
|Bible College of Malaysia |[NNP, NNP, IN, NNP] |
|Grease duct |[NNP, NN] |
|Air America (TV series) |[NNP, NNP, (, NN, NN, )] |
|Water Framework Directive |[NNP, NNP, NNP] |
|Regent Hotel |[NNP, NNP] |
|One-shot (comics) |[NNP, (, NNS, )] |
|Before We Were So Rudely Interrupted |[IN, PRP, VBD, RB, RB, NNP] |
|Lindauer Dornier |[NNP, NNP] |
|Mariner Software |[NNP, NNP] |
|The Fisher-Girl and the Crab |[DT, NNP, CC, DT, NNP] |
|7-orthoplex |[NN] |
|French military mission to Japan (1872–80) |[JJ, JJ, NN, TO, NNP, (, CD, )] |
|Hui Liangyu |[NNP, NNP] |
|Christine Arron |[NNP, NNP] |
|Moose test |[NNP, NN] |
|Arrasando (song) |[NNP, (, NN, )] |
|Daydream (1964 film) |[NNP, (, CD, NN, )] |
|Anecdote of Men by the Thousands |[NNP, IN, NN, IN, DT, NNS] |
|Strain (biology) |[NNP, (, NN, )] |
|Haustrinae |[NNP] |
|Cirrus Aircraft |[NNP, NNP] |
|Syracuse High School (Syracuse, Utah) |[NNP, NNP, NNP, (, NNP, ,, NNP, )] |
|Mezamashi TV |[NNP, NN] |
|Vermont statistical areas |[NNP, JJ, NNS] |
|Portugal during World War I |[NNP, IN, NNP, NNP, PRP] |
|Cycles (The Doobie Brothers album) |[NNP, (, DT, NNP, NNP, NN, )] |
|Inferior frontal sulcus |[NNP, JJ, NN] |
|Saskatchewan Highway 41 |[NNP, NNP, CD] |
|Barony Rosendal |[NNP, NNP] |
|Mishima ware |[NNP, NN] |
|Ijon Tichy |[NNP, NNP] |
|Wilusa |[NNP] |
|Thomas Dybdahl |[NNP, NNP] |
|Adam Gardiner |[NNP, NNP] |
|Fournier RF-9 |[NNP, NNP] |
|Ola Sundell |[NNP, NNP] |
|My Barbarian |[PRP$, NN] |
|2004–05 Iraqi Premier League |[CD, NNP, NNP, NNP] |
|Jean Wade Rindlaub |[NNP, NNP, NNP] |
|Miskel Spillman |[NNP, NNP] |
|Bonytail chub |[NNP, NN] |
|Japanese Journal of Applied Physics |[JJ, NNP, IN, NNP, NNP] |
|Disembowelment (band) |[NNP, (, NN, )] |
|Brethren of the Coast |[NNP, IN, DT, NNP] |
|Fly (exercise) |[NNP, (, NN, )] |
|Mathilde Krim |[NNP, NNP] |
|Usman Tariq |[NNP, NNP] |
|Christopher Plunkett, 1st Baron of Dunsany |[NNP, NNP, ,, CD, NNP, IN, NNP] |
|Wanna Get to Know You |[NNP, VB, TO, VB, PRP] |
|Yaxuna |[NNP] |
|Glass (Index Case album) |[NN, (, NNP, NN, NN, )] |
|Christy Hemme |[NNP, NNP] |
|Zod |[NNP] |
|River City High |[NNP, NNP, NNP] |
|William Fleming High School |[NNP, NNP, NNP, NNP] |
|Wee Waa |[NNP, NNP] |
|I3 |[NNP] |
|Stephen V. Cole |[NNP, NNP, NNP] |
|Royal Australian Army Nursing Corps |[NNP, JJ, NNP, NNP, NNP] |
|The Courier |[DT, NNP] |
|Olof Johansson |[NNP, NNP] |
|Solicitor General of the United States |[NNP, NNP, IN, DT, NNP, NNPS] |
|MWR |[NNP] |
|Michael Boyer |[NNP, NNP] |
|Common Fund for Commodities |[NNP, NNP, IN, NNP] |
+-------------------------------------------------------------------+--------------------------------------+
only showing top 100 rows
*/
// Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment