Last active
May 22, 2018 12:37
-
-
Save maziyarpanahi/9a70d64b5b86b2a73e5f70050e5d2fbe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.johnsnowlabs.nlp.{DocumentAssembler, Finisher} | |
import com.johnsnowlabs.nlp.annotators.{Normalizer, Stemmer, Tokenizer} | |
import com.johnsnowlabs.nlp.annotator._ | |
import com.johnsnowlabs.nlp.base._ | |
import com.johnsnowlabs.util.Benchmark | |
import org.apache.spark.ml.feature.NGram | |
import org.apache.spark.ml.Pipeline | |
import org.apache.spark.ml.feature.{StopWordsRemover, IDF, HashingTF, CountVectorizer, Word2Vec} | |
import org.apache.spark.sql.{Row, SparkSession} | |
import org.apache.spark.sql.functions._ | |
// Entry stage of the pipeline: reads the raw text from the column named by
// `textColumnName` (defined outside this snippet) and writes its output to the
// "document" column consumed by the sentence detector and the tokenizer.
// NOTE(review): the schema printed further down suggests the text column is
// "title" — confirm against where `textColumnName` is defined.
val documentAssembler = new DocumentAssembler()
  .setInputCol(textColumnName)
  .setOutputCol("document")
// Reads the "document" column and writes its output to "sentence".
// "sentence" is one of the two inputs of the POS tagger stage.
val sentenceDetector = new SentenceDetector()
  .setInputCols(Array("document"))
  .setOutputCol("sentence")
// Reads the "document" column and writes its output to "token".
// NOTE(review): the tokenizer is fed from "document", not from the "sentence"
// column produced by the sentence detector — confirm this is intentional.
val token = new Tokenizer()
  .setInputCols(Array("document"))
  .setOutputCol("token")
// Reads the "token" column and writes its output to "normalized".
// "normalized" feeds the stemmer and is the column the Finisher exports.
val normalizer = new Normalizer()
  .setInputCols(Array("token"))
  .setOutputCol("normalized")
// Reads the "normalized" column and writes its output to "stem".
// No later stage in this snippet consumes "stem"; it only shows up as an
// extra column in the transformed DataFrame.
val stemmer = new Stemmer()
  .setInputCols(Array("normalized"))
  .setOutputCol("stem")
// Reader options handed to the POS training-corpus loader.
val posOptions = Map("format" -> "text")

// Trainable part-of-speech stage: 5 training iterations, fed by the "sentence"
// and "token" columns, writing its tags to "pos".
// The training corpus is every file matching the HDFS glob below, loaded with
// readAs = "SPARK_DATASET" and "_" as the delimiter.
// NOTE(review): presumably each corpus entry is written as word_TAG — verify
// against the corpus files.
val posTagger = new PerceptronApproach()
  .setNIterations(5)
  .setInputCols(Array("sentence", "token"))
  .setOutputCol("pos")
  .setCorpus(
    path = "hdfs:///input/nlp/pos-tagger/masc_tagged/data/*",
    delimiter = "_",
    readAs = "SPARK_DATASET",
    options = posOptions
  )
// Exports the "normalized" annotations into the "tokens_array" column.
// - setOutputAsArray(true): the printed schema shows "tokens_array" as an
//   array of strings.
// - setCleanAnnotations(false): the annotation columns stay in the output; the
//   printed schema still lists document/sentence/token/normalized/stem/pos,
//   and the final query selects "pos.result".
val token_finisher = new Finisher()
  .setInputCols("normalized")
  .setOutputCols("tokens_array")
  .setCleanAnnotations(false)
  .setOutputAsArray(true)
// Assembles the stages in dependency order: each stage's input columns are
// produced by a stage listed before it.
val pipeline = new Pipeline()
  .setStages(Array(
    documentAssembler, // text column -> "document"
    sentenceDetector,  // "document" -> "sentence"
    token,             // "document" -> "token"
    normalizer,        // "token" -> "normalized"
    stemmer,           // "normalized" -> "stem"
    posTagger,         // "sentence" + "token" -> "pos"
    token_finisher     // "normalized" -> "tokens_array"
  ))
// Fits the whole pipeline on `test` (a dataset defined outside this snippet)
// inside Benchmark.time, labelled "Time to train model".
// NOTE(review): the model is fitted on `test` and later applied to `training`
// — confirm the two names are not swapped.
val model = Benchmark.time("Time to train model") {
  pipeline.fit(test) // test.count => res20: Long = 52414 (Wikipedia page titles)
}
/*
Time to train model: 1364.868964391sec
model: org.apache.spark.ml.PipelineModel = pipeline_8df5ba357611
*/
// Applies the fitted model to `training` (a dataset defined outside this
// snippet) inside Benchmark.time, labelled "Time for prediction".
// NOTE(review): Spark DataFrame transformations are lazy, so the ~0.137 s
// recorded for this step most likely covers only building the query plan; the
// annotation work itself would run when an action (count/show) is invoked on
// pipeLineDF — confirm before quoting this number as prediction time.
val pipeLineDF = Benchmark.time("Time for prediction") {
  model.transform(training)
}
/*
Time for prediction: 0.136970197sec
pipeLineDF: org.apache.spark.sql.DataFrame = [id: string, title: string ... 7 more fields]
*/
// Prints the schema of the transformed DataFrame (side-effecting call, hence
// the explicit parentheses).
pipeLineDF.printSchema()
/* | |
root | |
|-- id: string (nullable = true) | |
|-- title: string (nullable = true) | |
|-- document: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- annotatorType: string (nullable = true) | |
| | |-- begin: integer (nullable = false) | |
| | |-- end: integer (nullable = false) | |
| | |-- result: string (nullable = true) | |
| | |-- metadata: map (nullable = true) | |
| | | |-- key: string | |
| | | |-- value: string (valueContainsNull = true) | |
|-- sentence: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- annotatorType: string (nullable = true) | |
| | |-- begin: integer (nullable = false) | |
| | |-- end: integer (nullable = false) | |
| | |-- result: string (nullable = true) | |
| | |-- metadata: map (nullable = true) | |
| | | |-- key: string | |
| | | |-- value: string (valueContainsNull = true) | |
|-- token: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- annotatorType: string (nullable = true) | |
| | |-- begin: integer (nullable = false) | |
| | |-- end: integer (nullable = false) | |
| | |-- result: string (nullable = true) | |
| | |-- metadata: map (nullable = true) | |
| | | |-- key: string | |
| | | |-- value: string (valueContainsNull = true) | |
|-- normalized: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- annotatorType: string (nullable = true) | |
| | |-- begin: integer (nullable = false) | |
| | |-- end: integer (nullable = false) | |
| | |-- result: string (nullable = true) | |
| | |-- metadata: map (nullable = true) | |
| | | |-- key: string | |
| | | |-- value: string (valueContainsNull = true) | |
|-- stem: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- annotatorType: string (nullable = true) | |
| | |-- begin: integer (nullable = false) | |
| | |-- end: integer (nullable = false) | |
| | |-- result: string (nullable = true) | |
| | |-- metadata: map (nullable = true) | |
| | | |-- key: string | |
| | | |-- value: string (valueContainsNull = true) | |
|-- pos: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- annotatorType: string (nullable = true) | |
| | |-- begin: integer (nullable = false) | |
| | |-- end: integer (nullable = false) | |
| | |-- result: string (nullable = true) | |
| | |-- metadata: map (nullable = true) | |
| | | |-- key: string | |
| | | |-- value: string (valueContainsNull = true) | |
|-- tokens_array: array (nullable = true) | |
| |-- element: string (containsNull = true) | |
*/
// Row count of the transformed DataFrame.
// NOTE(review): pipeLineDF is not cached in this snippet, so this action and
// the show() below presumably each re-run the pipeline — confirm.
pipeLineDF.count()
// res58: Long = 5208697

// Displays the first 100 rows of the title next to its POS tags; the second
// argument (false) turns off truncation of long cell values.
pipeLineDF.select("title", "pos.result").show(100, false)
/* | |
+-------------------------------------------------------------------+--------------------------------------+ | |
|title |result | | |
+-------------------------------------------------------------------+--------------------------------------+ | |
|The Sneetches and Other Stories |[DT, NNP, CC, JJ, NNS] | | |
|Seocho-dong |[NNP] | | |
|Pelargonium sidoides |[NNP, NNS] | | |
|El Gran Combo de Puerto Rico |[NNP, NNP, NNP, IN, NNP, NNP] | | |
|Love Is the Plan the Plan Is Death |[NN, VBZ, DT, NNP, DT, NNP, VBZ, NN] | | |
|Pratima Kumari |[NNP, NNP] | | |
|List of tropical and subtropical moist broadleaf forests ecoregions|[NN, IN, JJ, CC, JJ, NN, NN, NNS, NNS]| | |
|2003–04 Segunda División |[CD, NNP, NNP] | | |
|Lifetouch |[NNP] | | |
|Metrostar |[NNP] | | |
|CANPASS |[NNP] | | |
|Fallen Angel (UK TV series) |[NNP, NNP, (, NNP, NNP, NN, )] | | |
|Kuni-kyō |[NNP] | | |
|Barham Salih |[NNP, NNP] | | |
|Chokher Bali (film) |[NNP, NNP, (, NN, )] | | |
|Durio dulcis |[NNP, NN] | | |
|Florentine painting |[NNP, NN] | | |
|Zoidogamy |[NNP] | | |
|PO postcode area |[NNP, NN, NN] | | |
|Eveleigh, New South Wales |[NNP, ,, NNP, NNP, NNP] | | |
|Android Nim |[NNP, NNP] | | |
|Kyle Dunnigan |[NNP, NNP] | | |
|Jawad Bashir |[NNP, NNP] | | |
|Continental O-190 |[NNP, NNP] | | |
|List of 3D graphics libraries |[NN, IN, CD, NNS, NNS] | | |
|British Universities and Colleges Sport |[JJ, NNP, CC, NNP, NNP] | | |
|Horns of Hattin |[NNP, IN, NNP] | | |
|Systemic risk |[NNP, NN] | | |
|Ho Ching |[NNP, NNP] | | |
|Blake's Lock |[NNP, POS, NNP] | | |
|Vincent Buckley |[NNP, NNP] | | |
|Steve Bozek |[NNP, NNP] | | |
|The Bird and the Worm |[DT, NNP, CC, DT, NNP] | | |
|MP3Gain |[NNP] | | |
|Lost City of the Jungle |[NNP, NNP, IN, DT, NNP] | | |
|Bible College of Malaysia |[NNP, NNP, IN, NNP] | | |
|Grease duct |[NNP, NN] | | |
|Air America (TV series) |[NNP, NNP, (, NN, NN, )] | | |
|Water Framework Directive |[NNP, NNP, NNP] | | |
|Regent Hotel |[NNP, NNP] | | |
|One-shot (comics) |[NNP, (, NNS, )] | | |
|Before We Were So Rudely Interrupted |[IN, PRP, VBD, RB, RB, NNP] | | |
|Lindauer Dornier |[NNP, NNP] | | |
|Mariner Software |[NNP, NNP] | | |
|The Fisher-Girl and the Crab |[DT, NNP, CC, DT, NNP] | | |
|7-orthoplex |[NN] | | |
|French military mission to Japan (1872–80) |[JJ, JJ, NN, TO, NNP, (, CD, )] | | |
|Hui Liangyu |[NNP, NNP] | | |
|Christine Arron |[NNP, NNP] | | |
|Moose test |[NNP, NN] | | |
|Arrasando (song) |[NNP, (, NN, )] | | |
|Daydream (1964 film) |[NNP, (, CD, NN, )] | | |
|Anecdote of Men by the Thousands |[NNP, IN, NN, IN, DT, NNS] | | |
|Strain (biology) |[NNP, (, NN, )] | | |
|Haustrinae |[NNP] | | |
|Cirrus Aircraft |[NNP, NNP] | | |
|Syracuse High School (Syracuse, Utah) |[NNP, NNP, NNP, (, NNP, ,, NNP, )] | | |
|Mezamashi TV |[NNP, NN] | | |
|Vermont statistical areas |[NNP, JJ, NNS] | | |
|Portugal during World War I |[NNP, IN, NNP, NNP, PRP] | | |
|Cycles (The Doobie Brothers album) |[NNP, (, DT, NNP, NNP, NN, )] | | |
|Inferior frontal sulcus |[NNP, JJ, NN] | | |
|Saskatchewan Highway 41 |[NNP, NNP, CD] | | |
|Barony Rosendal |[NNP, NNP] | | |
|Mishima ware |[NNP, NN] | | |
|Ijon Tichy |[NNP, NNP] | | |
|Wilusa |[NNP] | | |
|Thomas Dybdahl |[NNP, NNP] | | |
|Adam Gardiner |[NNP, NNP] | | |
|Fournier RF-9 |[NNP, NNP] | | |
|Ola Sundell |[NNP, NNP] | | |
|My Barbarian |[PRP$, NN] | | |
|2004–05 Iraqi Premier League |[CD, NNP, NNP, NNP] | | |
|Jean Wade Rindlaub |[NNP, NNP, NNP] | | |
|Miskel Spillman |[NNP, NNP] | | |
|Bonytail chub |[NNP, NN] | | |
|Japanese Journal of Applied Physics |[JJ, NNP, IN, NNP, NNP] | | |
|Disembowelment (band) |[NNP, (, NN, )] | | |
|Brethren of the Coast |[NNP, IN, DT, NNP] | | |
|Fly (exercise) |[NNP, (, NN, )] | | |
|Mathilde Krim |[NNP, NNP] | | |
|Usman Tariq |[NNP, NNP] | | |
|Christopher Plunkett, 1st Baron of Dunsany |[NNP, NNP, ,, CD, NNP, IN, NNP] | | |
|Wanna Get to Know You |[NNP, VB, TO, VB, PRP] | | |
|Yaxuna |[NNP] | | |
|Glass (Index Case album) |[NN, (, NNP, NN, NN, )] | | |
|Christy Hemme |[NNP, NNP] | | |
|Zod |[NNP] | | |
|River City High |[NNP, NNP, NNP] | | |
|William Fleming High School |[NNP, NNP, NNP, NNP] | | |
|Wee Waa |[NNP, NNP] | | |
|I3 |[NNP] | | |
|Stephen V. Cole |[NNP, NNP, NNP] | | |
|Royal Australian Army Nursing Corps |[NNP, JJ, NNP, NNP, NNP] | | |
|The Courier |[DT, NNP] | | |
|Olof Johansson |[NNP, NNP] | | |
|Solicitor General of the United States |[NNP, NNP, IN, DT, NNP, NNPS] | | |
|MWR |[NNP] | | |
|Michael Boyer |[NNP, NNP] | | |
|Common Fund for Commodities |[NNP, NNP, IN, NNP] | | |
+-------------------------------------------------------------------+--------------------------------------+ | |
only showing top 100 rows | |
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment