# AdroitAnandAI/ml_pipeline.py

## ml_pipeline.py
# Configure an ML pipeline with three stages: Tokenizer, CountVectorizer,
# and LogisticRegression, then fit it once on the training data.
# https://spark.apache.org/docs/latest/ml-pipeline.html
#
# NOTE(review): Tokenizer, CountVectorizer, LogisticRegression, Pipeline and
# `training` are not defined in this snippet; they are assumed to be
# imported / created earlier in the file -- confirm.

# Stage 1: reads the "text" column and writes its output to "words".
# Refer: https://spark.apache.org/docs/latest/ml-features.html#tokenizer
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Stage 2: consumes the tokenizer's output column and writes "features".
# Per the Spark docs, minDF >= 1 is an absolute document count, so 2.0 keeps
# only terms that appear in at least two documents.
# Refer: https://spark.apache.org/docs/latest/ml-features.html#countvectorizer
cv = CountVectorizer(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
    minDF=2.0,
)

# Stage 3: the classifier. No input/label columns are passed, so it relies on
# its defaults -- presumably "features" / "label"; verify `training` has them.
lr = LogisticRegression(maxIter=10, regParam=0.001)

# Stages run in list order: tokenizer -> cv -> lr.
pipeline = Pipeline(stages=[tokenizer, cv, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)