Skip to content

Instantly share code, notes, and snippets.

@mitallast
Created November 28, 2015 10:05
Show Gist options
  • Save mitallast/2930494e09e44865e6d5 to your computer and use it in GitHub Desktop.
Save mitallast/2930494e09e44865e6d5 to your computer and use it in GitHub Desktop.
Apache Spark text classifier
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.feature import HashingTF
# Train a Naive Bayes text classifier with Spark MLlib and report its accuracy
# on a held-out split. `sc` (a SparkContext) must already be in scope, e.g.
# when this script is run from the pyspark shell.

# Each input line is expected to look like "<label>,<text>".
textFile = sc.textFile("sells.csv")

# Hash tokens into a fixed-size (100000 features) term-frequency vector.
htf = HashingTF(100000)

# Split on the FIRST comma only, so commas inside the text stay in the text.
# NOTE(review): LabeledPoint receives the raw label string; this presumably
# requires labels that parse as numbers -- confirm against the data file.
label_and_text = textFile.map(lambda line: line.split(',', 1))
data = label_and_text.map(
    lambda parts: LabeledPoint(parts[0], htf.transform(parts[1].split(" "))))

# 60% of the rows are used for training, 40% for evaluation.
d_train, d_test = data.randomSplit([0.6, 0.4])
model = NaiveBayes.train(d_train)

# Pair every prediction with the true label of the same test row.
prediction_and_labels = d_test.map(
    lambda point: (model.predict(point.features), point.label))

# Index into the pair instead of using a tuple-unpacking lambda, which is a
# syntax error on Python 3.
correct = prediction_and_labels.filter(lambda pair: pair[0] == pair[1])

# The denominator is the size of the test split (the original referenced an
# undefined name here, which raised NameError). float() keeps the division
# from truncating under Python 2.
accuracy = correct.count() / float(d_test.count())

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Classifier correctly predicted category " + str(accuracy * 100) + " percent of the time")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment