Skip to content

Instantly share code, notes, and snippets.

@sharmanirek
Created August 27, 2019 20:49
Show Gist options
  • Save sharmanirek/172c9408e8393462ae54dfae83764413 to your computer and use it in GitHub Desktop.
Save sharmanirek/172c9408e8393462ae54dfae83764413 to your computer and use it in GitHub Desktop.
PySpark MLlib || GMM clustering error
Display the source blob
Display the rendered blob
Raw
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pyspark
import sys
# Report the interpreter and PySpark versions so a run can be reproduced.
for version_line in (
    "python version = {}".format(sys.version),
    "pyspark version = {}".format(pyspark.__version__),
):
    print(version_line)
# In[2]:
from pyspark.sql.types import DoubleType, StringType, IntegerType, ArrayType
from pyspark.ml.clustering import GaussianMixture
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.mllib.linalg import Vectors, VectorUDT, DenseVector
from pyspark.sql.functions import udf
# Start (or reuse) a Spark session named 'GMM_Demo' with master "local[*]".
# NOTE: despite its name, `sc` is a SparkSession, not a SparkContext.
sc = SparkSession.builder.appName('GMM_Demo').master("local[*]").getOrCreate()
# SQLContext's first parameter is a SparkContext, so pass the session's
# underlying context and reuse the existing session instead of putting the
# SparkSession object in the SparkContext slot.
sql = SQLContext(sc.sparkContext, sparkSession=sc)
# header=True takes the column names from the first row of the file.  No
# schema inference is requested, so (per the CSV reader's defaults) the
# columns are presumably loaded as strings.
df = sc.read.csv("GMM_demo_data.csv", header=True)
# In[3]:
# pyspark.ml estimators (such as GaussianMixture) document their features
# column as a pyspark.ml.linalg vector, so the textual 'pcaFeatures' column is
# parsed into a DenseVector instead of an array<double>.  The aliases keep
# these classes distinct from the legacy pyspark.mllib.linalg names imported
# at the top of the file, which are a different, incompatible vector type.
from pyspark.ml.linalg import Vectors as MLVectors, VectorUDT as MLVectorUDT


def parse_vector(text):
    """Convert a bracketed, comma-separated string into a dense ML vector.

    Args:
        text: A string such as "[0.1, 0.2, 0.3]", or None for a null cell.

    Returns:
        A pyspark.ml.linalg.DenseVector holding the parsed floats, or None
        when ``text`` is None so that nulls propagate instead of raising
        AttributeError inside the UDF.

    Raises:
        ValueError: If any comma-separated piece is not a valid float.
    """
    if text is None:
        return None
    # Strip the brackets once up front; float() ignores surrounding whitespace.
    cleaned = text.replace('[', '').replace(']', '')
    return MLVectors.dense([float(piece) for piece in cleaned.split(',')])


# Return type of the UDF: the SQL user-defined type for ML vectors.
new_schema = MLVectorUDT()
string_parse = udf(parse_vector, new_schema)
df = df.withColumn('features', string_parse('pcaFeatures'))
# In[45]:
# Configure a 7-component Gaussian mixture with a fixed seed.  It reads the
# 'features' column and writes its results to 'predictionCol' and
# 'probabilityCol'.
# NOTE: `model` holds the unfitted estimator; the fitted model is `fitted_model`.
model = GaussianMixture(
    featuresCol='features',
    predictionCol='predictionCol',
    probabilityCol='probabilityCol',
    k=7,
    seed=1234,
)
fitted_model = model.fit(df)
output = fitted_model.transform(df)
# In[50]:
# Collect the 'probabilityCol' column to the driver as a pandas DataFrame;
# as the last expression of the cell it is what a notebook would display.
probabilities = output.select('probabilityCol')
probabilities.toPandas()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment