@colbyford
Created April 30, 2019 14:50
Convert Spark DataFrame to Numpy Array for AutoML or Scikit-Learn
## PySpark Part
from pyspark.ml import PipelineModel
from pyspark.sql.functions import col

# Read the raw CSV into a Spark DataFrame.
dataset = spark.read.format("csv") \
    .options(header=True, inferSchema=True) \
    .load("/mnt/myfile.csv")

# Apply a previously fitted pipeline, which is expected to add the "features" vector column.
pipeline = PipelineModel.load("/mnt/pipeline/")
dataset = pipeline.transform(dataset)

# Split into train/test sets using the "data_split" column, keeping only the label and features.
train = dataset.where(col("data_split") == "train").select(col("label"), col("features"))
test = dataset.where(col("data_split") == "test").select(col("label"), col("features"))
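## For reference, a hypothetical sketch of how the PipelineModel loaded above
## might originally have been built and saved from the raw DataFrame
## ("col1"/"col2" and raw_dataset are placeholders, not from the original gist):
# from pyspark.ml import Pipeline
# from pyspark.ml.feature import VectorAssembler
# assembler = VectorAssembler(inputCols=["col1", "col2"], outputCol="features")
# Pipeline(stages=[assembler]).fit(raw_dataset).write().overwrite().save("/mnt/pipeline/")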
## Numpy Part
import numpy as np

## Training Data
# Collect to pandas, then unpack each Spark ML vector into a numpy row.
pdtrain = train.toPandas()
trainseries = pdtrain['features'].apply(lambda x: np.array(x.toArray())).values.reshape(-1, 1)
X_train = np.apply_along_axis(lambda x: x[0], 1, trainseries)
y_train = pdtrain['label'].values.reshape(-1, 1).ravel()

## Test Data
pdtest = test.toPandas()
testseries = pdtest['features'].apply(lambda x: np.array(x.toArray())).values.reshape(-1, 1)
X_test = np.apply_along_axis(lambda x: x[0], 1, testseries)
y_test = pdtest['label'].values.reshape(-1, 1).ravel()
print(y_test)
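
## Scikit-Learn Part (illustrative)
## A minimal sketch of feeding the converted arrays into scikit-learn;
## LogisticRegression is an assumed example estimator, not part of the original gist.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("Test accuracy:", clf.score(X_test, y_test))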