Skip to content

Instantly share code, notes, and snippets.

Lakshay lakshay-arora

Block or report user

Report or block lakshay-arora

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View pipeline_5_pyspark.py
# create a sample data without the labels
# unlabeled sample rows (same four feature columns as training, no label)
unlabeled_rows = [
    (3.0, 'Z', 'S10', 40),
    (1.0, 'X', 'E10', 20),
    (4.0, 'A', 'S20', 10),
    (3.0, 'A', 'S10', 20),
    (4.0, 'X', 'D10', 30),
    (1.0, 'Z', 'E10', 20),
    (4.0, 'A', 'S10', 30),
]
# build the test DataFrame from the rows and an explicit column schema
sample_data_test = spark.createDataFrame(
    unlabeled_rows,
    ['feature_1', 'feature_2', 'feature_3', 'feature_4'],
)
View pipeline_4_pyspark.py
# stage 1: index the string column feature_2 into numeric feature_2_index
stage_1 = StringIndexer(inputCol='feature_2', outputCol='feature_2_index')
# stage 2: index the string column feature_3 into numeric feature_3_index
stage_2 = StringIndexer(inputCol='feature_3', outputCol='feature_3_index')
# stage 3: one-hot encode the two indexed columns produced by stages 1 and 2
# NOTE(review): OneHotEncoderEstimator was renamed OneHotEncoder in Spark 3.0 —
# confirm the Spark version this gist targets before upgrading
stage_3 = OneHotEncoderEstimator(
    inputCols=[stage_1.getOutputCol(), stage_2.getOutputCol()],
    outputCols=['feature_2_encoded', 'feature_3_encoded'],
)
# stage 4: assemble the raw and encoded features into one 'features' vector
# for the downstream logistic regression model
stage_4 = VectorAssembler(
    inputCols=['feature_1', 'feature_2_encoded', 'feature_3_encoded', 'feature_4'],
    outputCol='features',
)
View pipeline_3_pyspark.py
from pyspark.ml.classification import LogisticRegression
# create a sample dataframe with 4 features and 1 label column
sample_data_train = spark.createDataFrame([
(2.0, 'A', 'S10', 40, 1.0),
(1.0, 'X', 'E10', 25, 1.0),
(4.0, 'X', 'S20', 10, 0.0),
(3.0, 'Z', 'S10', 20, 0.0),
(4.0, 'A', 'E10', 30, 1.0),
(2.0, 'Z', 'S10', 40, 0.0),
View pipeline_2_pyspark.py
# stage 1: index the string column category_1 into numeric category_1_index
stage_1 = StringIndexer(inputCol='category_1', outputCol='category_1_index')
# stage 2: index the string column category_2 into numeric category_2_index
stage_2 = StringIndexer(inputCol='category_2', outputCol='category_2_index')
# stage 3: one-hot encode the indexed category_2 column
stage_3 = OneHotEncoderEstimator(
    inputCols=['category_2_index'],
    outputCols=['category_2_OHE'],
)
# chain the three stages so fit/transform run them in order
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3])
View pipeline_1_pyspark.py
from pyspark.ml import Pipeline
# toy data: an integer id plus two categorical string columns
category_rows = [
    (1, 'L101', 'R'),
    (2, 'L201', 'C'),
    (3, 'D111', 'R'),
    (4, 'F210', 'R'),
    (5, 'D110', 'C'),
]
sample_df = spark.createDataFrame(category_rows, ['id', 'category_1', 'category_2'])
View vector_assembler.py
from pyspark.ml.feature import VectorAssembler
# specify the input and output columns of the vector assembler
assembler = VectorAssembler(inputCols=['Isboundary',
'Iswicket',
'Over',
'Runs',
'Batsman_Index',
'Bowler_Index',
'Batsman_OHE',
View ohe_pyspark.py
# one-hot encode both index columns in a single estimator
OHE = OneHotEncoderEstimator(
    inputCols=['Batsman_Index', 'Bowler_Index'],
    outputCols=['Batsman_OHE', 'Bowler_OHE'],
)
# fit learns the category cardinalities, transform appends the encoded columns
my_data = OHE.fit(my_data).transform(my_data)
# show each name next to its numeric index and one-hot vector (first 10 rows)
my_data.select('Batsman_Name', 'Batsman_Index', 'Batsman_OHE',
               'Bowler_Name', 'Bowler_Index', 'Bowler_OHE').show(10)
View string_index.py
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator
# build one StringIndexer per name column: each maps distinct strings
# to numeric indices in a new *_Index column
SI_batsman = StringIndexer(inputCol='Batsman_Name', outputCol='Batsman_Index')
SI_bowler = StringIndexer(inputCol='Bowler_Name', outputCol='Bowler_Index')
# fit each indexer on the data, then append its indexed column
for indexer in (SI_batsman, SI_bowler):
    my_data = indexer.fit(my_data).transform(my_data)
View describe_pyspark.py
# summary statistics (count, mean, stddev, min, max) for the numeric columns
numeric_cols = my_data.select('Isball', 'Isboundary', 'Runs')
numeric_cols.describe().show()
View value_counts_pyspark.py
# per-batsman row counts — the Spark equivalent of pandas value_counts
batsman_counts = my_data.groupBy('Batsman_Name').count()
batsman_counts.show()
You can’t perform that action at this time.