Skip to content

Instantly share code, notes, and snippets.

View JakobLS's full-sized avatar
🙂

Jakob Salomonsson JakobLS

🙂
  • Gyodi AB
  • Madrid
  • 07:26 (UTC +01:00)
View GitHub Profile
# Every column after the first is treated as a numerical input
numerical_features = list(sh_data.columns[1:])

# Numerical preprocessing steps: pick the numerical columns, apply the
# custom percentile transform (percentile=0.90, healthy_class=9 —
# exact semantics live in PercentileTransformer), then standardise.
numerical_steps = [
    ('numerical_selector', FeatureSelector(numerical_features)),
    ('PercentileTransformer', PercentileTransformer(percentile=0.90,
                                                    healthy_class=9)),
    ('StandardScaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)
# Predictors are every column after the first two; target is the tumor type
X = sh_data.iloc[:, 2:]
Y = sh_data['Tumor type']

# Meta-estimator that learns how to combine the base estimators' outputs
final_estimator = LogisticRegression(max_iter=500)

# Stacking classifier over the base estimators in `to_stack`
# (mapping of name -> estimator). `items()` already yields the
# (name, estimator) pairs StackingClassifier expects — no need to
# zip keys with values. The original call was also left unclosed
# (missing final_estimator and the closing paren).
stclf = ensemble.StackingClassifier(estimators=list(to_stack.items()),
                                    final_estimator=final_estimator)
# Predictors are the numerical feature columns; target is the tumor type
X = sh_data[numerical_features]
Y = sh_data['Tumor type']

# Stratified 80/20 split with a fixed seed for reproducibility
trainX, testX, trainY, testY = train_test_split(
    X, Y, test_size=0.2, random_state=89, stratify=Y)

# Base classifiers: logistic regression (raised iteration cap so the
# solver converges) and k-nearest neighbours with default settings
logReg = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Custom transformer that selects the columns passed to its constructor.

    Parameters
    ----------
    feature_names : list of str
        Names of the columns to keep when transforming a DataFrame.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn (sklearn convention
        # is to return self so fit can be chained)
        return self

    def transform(self, X, y=None):
        # Required by TransformerMixin so the class works inside a
        # Pipeline (fit_transform); the original block was missing it.
        # Returns only the configured columns of X.
        return X[self.feature_names]
# Gradient-boosted trees configured with the best hyper-parameters
# found in the previous tuning step
gb_best = GBTRegressor(maxDepth=5, subsamplingRate=0.7)

# Chain label indexing, feature assembly, and the tuned GBT model
pipeline = Pipeline(stages=[target, featureAssembler, gb_best])

# Fit the full pipeline on the training split
final_model = pipeline.fit(df_train)
def evaluate_model(model, trainSet, testSet):
    """Score a fitted model on a train and a test set and return both RMSEs.

    Parameters
    ----------
    model : fitted pipeline/model exposing ``.transform(dataset)``
    trainSet, testSet : datasets to score (Spark DataFrames)

    Returns
    -------
    tuple
        ``(rmse_train, rmse_test)``.

    NOTE(review): relies on a module-level ``evaluator`` (a regression
    evaluator with ``evaluate`` and ``metricName``) defined elsewhere.
    """
    # Score the training data and measure RMSE
    pred_train = model.transform(trainSet)
    rmse_train = evaluator.evaluate(pred_train, {evaluator.metricName: "rmse"})
    # Score the test data and measure RMSE
    pred_test = model.transform(testSet)
    rmse_test = evaluator.evaluate(pred_test, {evaluator.metricName: "rmse"})
    # The original computed both metrics and silently discarded them
    # (implicit return None); return them so callers can actually use them.
    return rmse_train, rmse_test
# Baseline model: linear regression with all defaults
lr = LinearRegression()

# Same preprocessing stages as before, with the baseline model at the end
pipeline = Pipeline(stages=[target, featureAssembler, lr])

# Train the baseline on the training split
baseline = pipeline.fit(df_train)
# Target variable: index the string "Rating" column into a numeric "label"
target = StringIndexer(inputCol="Rating",
                       outputCol="label")

# Feature columns to feed to the assembler.
# NOTE(review): the original list literal was never closed (the snippet
# was truncated mid-list) — confirm no further columns are missing.
input_cols = [
    "YearOfRelease",
    "MovieID",
    "CustomerID",
    "RatingYear",
    "RatingMonth",
]
# 80/20 train/test split with a fixed seed for reproducibility
splits = df.randomSplit([0.8, 0.2], seed=747)
df_train, df_test = splits

# Hash the two high-cardinality ID columns into fixed-size vector columns
hasherM = FeatureHasher().setInputCols(["MovieID"]).setOutputCol("HashedMovieID")
hasherC = FeatureHasher().setInputCols(["CustomerID"]).setOutputCol("HashedCustomerID")

# Run both hashers over the full dataset
# (note: this happens after the split, so the hashed columns are only
# on `df`, not on df_train/df_test — order preserved from the original)
df = hasherM.transform(df)
df = hasherC.transform(df)