Skip to content

Instantly share code, notes, and snippets.

View JakobLS's full-sized avatar
🙂

Jakob Salomonsson JakobLS

🙂
  • Gyodi AB
  • Madrid
  • 07:26 (UTC +01:00)
View GitHub Profile
# Every column after the first is treated as a numerical input
numerical_features = list(sh_data.columns[1:])

# Numerical preprocessing steps: pick the numerical columns, apply the
# custom percentile transform (percentile=0.90, healthy_class=9 —
# exact semantics live in PercentileTransformer), then standardise.
numerical_steps = [
    ('numerical_selector', FeatureSelector(numerical_features)),
    ('PercentileTransformer', PercentileTransformer(percentile=0.90,
                                                    healthy_class=9)),
    ('StandardScaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)
# Predictors are every column after the first two; target is the tumor type
X = sh_data.iloc[:, 2:]
Y = sh_data['Tumor type']

# Meta-estimator that learns how to combine the base estimators' outputs
final_estimator = LogisticRegression(max_iter=500)

# Stacking classifier over the base estimators in `to_stack`
# (mapping of name -> estimator). `items()` already yields the
# (name, estimator) pairs StackingClassifier expects — no need to
# zip keys with values. The original call was also left unclosed
# (missing final_estimator and the closing paren).
stclf = ensemble.StackingClassifier(estimators=list(to_stack.items()),
                                    final_estimator=final_estimator)
# Predictors are the numerical feature columns; target is the tumor type
X = sh_data[numerical_features]
Y = sh_data['Tumor type']

# Stratified 80/20 split with a fixed seed for reproducibility
trainX, testX, trainY, testY = train_test_split(
    X, Y, test_size=0.2, random_state=89, stratify=Y)

# Base classifiers: logistic regression (raised iteration cap so the
# solver converges) and k-nearest neighbours with default settings
logReg = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Custom transformer that selects the columns passed to its constructor.

    Parameters
    ----------
    feature_names : list of str
        Names of the columns to keep when transforming a DataFrame.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn (sklearn convention
        # is to return self so fit can be chained)
        return self

    def transform(self, X, y=None):
        # Required by TransformerMixin so the class works inside a
        # Pipeline (fit_transform); the original block was missing it.
        # Returns only the configured columns of X.
        return X[self.feature_names]
# Gradient-boosted trees configured with the best hyper-parameters
# found in the previous tuning step
gb_best = GBTRegressor(maxDepth=5, subsamplingRate=0.7)

# Chain label indexing, feature assembly, and the tuned GBT model
pipeline = Pipeline(stages=[target, featureAssembler, gb_best])

# Fit the full pipeline on the training split
final_model = pipeline.fit(df_train)
def evaluate_model(model, trainSet, testSet):
    """Score a fitted model on a train and a test set and return both RMSEs.

    Parameters
    ----------
    model : fitted pipeline/model exposing ``.transform(dataset)``
    trainSet, testSet : datasets to score (Spark DataFrames)

    Returns
    -------
    tuple
        ``(rmse_train, rmse_test)``.

    NOTE(review): relies on a module-level ``evaluator`` (a regression
    evaluator with ``evaluate`` and ``metricName``) defined elsewhere.
    """
    # Score the training data and measure RMSE
    pred_train = model.transform(trainSet)
    rmse_train = evaluator.evaluate(pred_train, {evaluator.metricName: "rmse"})
    # Score the test data and measure RMSE
    pred_test = model.transform(testSet)
    rmse_test = evaluator.evaluate(pred_test, {evaluator.metricName: "rmse"})
    # The original computed both metrics and silently discarded them
    # (implicit return None); return them so callers can actually use them.
    return rmse_train, rmse_test
# Baseline model: linear regression with all defaults
lr = LinearRegression()

# Same preprocessing stages as before, with the baseline model at the end
pipeline = Pipeline(stages=[target, featureAssembler, lr])

# Train the baseline on the training split
baseline = pipeline.fit(df_train)
# Target variable: index the string "Rating" column into a numeric "label"
target = StringIndexer(inputCol="Rating",
                       outputCol="label")

# Feature columns to feed to the assembler.
# NOTE(review): the original list literal was never closed (the snippet
# was truncated mid-list) — confirm no further columns are missing.
input_cols = [
    "YearOfRelease",
    "MovieID",
    "CustomerID",
    "RatingYear",
    "RatingMonth",
]
# 80/20 train/test split with a fixed seed for reproducibility
splits = df.randomSplit([0.8, 0.2], seed=747)
df_train, df_test = splits

# Hash the two high-cardinality ID columns into fixed-size vector columns
hasherM = FeatureHasher().setInputCols(["MovieID"]).setOutputCol("HashedMovieID")
hasherC = FeatureHasher().setInputCols(["CustomerID"]).setOutputCol("HashedCustomerID")

# Run both hashers over the full dataset
# (note: this happens after the split, so the hashed columns are only
# on `df`, not on df_train/df_test — order preserved from the original)
df = hasherM.transform(df)
df = hasherC.transform(df)