patrickbrus

## CAM.py
def get_class_activation_map(model, img):
    '''
    Compute the class activation map (CAM) of an input image for a trained model.

    Inputs:
        1) model (tensorflow model) : trained model
        2) img (numpy array of shape (224, 224, 3)) : input image

    NOTE(review): only the signature and this docstring are visible in this
    excerpt; the return value is not shown here and should be documented
    against the full implementation.
    '''

    # expand the dimensions so the image matches the input size the network accepts

## imputer.py
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Two-stage split of data_df_converted (the dataframe containing all features).
# Stage 1: hold out 10% of all rows as the test set.
df_train, df_test = train_test_split(
    data_df_converted,
    test_size=0.10,
    random_state=42,
)
# Stage 2: take 30% of the remaining rows as the validation set, which leaves
# roughly 63% train / 27% validation / 10% test of the original rows.
df_train, df_val = train_test_split(
    df_train,
    test_size=0.3,
    random_state=42,
)

# Iterative imputer model: up to 20 iterations, fixed seed, verbose output on.
imputer = IterativeImputer(
    max_iter=20,
    random_state=42,
    verbose=1,
)

## upload_to_S3.py
import boto3
import sagemaker
import os


# region name reported by a default boto3 session
region = boto3.Session().region_name

# SageMaker session and the execution role returned by the SageMaker SDK
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

## train_linear_regression.py
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"
script_path = 'source/train_linear_regression.py'

sklearn_linear_regression = SKLearn(
    entry_point=script_path,
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c4.xlarge",
    role=role,

## decison_tree_and_bayes_optimizer.py
# Define a model
tree = DecisionTreeRegressor(criterion="mse")

# Create the Bayesian optimization object
opt = BayesSearchCV(
    tree,
    {
        "max_depth": (5, 15),
        "splitter": ["best", "random"],
    },

## random_forest_and_bayes_optimizer.py
# Define a model
forest = RandomForestRegressor(criterion="mse", n_jobs=n_jobs)

# Create the Bayesian optimization object
opt = BayesSearchCV(
    forest,
    {
        "max_depth": (5, 15),
        "n_estimators": (10, 50),
        "bootstrap": [True, False]

## xgboost_sagemaker.py
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import HyperparameterTuner
from sagemaker.inputs import TrainingInput

# Define exploration boundaries (default suggested values from Amazon SageMaker Documentation)
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0, 1000, scaling_type="Auto"),
    'eta': ContinuousParameter(0.1, 0.5, scaling_type='Logarithmic'),
    'gamma':ContinuousParameter(0, 5, scaling_type='Auto'),

## imports_plus_pandas_read.py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# read in data from csv file
# A forward slash is used as the path separator so the relative path resolves
# on Windows as well as on Linux/macOS; a backslash is only treated as a
# separator on Windows.
df = pd.read_csv("data/healthcare-dataset-stroke-data.csv")

print(df.head())  # helpful as first dive into data and features

## EDA_hists_and_value_counts.py
# Boolean mask over the columns: True where the column dtype is ``object``.
# The builtin ``object`` is used because the ``np.object`` alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24 (it raises AttributeError there).
is_object_column = (df.dtypes == object).values

# create a list of categorical features (object-dtype columns) to loop over
list_categorical_features = df.columns[is_object_column].to_list()

# create a list of numerical features (all non-object columns) to loop over
list_numerical_features = df.columns[np.logical_not(is_object_column)].to_list()

# plot one histogram for each numerical feature
for numerical_feature in list_numerical_features:
    plt.title(f"Histogram of {numerical_feature}")
    plt.hist(df[numerical_feature])

## EDA_seaborn_part1.py
# Stacked histplots coloured by the "stroke" class, one per feature whose
# influence on stroke should be checked: age, gender and smoking_status.
for stroke_feature in ("age", "gender", "smoking_status"):
    sns.histplot(data=df, x=stroke_feature, hue="stroke", multiple="stack")

# here: can be interesting to also print the percentage of stroke patients of each category
	def get_class_activation_map(model, img):
	'''
	Compute the class activation map (CAM) of an input image for a trained model.

	Inputs:
	1) model (tensorflow model) : trained model
	2) img (numpy array of shape (224, 224, 3)) : input image

	NOTE(review): only the signature and this docstring are visible in this
	excerpt; the return value is not shown here and should be documented
	against the full implementation.
	'''

	# expand the dimensions so the image matches the input size the network accepts
	from sklearn.model_selection import train_test_split
	from sklearn.experimental import enable_iterative_imputer
	from sklearn.impute import IterativeImputer
	# Two-stage split of data_df_converted (the dataframe containing all features).
	# Stage 1: hold out 10% of all rows as the test set.
	df_train, df_test = train_test_split(
		data_df_converted,
		test_size=0.10,
		random_state=42,
	)
	# Stage 2: take 30% of the remaining rows as the validation set, which leaves
	# roughly 63% train / 27% validation / 10% test of the original rows.
	df_train, df_val = train_test_split(
		df_train,
		test_size=0.3,
		random_state=42,
	)

	# Iterative imputer model: up to 20 iterations, fixed seed, verbose output on.
	imputer = IterativeImputer(
		max_iter=20,
		random_state=42,
		verbose=1,
	)
	import boto3
	import sagemaker
	import os


	# region name reported by a default boto3 session
	region = boto3.Session().region_name

	# SageMaker session and the execution role returned by the SageMaker SDK
	sagemaker_session = sagemaker.Session()
	role = sagemaker.get_execution_role()
	from sagemaker.sklearn.estimator import SKLearn

	FRAMEWORK_VERSION = "0.23-1"
	script_path = 'source/train_linear_regression.py'

	sklearn_linear_regression = SKLearn(
	entry_point=script_path,
	framework_version=FRAMEWORK_VERSION,
	instance_type="ml.c4.xlarge",
	role=role,
	# Define a model
	tree = DecisionTreeRegressor(criterion="mse")

	# Create the Bayesian optimization object
	opt = BayesSearchCV(
	tree,
	{
	"max_depth": (5, 15),
	"splitter": ["best", "random"],
	},
	# Define a model
	forest = RandomForestRegressor(criterion="mse", n_jobs=n_jobs)

	# Create the Bayesian optimization object
	opt = BayesSearchCV(
	forest,
	{
	"max_depth": (5, 15),
	"n_estimators": (10, 50),
	"bootstrap": [True, False]
	from sagemaker.tuner import IntegerParameter
	from sagemaker.tuner import ContinuousParameter
	from sagemaker.tuner import HyperparameterTuner
	from sagemaker.inputs import TrainingInput

	# Define exploration boundaries (default suggested values from Amazon SageMaker Documentation)
	hyperparameter_ranges = {
	'alpha': ContinuousParameter(0, 1000, scaling_type="Auto"),
	'eta': ContinuousParameter(0.1, 0.5, scaling_type='Logarithmic'),
	'gamma':ContinuousParameter(0, 5, scaling_type='Auto'),
	import numpy as np
	import matplotlib.pyplot as plt
	import pandas as pd
	import seaborn as sns

	# read in data from csv file
	# A forward slash is used as the path separator so the relative path resolves
	# on Windows as well as on Linux/macOS; a backslash is only treated as a
	# separator on Windows.
	df = pd.read_csv("data/healthcare-dataset-stroke-data.csv")

	print(df.head())  # helpful as first dive into data and features
	# Boolean mask over the columns: True where the column dtype is ``object``.
	# The builtin ``object`` is used because the ``np.object`` alias was deprecated
	# in NumPy 1.20 and removed in NumPy 1.24 (it raises AttributeError there).
	is_object_column = (df.dtypes == object).values

	# create a list of categorical features (object-dtype columns) to loop over
	list_categorical_features = df.columns[is_object_column].to_list()

	# create a list of numerical features (all non-object columns) to loop over
	list_numerical_features = df.columns[np.logical_not(is_object_column)].to_list()

	# plot one histogram for each numerical feature
	for numerical_feature in list_numerical_features:
		# the loop body is indented so both calls run once per feature
		plt.title(f"Histogram of {numerical_feature}")
		plt.hist(df[numerical_feature])
	# Stacked histplots coloured by the "stroke" class, one per feature whose
	# influence on stroke should be checked: age, gender and smoking_status.
	for stroke_feature in ("age", "gender", "smoking_status"):
		sns.histplot(data=df, x=stroke_feature, hue="stroke", multiple="stack")

	# here: can be interesting to also print the percentage of stroke patients of each category