Harpreet Sahota harpreetsahota204

## autoencoder.py
# Open a Comet experiment for this training run and tag it "autoencoder".
experiment = Experiment(
    project_name='cc-clustering',
    workspace='team-comet-ml',
)
experiment.add_tag("autoencoder")

# Chain the encoder and decoder into a single model and train it to
# reproduce its own input: the scaled data is both the features and the target.
autoencoder_network = Sequential([encoder_network, decoder_network])
autoencoder_network.compile(loss='mean_squared_error', optimizer='adam')
autoencoder_network.fit(
    creditcard_df_scaled,
    creditcard_df_scaled,
    epochs=150,
    batch_size=128,
    verbose=0,
)

# Run the scaled data through the encoder alone and save its two-column output.
pred_df = pd.DataFrame(
    encoder_network.predict(creditcard_df_scaled),
    columns=['encoding1', 'encoding2'],
)
pred_df.to_csv('encoded_df.csv')

## decoder.py
# Decoder stack: four selu-activated Dense layers whose widths grow
# 2 -> 4 -> 8 -> 17, each initialised with lecun_normal.
# NOTE(review): 17 is presumably the number of input features - confirm.
decoder_network = Sequential(
    [
        Dense(width, activation="selu", kernel_initializer='lecun_normal')
        for width in (2, 4, 8, 17)
    ]
)

## encoder.py
# Encoder stack: four selu-activated Dense layers whose widths shrink
# 17 -> 8 -> 4 -> 2, each initialised with lecun_normal. The final
# 2-unit layer produces the two-dimensional encoding.
encoder_network = Sequential(
    [
        Dense(width, activation="selu", kernel_initializer='lecun_normal')
        for width in (17, 8, 4, 2)
    ]
)

## find-clusters.py
def find_clusters(df:pd.DataFrame, file:str):
"""
Run an experiment to find 3, 4, and 5 clusters.
Parameters:
df: The dataframe on which clustering will take place
file: A string to help add tags, and identifying information for the experiment
"""
for k in range(3,6,1):
file_string = file + "_" + str(k)
experiment = Experiment(workspace='team-comet-ml', project_name='cc-clustering')

## comet-artifact-logger.py
# Remove the CUST_ID column in place, then write the remaining frame to CSV.
cc_df.drop(columns=['CUST_ID'], inplace=True)
cc_df.to_csv('cc_df_imputed.csv')

# k-means uses Euclidean distance, so it is a good idea to standardise the
# features first; the scaled array is also saved to disk.
scaler = StandardScaler()
creditcard_df_scaled = scaler.fit(cc_df).transform(cc_df)
save('cc-data-scaled.npy', creditcard_df_scaled)

data_artifacts = {
'cc_df':{'df':'cc_df_imputed.csv',

## comet-artifact-logger.py
# Remove the CUST_ID column in place, then write the remaining frame to CSV.
cc_df.drop(columns=['CUST_ID'], inplace=True)
cc_df.to_csv('cc_df_imputed.csv')
# k-means uses Euclidean distance, so it is a good idea to standardise the
# features first; the scaled array is also saved to disk.
scaler = StandardScaler()
creditcard_df_scaled = scaler.fit(cc_df).transform(cc_df)
save('cc-data-scaled.npy', creditcard_df_scaled)
data_artifacts = {
'cc_df':{'df':'cc_df_imputed.csv',
'type':'data-model',
'alias':['raw-features'],

## imput-missing.py
# Fill missing MINIMUM_PAYMENTS entries with the column median. The boolean
# mask from isnull() is used directly; comparing it to True is redundant.
cc_df.loc[cc_df['MINIMUM_PAYMENTS'].isnull(), 'MINIMUM_PAYMENTS'] = cc_df['MINIMUM_PAYMENTS'].median()
# Keep only the rows that have a CREDIT_LIMIT value.
cc_df = cc_df[cc_df['CREDIT_LIMIT'].notnull()]

## plot-credit-limit2.py
# Box plot of CREDIT_LIMIT restricted to rows where MINIMUM_PAYMENTS is present.
cc_df[cc_df['MINIMUM_PAYMENTS'].notnull()]['CREDIT_LIMIT'].plot(kind='box')

## plot-credit-limit.py
# Box plot of CREDIT_LIMIT restricted to rows where MINIMUM_PAYMENTS is missing.
# The boolean-mask subscript must be closed before selecting the column.
cc_df[cc_df['MINIMUM_PAYMENTS'].isnull()]['CREDIT_LIMIT'].plot(kind='box')

## show-report.py
# Show the report with w=900, h=500 and a 0.8 scale factor.
# NOTE(review): w/h are presumably width and height in pixels - confirm
# against the reporting library's documentation.
report.show_notebook(
    w=900,
    h=500,
    scale=0.8,
)
	# Open a Comet experiment for this training run and tag it "autoencoder".
	experiment = Experiment(
	    project_name='cc-clustering',
	    workspace='team-comet-ml',
	)
	experiment.add_tag("autoencoder")

	# Chain the encoder and decoder into a single model and train it to
	# reproduce its own input: the scaled data is both the features and the target.
	autoencoder_network = Sequential([encoder_network, decoder_network])
	autoencoder_network.compile(loss='mean_squared_error', optimizer='adam')
	autoencoder_network.fit(
	    creditcard_df_scaled,
	    creditcard_df_scaled,
	    epochs=150,
	    batch_size=128,
	    verbose=0,
	)

	# Run the scaled data through the encoder alone and save its two-column output.
	pred_df = pd.DataFrame(
	    encoder_network.predict(creditcard_df_scaled),
	    columns=['encoding1', 'encoding2'],
	)
	pred_df.to_csv('encoded_df.csv')
	# Decoder stack: four selu-activated Dense layers whose widths grow
	# 2 -> 4 -> 8 -> 17, each initialised with lecun_normal.
	# NOTE(review): 17 is presumably the number of input features - confirm.
	decoder_network = Sequential(
	    [
	        Dense(width, activation="selu", kernel_initializer='lecun_normal')
	        for width in (2, 4, 8, 17)
	    ]
	)
	# Encoder stack: four selu-activated Dense layers whose widths shrink
	# 17 -> 8 -> 4 -> 2, each initialised with lecun_normal. The final
	# 2-unit layer produces the two-dimensional encoding.
	encoder_network = Sequential(
	    [
	        Dense(width, activation="selu", kernel_initializer='lecun_normal')
	        for width in (17, 8, 4, 2)
	    ]
	)
	def find_clusters(df:pd.DataFrame, file:str):
	"""
	Run an experiment to find 3, 4, and 5 clusters.
	Parameters:
	df: The dataframe on which clustering will take place
	file: A string to help add tags, and identifying information for the experiment
	"""
	for k in range(3,6,1):
	file_string = file + "_" + str(k)
	experiment = Experiment(workspace='team-comet-ml', project_name='cc-clustering')
	# Remove the CUST_ID column in place, then write the remaining frame to CSV.
	cc_df.drop(columns=['CUST_ID'], inplace=True)
	cc_df.to_csv('cc_df_imputed.csv')

	# k-means uses Euclidean distance, so it is a good idea to standardise the
	# features first; the scaled array is also saved to disk.
	scaler = StandardScaler()
	creditcard_df_scaled = scaler.fit(cc_df).transform(cc_df)
	save('cc-data-scaled.npy', creditcard_df_scaled)

	data_artifacts = {
	'cc_df':{'df':'cc_df_imputed.csv',
	# Fill missing MINIMUM_PAYMENTS entries with the column median. The boolean
	# mask from isnull() is used directly; comparing it to True is redundant.
	cc_df.loc[cc_df['MINIMUM_PAYMENTS'].isnull(), 'MINIMUM_PAYMENTS'] = cc_df['MINIMUM_PAYMENTS'].median()
	# Keep only the rows that have a CREDIT_LIMIT value.
	cc_df = cc_df[cc_df['CREDIT_LIMIT'].notnull()]