from comet_ml import Experiment
import pandas as pd
from tensorflow.keras.models import Sequential

# Track the autoencoder run in Comet
experiment = Experiment(workspace='team-comet-ml', project_name='cc-clustering')
experiment.add_tag("autoencoder")
# Chain the encoder and decoder (defined below) and train the autoencoder to reconstruct the scaled data
autoencoder_network = Sequential([encoder_network, decoder_network])
autoencoder_network.compile(optimizer='adam', loss='mean_squared_error')
autoencoder_network.fit(creditcard_df_scaled, creditcard_df_scaled, batch_size=128, epochs=150, verbose=0)
# Save the 2-dimensional encodings for clustering
pred_df = pd.DataFrame(encoder_network.predict(creditcard_df_scaled), columns=['encoding1', 'encoding2'])
pred_df.to_csv('encoded_df.csv')
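One plausible way this connects to the clustering step: the saved 2-dimensional encodings become the dataframe passed to find_clusters (defined further down). A minimal sketch of that hand-off; the call below is an assumption for illustration, not part of the original gist.

# Hypothetical usage (assumed): cluster on the autoencoder's 2-d encodings
encoded_df = pd.read_csv('encoded_df.csv', index_col=0)
find_clusters(encoded_df, file='encoded')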
from tensorflow.keras.layers import Dense

# Decoder: expand the 2-dimensional encoding back out to the original 17 features
decoder_network = Sequential(
    [
        Dense(2, activation="selu", kernel_initializer='lecun_normal'),
        Dense(4, activation="selu", kernel_initializer='lecun_normal'),
        Dense(8, activation="selu", kernel_initializer='lecun_normal'),
        Dense(17, activation="selu", kernel_initializer='lecun_normal'),
    ]
)
# Encoder: compress the 17 scaled features down to a 2-dimensional encoding
encoder_network = Sequential(
    [
        Dense(17, activation="selu", kernel_initializer='lecun_normal'),
        Dense(8, activation="selu", kernel_initializer='lecun_normal'),
        Dense(4, activation="selu", kernel_initializer='lecun_normal'),
        Dense(2, activation="selu", kernel_initializer='lecun_normal'),
    ]
)
from sklearn.cluster import KMeans

def find_clusters(df: pd.DataFrame, file: str):
    """
    Run an experiment to find 3, 4, and 5 clusters.

    Parameters:
        df: The dataframe on which clustering will take place
        file: A string used for tags and identifying information for the experiment
    """
    for k in range(3, 6):
        file_string = file + "_" + str(k)
        experiment = Experiment(workspace='team-comet-ml', project_name='cc-clustering')
        # Assumed continuation (cut off in the original snippet): fit k-means for this k
        # and log the run to Comet
        kmeans = KMeans(n_clusters=k).fit(df)
        experiment.add_tag(file_string)
        experiment.log_metric("inertia", kmeans.inertia_)
        experiment.end()
from numpy import save
from sklearn.preprocessing import StandardScaler

# Drop the customer ID column and persist the imputed dataframe
cc_df.drop(columns='CUST_ID', inplace=True)
cc_df.to_csv('cc_df_imputed.csv')

# Since k-means uses Euclidean distance, it's a good idea to scale the data
scaler = StandardScaler()
creditcard_df_scaled = scaler.fit_transform(cc_df)
save('cc-data-scaled.npy', creditcard_df_scaled)

data_artifacts = {
    'cc_df': {'df': 'cc_df_imputed.csv',
              'type': 'data-model',
              'alias': ['raw-features'],
              },
}
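The data_artifacts dict reads like a manifest of files to version with Comet. Below is a minimal sketch of consuming such a manifest with Comet's Artifact API; the loop and the experiment variable it logs to are assumptions, not code from the gist.

from comet_ml import Artifact

# Assumed helper loop: turn each manifest entry into a Comet Artifact and log it
for name, meta in data_artifacts.items():
    artifact = Artifact(name=name, artifact_type=meta['type'], aliases=meta['alias'])
    artifact.add(meta['df'])            # attach the local CSV listed in the manifest
    experiment.log_artifact(artifact)   # upload it alongside the current experiment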
# Impute missing MINIMUM_PAYMENTS with the median and drop rows with no CREDIT_LIMIT
cc_df.loc[cc_df['MINIMUM_PAYMENTS'].isnull(), 'MINIMUM_PAYMENTS'] = cc_df['MINIMUM_PAYMENTS'].median()
cc_df = cc_df[cc_df['CREDIT_LIMIT'].notnull()]

# Box plot of CREDIT_LIMIT for the rows where MINIMUM_PAYMENTS is present
cc_df[cc_df['MINIMUM_PAYMENTS'].notnull()]['CREDIT_LIMIT'].plot(kind='box')
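The report object used in the next line is not created in any of these snippets; its show_notebook(w, h, scale) signature matches sweetviz, so here is a minimal sketch under that assumption.

import sweetviz as sv

# Assumption: the EDA report comes from sweetviz's analyze()
report = sv.analyze(cc_df)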
report.show_notebook(w=900, h=500, scale=0.8)