# makispl/logistic_regression.py

## logistic_regression.py
# One-vs-rest logistic regression on the labelled clusters: shuffle the rows,
# hold out the last 20%, fit one binary model per cluster and label each test
# row with the cluster whose model is most confident.

# Work on a private copy so the labelled frame itself is never modified
df_no_nuls_2 = df_no_nuls.copy()

# Shuffle by drawing a random ordering of the index labels
shuffled_rows = np.random.permutation(df_no_nuls_2.index)
df_no_nuls_2 = df_no_nuls_2.loc[shuffled_rows]

# The first 80% of the shuffled rows train the models, the rest test them.
# The test index is reset so that it lines up with the 0..n-1 index of the
# probability table assembled below.
n_train = int(len(df_no_nuls_2) * 0.8)
train = df_no_nuls_2.iloc[:n_train].copy()
test = df_no_nuls_2.iloc[n_train:].copy().reset_index()

# Numerical columns fed to the classifiers
feature_cols = ['rating', 'alcohol', 'age']
train_data = train[feature_cols].copy()
test_data = test[feature_cols].copy()

# Cluster labels seen during training, in ascending order
unique_clusters = train['cluster'].unique()
unique_clusters.sort()

# Fit one "this cluster vs. everything else" model per label
models = {}
for cluster in unique_clusters:
    is_member = train['cluster'] == cluster
    models[cluster] = LogisticRegression().fit(train_data, is_member)

# One column per cluster holding the probability of the positive (True) class
testing_probs = pd.DataFrame(columns=unique_clusters)
for cluster, fitted in models.items():
    testing_probs[cluster] = fitted.predict_proba(test_data)[:, 1]

# Each test row is assigned the cluster with the highest probability
test['pred_cluster'] = testing_probs.idxmax(axis=1)

# Accuracy: share of test rows whose predicted cluster matches the true one
n_correct = (test['pred_cluster'] == test['cluster']).sum()
accuracy = n_correct / test.shape[0]
	# One-vs-rest logistic regression over the labelled clusters: shuffle the
	# frame, split it 80/20, fit one binary classifier per cluster and score
	# the combined prediction on the held-out rows.
	# NOTE(review): this tab-indented section repeats the pipeline that appears
	# earlier in the file; its enclosing scope is not visible from here, so
	# confirm whether it belongs inside a function or is a leftover paste.

	# Switch to a copy of the labeled dataframe
	df_no_nuls_2 = df_no_nuls.copy()

	# Randomise the df
	shuffled_rows = np.random.permutation(df_no_nuls_2.index)
	df_no_nuls_2 = df_no_nuls_2.loc[shuffled_rows]

	# Split to train (first 80% of the shuffled rows) and test datasets.
	# The test index is reset so it matches the 0..n-1 index that the
	# probability table gets when its columns are filled from arrays.
	train = df_no_nuls_2.iloc[:int(df_no_nuls_2.shape[0]*0.8)].copy()
	test = df_no_nuls_2.iloc[int(df_no_nuls_2.shape[0]*0.8):].copy().reset_index()

	# Subset to the numerical columns we are about to use on the ML algorithm
	train_data = train[['rating', 'alcohol', 'age']].copy()
	test_data = test[['rating', 'alcohol', 'age']].copy()

	# List the unique clusters, sorted in place
	unique_clusters = train['cluster'].unique()
	unique_clusters.sort()
	models = {}

	# Train each binary classification model: the target is True for rows of
	# the current cluster and False for every other row
	for cluster in unique_clusters:
		X = train[['rating', 'alcohol', 'age']].copy()
		y = train['cluster'] == cluster

		model = LogisticRegression()
		model.fit(X, y)
		models[cluster] = model

	testing_probs = pd.DataFrame(columns=unique_clusters)

	# Test the models: keep, per cluster, the probability of the True class
	for cluster in unique_clusters:
		X_test = test[['rating', 'alcohol', 'age']].copy()
		testing_probs[cluster] = models[cluster].predict_proba(X_test)[:,1]

	# Label the new data with the cluster whose model gives the highest probability
	test['pred_cluster'] = testing_probs.idxmax(axis=1)

	# Evaluate the model: fraction of test rows predicted correctly
	accuracy = (test['cluster'] == test['pred_cluster']).sum() / test.shape[0]