Plegas Gerasimos makispl

## bayesian_gm.py
# Find the optimum number of clusters: fit a Bayesian Gaussian mixture with
# 10 candidate components on the PCA scores, then show the learned component
# weights rounded to two decimals.
bgm = BayesianGaussianMixture(
    n_components=10,
    n_init=7,
    max_iter=1000,
).fit(pca_scores)
bgm.weights_.round(2)

## pca4clustering.py
# Load the training data; GAME_ID is converted with str so it stays text
# rather than being parsed as a number.
plays_df = pd.read_csv(
    '../data/interim/plays_17_18_19_pre_proc_train.csv',
    converters={'GAME_ID': str},
)

# Min-max scale a copy of the features selected for normalisation, keeping
# the fitted scaler around under its own name.
scaler = MinMaxScaler()
data_stnd = scaler.fit_transform(data.copy())

## kmeans.py
# Fit a 4-cluster KMeans model on the PCA scores and store the cluster index
# predicted for each row.
kmeans_pca = KMeans(n_clusters=4, init='random', random_state=1)
plays_km_df['km_cluster'] = kmeans_pca.fit_predict(pca_scores)

# Put the four PCA component columns side by side with the clustered plays;
# the plays index is reset so both frames line up row by row.
pca_columns = ['pca_1', 'pca_2', 'pca_3', 'pca_4']
pca_df = pd.DataFrame(data=pca_scores, columns=pca_columns)
plays_pca_km_df = pd.concat(
    [plays_km_df.reset_index(drop=True), pca_df],
    axis=1,
)

# visualize clusters

## locate_elbow.py
# Switch to a new dataframe instance
# for the k-means implementation
plays_km_df = plays_df.copy()

# Calculate the wcss for each candidate cluster count
# (presumably the within-cluster sum of squares -- confirm).
# range(1, max_clusters) tries k = 1 .. 10.
max_clusters = 11
wcss = list()

for k in range(1, max_clusters):
    # NOTE(review): only the estimator construction is visible here; the
    # fit and the step that appends to `wcss` appear to be cut off.
    kmeans = KMeans(n_clusters=k, init='random', random_state=1)

## plays_all_descriptive_stats.tsv

          
            
	OFF_RATING	AST_PCT	AST_TOV	TM_TOV_PCT	EFG_PCT	TS_PCT	POSS
count	62893.000000	62893.000000	62893.000000	62893.000000	62893.000000	62893.000000	62893.000000
mean	104.810268	0.138187	1.037766	10.242790	0.487106	0.519022	47.786987
std	23.251576	0.140800	1.579515	12.341289	0.281903	0.270811	21.401744
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	93.800000	0.000000	0.000000	0.000000	0.333000	0.375000	33.000000
50%	106.300000	0.111000	0.330000	8.300000	0.500000	0.532000	50.000000
75%	118.200000	0.208000	1.500000	15.400000	0.667000	0.680000	65.000000
max	350.000000	1.000000	17.000000	100.000000	1.500000	1.500000	121.000000

## plays_95th_descriptive_stats.tsv

          
            
	OFF_RATING	AST_PCT	AST_TOV	TM_TOV_PCT	EFG_PCT	TS_PCT	POSS
count	3056.000000	3056.000000	3056.000000	3056.000000	3056.000000	3056.000000	3056.000000
mean	104.781872	0.527235	2.929741	9.501309	0.472252	0.509817	48.632199
std	23.812656	0.142249	2.974245	8.700416	0.274866	0.267038	25.562033
min	22.200000	0.406000	0.000000	0.000000	0.000000	0.000000	1.000000
25%	93.000000	0.440000	0.000000	0.000000	0.333000	0.383000	28.000000
50%	105.900000	0.500000	2.310000	8.700000	0.500000	0.532000	53.000000
75%	118.200000	0.545000	4.000000	14.300000	0.625000	0.666000	70.000000
max	250.000000	1.000000	17.000000	66.700000	1.500000	1.500000	121.000000

## features_explained.tsv

          
Column	Definition	Group
OFF_RATING	Points scored per 100 possessions	group_1
AST_PCT	Ratio of player assists to teammate field goals made	group_1
AST_TOV	Ratio of assists to turnovers	group_1
TM_TOV_PCT	Ratio of turnovers to possessions	group_1
EFG_PCT	Weighted field goal percentage with 3-point field goals weighted 1.5 times 2-point field goals	group_1
TS_PCT	Shooting percentage accounting for free throws and three-point field goals	group_1
POSS	Possession	group_1
MIN	Minutes played	group_2
AST_RATIO	Assists per 100 possessions	group_2

## plays_df.csv
GAME_ID,GAME_DATE,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
0021700002,2017-10-17,1610612745,HOU,Houston,2772,Trevor Ariza,Trevor,F,,37:39,107.3,111.3,121.4,121.3,-14.1,-10.0,0.152,5.0,33.3,0.023,0.156,0.079,6.7,0.444,0.444,0.109,0.111,103.83,101.99,84.99,80,0.063
0021700002,2017-10-17,1610612745,HOU,Houston,201583,Ryan Anderson,Ryan,F,,33:17,117.8,127.4,119.4,119.7,-1.7,7.7,0.031,0.0,7.7,0.079,0.2,0.127,0.0,0.542,0.542,0.141,0.141,108.25,103.84,86.53,73,0.064
0021700002,2017-10-17,1610612745,HOU,Houston,203991,Clint Capela,Clint,C,,18:10,95.4,95.1,153.0,155.0,-57.6,-59.9,0.0,0.0,0.0,0.05,0.25,0.125,16.7,0.6,0.575,0.267,0.273,107.54,107.01,89.17,41,0.092
0021700002,2017-10-17,1610612745,HOU,Houston,201935,James Harden,James,G,,36:24,117.6,126.0,126.2,124.7,-8.6,

## mature_whiskies_price.py
# Show the ten highest prices among rows whose age is greater than 30.
display(
    df_num.loc[df_num['age'] > 30, 'price']
    .sort_values(ascending=False)
    .head(10)
)

## logistic_regression.py
# Work on a copy so the labeled dataframe itself is left untouched
df_no_nuls_2 = df_no_nuls.copy()

# Shuffle the rows by reordering them on a random permutation of the index
shuffled_rows = np.random.permutation(df_no_nuls_2.index)
df_no_nuls_2 = df_no_nuls_2.loc[shuffled_rows]

# First 80% of the shuffled rows form the training set, the rest the test
# set; only the test set gets its index reset (old index kept as a column).
split_at = int(len(df_no_nuls_2) * 0.8)
train = df_no_nuls_2.iloc[:split_at].copy()
test = df_no_nuls_2.iloc[split_at:].copy().reset_index()
	# find the optimum number of clusters
	bgm = BayesianGaussianMixture(n_components=10, n_init=7, max_iter=1000)
	bgm.fit(pca_scores)
	np.round(bgm.weights_, 2)
	# read in the training data
	plays_df = pd.read_csv('../data/interim/plays_17_18_19_pre_proc_train.csv',
	converters={'GAME_ID': lambda x: str(x)})

	# switch to the for-normalisation-features
	data_stnd = data.copy()

	# instantiate, fit, transform scaler
	scaler = MinMaxScaler()
	data_stnd = scaler.fit_transform(data_stnd)
	# Instantiate a KMeans model with 4 clusters, fit and predict cluster indices
	kmeans_pca = KMeans(n_clusters=4, init='random', random_state=1)
	kmeans_pca.fit_predict(pca_scores)
	plays_km_df['km_cluster'] = kmeans_pca.labels_

	# concat plays_km_df with the pca components
	plays_pca_km_df = pd.concat([plays_km_df.reset_index(drop=True), pd.DataFrame(
	data=pca_scores, columns=['pca_1', 'pca_2', 'pca_3', 'pca_4'])], axis=1)

	# visualize clusters
	# Switch to a new dataframe instance
	# for the k-means implementation
	plays_km_df = plays_df.copy()

	# Calculate the wcss
	max_clusters = 11
	wcss = list()

	for k in range(1, max_clusters):
	kmeans = KMeans(n_clusters=k, init='random', random_state=1)
	OFF_RATING	AST_PCT	AST_TOV	TM_TOV_PCT	EFG_PCT	TS_PCT	POSS
count	62893.000000	62893.000000	62893.000000	62893.000000	62893.000000	62893.000000	62893.000000
mean	104.810268	0.138187	1.037766	10.242790	0.487106	0.519022	47.786987
std	23.251576	0.140800	1.579515	12.341289	0.281903	0.270811	21.401744
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	93.800000	0.000000	0.000000	0.000000	0.333000	0.375000	33.000000
50%	106.300000	0.111000	0.330000	8.300000	0.500000	0.532000	50.000000
75%	118.200000	0.208000	1.500000	15.400000	0.667000	0.680000	65.000000
max	350.000000	1.000000	17.000000	100.000000	1.500000	1.500000	121.000000
Column	Definition	Group
OFF_RATING	Points scored per 100 possessions	group_1
AST_PCT	Ratio of player assists to teammate field goals made	group_1
AST_TOV	Ratio of assists to turnovers	group_1
TM_TOV_PCT	Ratio of turnovers to possessions	group_1
EFG_PCT	Weighted field goal percentage with 3-point field goals weighted 1.5 times 2-point field goals	group_1
TS_PCT	Shooting percentage accounting for free throws and three-point field goals	group_1
POSS	Possession	group_1
MIN	Minutes played	group_2
AST_RATIO	Assists per 100 possessions	group_2
	GAME_ID,GAME_DATE,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,NICKNAME,START_POSITION,COMMENT,MIN,E_OFF_RATING,OFF_RATING,E_DEF_RATING,DEF_RATING,E_NET_RATING,NET_RATING,AST_PCT,AST_TOV,AST_RATIO,OREB_PCT,DREB_PCT,REB_PCT,TM_TOV_PCT,EFG_PCT,TS_PCT,USG_PCT,E_USG_PCT,E_PACE,PACE,PACE_PER40,POSS,PIE
	0021700002,2017-10-17,1610612745,HOU,Houston,2772,Trevor Ariza,Trevor,F,,37:39,107.3,111.3,121.4,121.3,-14.1,-10.0,0.152,5.0,33.3,0.023,0.156,0.079,6.7,0.444,0.444,0.109,0.111,103.83,101.99,84.99,80,0.063
	0021700002,2017-10-17,1610612745,HOU,Houston,201583,Ryan Anderson,Ryan,F,,33:17,117.8,127.4,119.4,119.7,-1.7,7.7,0.031,0.0,7.7,0.079,0.2,0.127,0.0,0.542,0.542,0.141,0.141,108.25,103.84,86.53,73,0.064
	0021700002,2017-10-17,1610612745,HOU,Houston,203991,Clint Capela,Clint,C,,18:10,95.4,95.1,153.0,155.0,-57.6,-59.9,0.0,0.0,0.0,0.05,0.25,0.125,16.7,0.6,0.575,0.267,0.273,107.54,107.01,89.17,41,0.092
	0021700002,2017-10-17,1610612745,HOU,Houston,201935,James Harden,James,G,,36:24,117.6,126.0,126.2,124.7,-8.6,
	# Switch to a copy of the labeled dataframe
	df_no_nuls_2 = df_no_nuls.copy()

	# Randomise the df
	shuffled_rows = np.random.permutation(df_no_nuls_2.index)
	df_no_nuls_2 = df_no_nuls_2.loc[shuffled_rows]

	# Split to train and test datasets
	train = df_no_nuls_2.iloc[:int(df_no_nuls_2.shape[0]*0.8)].copy()
	test = df_no_nuls_2.iloc[int(df_no_nuls_2.shape[0]*0.8):].copy().reset_index()