Skip to content

Instantly share code, notes, and snippets.

View liannewriting's full-sized avatar

Lianne & Justin @ Just into Data liannewriting

View GitHub Profile
@liannewriting
liannewriting / gist:db702ab91861dc77280deabcbf79b4a1
Last active January 17, 2020 16:11
data cleaning - Exploratory analysis
# Structural overview of the DataFrame: dimensions, per-column dtypes, column labels.
df.shape
df.dtypes
df.columns
# Inspect one selection at a time; `column` is presumably a column label set by the reader.
selected = df[column]
selected.value_counts(dropna=False)  # string: count of each value, NaN included
selected.describe()  # numeric: summary statistics
@liannewriting
liannewriting / load_the_data.py
Created January 18, 2020 22:57
sports_betting202001
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge
# Read the game results, skipping the file's first row and supplying our own
# column names, then parse the YYYY-MM-DD date strings into datetime values.
game_columns = ['date', 'visitor', 'visitor_goals', 'home', 'home_goals']
df = (
    pd.read_csv('hockey_games.csv', names=game_columns, skiprows=1)
    .assign(date=lambda games: pd.to_datetime(games['date'], format='%Y-%m-%d'))
)
@liannewriting
liannewriting / add_new_features.py
Last active August 8, 2021 20:30
sports_betting202001
# Home goals minus visitor goals: positive when the home side scored more.
df['goal_difference'] = df['home_goals'].sub(df['visitor_goals'])
@liannewriting
liannewriting / add_new_features_home.py
Created January 18, 2020 23:19
sports_betting202001
# 0/1 indicators for the home team's result; a goal difference of exactly 0
# sets neither flag.
home_margin = df['goal_difference']
df['home_win'] = (home_margin > 0).astype(int)
df['home_loss'] = (home_margin < 0).astype(int)
@liannewriting
liannewriting / transform_the_data.py
Created January 18, 2020 23:20
sports_betting202001
# One-hot encode the team names on each side of the game as int64 indicator
# columns (one column per distinct team name in that column).
df_visitor, df_home = (
    pd.get_dummies(df[side], dtype=np.int64) for side in ('visitor', 'home')
)
@liannewriting
liannewriting / get_final_dataset.py
Last active January 20, 2020 16:44
sports_betting202001
# Subtract the visitor indicators from the home indicators (home - visitor).
# Assuming df_home / df_visitor are the one-hot team frames, each row ends up
# with +1 for the home team, -1 for the visiting team and 0 elsewhere.
# fill_value=0 treats a team that is missing from one of the two frames
# (e.g. it has only appeared as a visitor so far) as an all-zero indicator;
# without it that team's column would be entirely NaN after alignment.
df_model = df_home.sub(df_visitor, fill_value=0)
# Attach the regression target alongside the team indicator columns.
df_model['goal_difference'] = df['goal_difference']
@liannewriting
liannewriting / fit_ridge_model.py
Last active January 24, 2020 14:10
sports_betting202001
# Alias the modelling frame as the training set (same object, not a copy);
# the rename is optional and purely for readability.
df_train = df_model
# Target is the goal difference; features are every other column.
y = df_train['goal_difference']
X = df_train.drop(columns=['goal_difference'])
# Ridge regression with a small penalty; fit() returns the fitted estimator.
lr = Ridge(alpha=0.001).fit(X, y)
@liannewriting
liannewriting / display_result.py
Created January 18, 2020 23:23
sports_betting202001
# Pair each feature column name with its fitted coefficient, one row per column.
ratings_by_team = {'team': X.columns, 'rating': lr.coef_}
df_ratings = pd.DataFrame(ratings_by_team)
df_ratings
@liannewriting
liannewriting / read_explore_data.py
Last active November 9, 2021 17:11
data_cleaning_202001
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
# Apply the built-in 'ggplot' style sheet to all subsequent matplotlib figures.
matplotlib.style.use('ggplot')
from matplotlib.pyplot import figure
@liannewriting
liannewriting / missing_data_find1.py
Last active January 21, 2020 15:41
data_cleaning_202001
# Visualise missing values in the first 30 columns as a two-colour heatmap.
cols = df.columns[:30]  # first 30 columns
colours = ['#000099', '#ffff00']  # specify the colours - yellow is missing. blue is not missing.
# Pin the colour scale to [0, 1] so that False (present) is always blue and
# True (missing) is always yellow. Without explicit limits the scale is
# inferred from the data, so a selection that is entirely missing would be
# drawn in blue.
sns.heatmap(df[cols].isnull(), cmap=sns.color_palette(colours), vmin=0, vmax=1)