Lianne & Justin @ Just into Data liannewriting

## gist:db702ab91861dc77280deabcbf79b4a1
# Quick structural overview of the dataframe. These are bare expressions, so
# they only show output when evaluated interactively (REPL / notebook cell).
df.shape  # (number of rows, number of columns)
df.dtypes  # dtype of every column
df.columns  # column labels
# NOTE(review): `column` is not defined in this snippet - presumably a
# placeholder for the name of the column being inspected.
df[column].value_counts(dropna=False) # string column: frequency of each value, NaN included
df[column].describe() # numeric column: summary statistics

## load_the_data.py
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Ridge

# read the games file: skip its first row and apply our own column names.
df = pd.read_csv(
    'hockey_games.csv',
    names=['date', 'visitor', 'visitor_goals', 'home', 'home_goals'],
    skiprows=1,
)

# parse the YYYY-MM-DD strings of the date column into datetimes.
df['date'] = pd.to_datetime(arg=df['date'], format='%Y-%m-%d')

## add_new_features.py
# margin of each game from the home team's point of view (home goals minus visitor goals).
df['goal_difference'] = df['home_goals'].sub(df['visitor_goals'])

## add_new_features_home.py
# 0/1 indicator columns for the home team's result:
# home_win is 1 when the margin is positive, home_loss is 1 when it is negative.
df['home_win'] = np.where(df['goal_difference'].gt(0), 1, 0)
df['home_loss'] = np.where(df['goal_difference'].lt(0), 1, 0)

## transform_the_data.py
# one-hot encode the visiting and the home team of every game
# (one int64 0/1 column per distinct team name).
df_visitor, df_home = (
    pd.get_dummies(df[side], dtype=np.int64) for side in ('visitor', 'home')
)

## get_final_dataset.py
# subtract the visitor indicators from the home indicators: each row gets +1 in
# the home team's column and -1 in the visiting team's column.
# fill_value=0 treats a team that appears on only one side (home-only or
# visitor-only) as 0 on the other side; without it, column alignment would
# turn that team's whole column into NaN.
df_model = df_home.sub(df_visitor, fill_value=0)
# target column: goal difference of each game, taken from the source dataframe.
df_model['goal_difference'] = df['goal_difference']

## fit_ridge_model.py
# purely cosmetic alias: refer to the modelling frame as the training set.
df_train = df_model

# target is the goal difference; the features are every other column.
y = df_train['goal_difference']
X = df_train.drop(columns=['goal_difference'])

# ridge regression with alpha=0.001; fit() returns the fitted estimator.
lr = Ridge(alpha=0.001).fit(X, y)

## display_result.py
# pair every feature column name with its fitted coefficient.
df_ratings = pd.DataFrame(dict(team=X.columns, rating=lr.coef_))
df_ratings

## read_explore_data.py
# import packages
# pandas / numpy for the data handling, seaborn / matplotlib for the plots.
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
# switch matplotlib to its 'ggplot' style sheet for the figures drawn afterwards.
plt.style.use('ggplot')
# NOTE(review): `mlab`, `matplotlib` and `figure` are not referenced by the
# snippets visible here - confirm they are needed before keeping them.
from matplotlib.pyplot import figure

## missing_data_find1.py
# limit the plot to the first 30 columns.
cols = df.columns[:30]
# two-colour palette: blue (#000099) marks present values, yellow (#ffff00) marks missing ones.
colours = ['#000099', '#ffff00']
# heatmap of the boolean "is missing" mask of the selected columns.
sns.heatmap(data=df[cols].isna(), cmap=sns.color_palette(colours))
	# Quick structural overview of the dataframe. These are bare expressions, so
	# they only show output when evaluated interactively (REPL / notebook cell).
	df.shape  # (number of rows, number of columns)
	df.dtypes  # dtype of every column
	df.columns  # column labels
	# NOTE(review): `column` is not defined in this snippet - presumably a
	# placeholder for the name of the column being inspected.
	df[column].value_counts(dropna=False) # string column: frequency of each value, NaN included
	df[column].describe() # numeric column: summary statistics
	import pandas as pd
	import numpy as np
	from sklearn.metrics import accuracy_score
	from sklearn.linear_model import Ridge

	# read the games file: skip its first row and apply our own column names.
	df = pd.read_csv(
		'hockey_games.csv',
		names=['date', 'visitor', 'visitor_goals', 'home', 'home_goals'],
		skiprows=1,
	)

	# parse the YYYY-MM-DD strings of the date column into datetimes.
	df['date'] = pd.to_datetime(arg=df['date'], format='%Y-%m-%d')
	# derive the home-team margin first: the win/loss flags below read it, and the
	# dataframe loaded from the CSV only has the date, team and goals columns.
	df['goal_difference'] = df['home_goals'] - df['visitor_goals']
	# create new variables to show home team win or loss result
	# (1 when the margin is positive / negative respectively, 0 otherwise).
	df['home_win'] = np.where(df['goal_difference'] > 0, 1, 0)
	df['home_loss'] = np.where(df['goal_difference'] < 0, 1, 0)
	# one-hot encode the visiting and the home team of every game
	# (one int64 0/1 column per distinct team name).
	df_visitor, df_home = (
		pd.get_dummies(df[side], dtype=np.int64) for side in ('visitor', 'home')
	)
	# subtract the visitor indicators from the home indicators: each row gets +1 in
	# the home team's column and -1 in the visiting team's column.
	# fill_value=0 treats a team that appears on only one side (home-only or
	# visitor-only) as 0 on the other side; without it, column alignment would
	# turn that team's whole column into NaN.
	df_model = df_home.sub(df_visitor, fill_value=0)
	# target column: goal difference of each game, taken from the source dataframe.
	df_model['goal_difference'] = df['goal_difference']
	# purely cosmetic alias: refer to the modelling frame as the training set.
	df_train = df_model

	# target is the goal difference; the features are every other column.
	y = df_train['goal_difference']
	X = df_train.drop(columns=['goal_difference'])

	# ridge regression with alpha=0.001; fit() returns the fitted estimator.
	lr = Ridge(alpha=0.001).fit(X, y)
	# pair every feature column name with its fitted coefficient.
	df_ratings = pd.DataFrame(dict(team=X.columns, rating=lr.coef_))
	df_ratings
	# import packages
	# pandas / numpy for the data handling, seaborn / matplotlib for the plots.
	import pandas as pd
	import numpy as np
	import seaborn as sns

	import matplotlib.pyplot as plt
	import matplotlib.mlab as mlab
	import matplotlib
	# switch matplotlib to its 'ggplot' style sheet for the figures drawn afterwards.
	plt.style.use('ggplot')
	# NOTE(review): `mlab`, `matplotlib` and `figure` are not referenced by the
	# snippets visible here - confirm they are needed before keeping them.
	from matplotlib.pyplot import figure
	# limit the plot to the first 30 columns.
	cols = df.columns[:30]
	# two-colour palette: blue (#000099) marks present values, yellow (#ffff00) marks missing ones.
	colours = ['#000099', '#ffff00']
	# heatmap of the boolean "is missing" mask of the selected columns.
	sns.heatmap(data=df[cols].isna(), cmap=sns.color_palette(colours))