Lianne & Justin @ Just into Data liannewriting

## gist:db702ab91861dc77280deabcbf79b4a1
# quick structural overview of df: (rows, columns) tuple, dtype of each column, column labels
df.shape
df.dtypes
df.columns
# NOTE(review): `column` is not defined in this snippet - presumably a placeholder for a column label
df[column].value_counts(dropna=False) # string column: count of each distinct value, NaN included
df[column].describe() # numeric column: summary statistics

## add_new_features_home.py
# binary result flags for the home team, taken from the sign of goal_difference:
# positive -> home_win = 1, negative -> home_loss = 1, exactly zero -> both stay 0
df['home_win'] = np.where(df['goal_difference'].gt(0), 1, 0)
df['home_loss'] = np.where(df['goal_difference'].lt(0), 1, 0)

## transform_the_data.py
# one-hot encode the visiting and the home team column as int64 indicator frames
df_visitor, df_home = (
    pd.get_dummies(df[side], dtype=np.int64) for side in ('visitor', 'home')
)

## display_result.py
# pair every column label of X with the matching entry of lr.coef_
# NOTE(review): presumably lr is a fitted linear model with one coefficient per column of X - confirm
df_ratings = pd.DataFrame(dict(team=X.columns, rating=lr.coef_))
df_ratings

## get_final_dataset.py
# home indicators minus visitor indicators (element-wise, aligned on index and columns)
# NOTE(review): presumably both frames hold 0/1 team indicators with the same columns,
# giving +1 for the home team and -1 for the visiting team in each row - confirm
df_model = df_home - df_visitor
# append goal_difference from the original frame
df_model['goal_difference'] = df['goal_difference']

## missing_data_find1.py
# only plot the first 30 columns
cols = df.columns[:30]
# two-colour palette for the missing-value mask: blue = value present, yellow = value missing
colours = ['#000099', '#ffff00']
sns.heatmap(data=df[cols].isna(), cmap=sns.color_palette(colours))

## missing_data_find2.py
# alternative for larger datasets where the heatmap takes too long to draw:
# print the percentage of missing values for every column.
for col in df.columns:
    pct_missing = np.mean(df[col].isna())
    print(f'{col} - {round(pct_missing*100)}%')

## missing_data_indicator1.py
# life_sq: living area in square meters, excluding loggias, balconies and other non-residential areas
# life_sq has a lot of missing values; dropna=False keeps NaN as its own entry in the counts.
df['life_sq'].value_counts(dropna=False)


## missing_data_indicator2.py
# summary statistics of the life_sq column
df['life_sq'].describe()

## missing_data_indicator3.py
# boolean indicator column: True where life_sq is missing, False otherwise
df['life_sq_ismissing'] = df['life_sq'].isna()
# tally of rows with and without a life_sq value
df['life_sq_ismissing'].value_counts(dropna=False)
	# quick structural overview of df: (rows, columns) tuple, dtype of each column, column labels
	df.shape
	df.dtypes
	df.columns
	# NOTE(review): `column` is not defined in this snippet - presumably a placeholder for a column label
	df[column].value_counts(dropna=False) # string column: count of each distinct value, NaN included
	df[column].describe() # numeric column: summary statistics
	# binary result flags for the home team, taken from the sign of goal_difference:
	# positive -> home_win = 1, negative -> home_loss = 1, exactly zero -> both stay 0
	df['home_win'] = np.where(df['goal_difference'].gt(0), 1, 0)
	df['home_loss'] = np.where(df['goal_difference'].lt(0), 1, 0)
	# one-hot encode the visiting and the home team column as int64 indicator frames
	df_visitor, df_home = (pd.get_dummies(df[side], dtype=np.int64) for side in ('visitor', 'home'))
	# pair every column label of X with the matching entry of lr.coef_
	# NOTE(review): presumably lr is a fitted linear model with one coefficient per column of X - confirm
	df_ratings = pd.DataFrame(dict(team=X.columns, rating=lr.coef_))
	df_ratings
	# home indicators minus visitor indicators (element-wise, aligned on index and columns)
	# NOTE(review): presumably both frames hold 0/1 team indicators with the same columns,
	# giving +1 for the home team and -1 for the visiting team in each row - confirm
	df_model = df_home - df_visitor
	# append goal_difference from the original frame
	df_model['goal_difference'] = df['goal_difference']
	# only plot the first 30 columns
	cols = df.columns[:30]
	# two-colour palette for the missing-value mask: blue = value present, yellow = value missing
	colours = ['#000099', '#ffff00']
	sns.heatmap(data=df[cols].isna(), cmap=sns.color_palette(colours))
	# alternative for larger datasets where the heatmap takes too long to draw:
	# print the percentage of missing values for every column.
	for col in df.columns:
	    # the two statements below form the loop body; they must be nested under the
	    # `for`, otherwise the loop has no body and the code does not parse
	    pct_missing = np.mean(df[col].isnull())
	    print('{} - {}%'.format(col, round(pct_missing*100)))
	# life_sq: living area in square meters, excluding loggias, balconies and other non-residential areas
	# life_sq has a lot of missing values; dropna=False keeps NaN as its own entry in the counts.
	df['life_sq'].value_counts(dropna=False)
	# boolean indicator column: True where life_sq is missing, False otherwise
	df['life_sq_ismissing'] = df['life_sq'].isna()
	# tally of rows with and without a life_sq value
	df['life_sq_ismissing'].value_counts(dropna=False)