Skip to content

Instantly share code, notes, and snippets.

View liannewriting's full-sized avatar

Lianne & Justin @ Just into Data liannewriting

View GitHub Profile
@liannewriting
liannewriting / gist:db702ab91861dc77280deabcbf79b4a1
Last active January 17, 2020 16:11
data cleaning - Exploratory analysis
# Quick first look at the DataFrame `df` before any cleaning.
df.shape  # (number of rows, number of columns)
df.dtypes  # dtype of each column
df.columns  # column labels
# NOTE: `column` is a placeholder for a column label; it is not defined in this snippet.
df[column].value_counts(dropna=False) # frequency of each value, NaN counted too (use for string columns)
df[column].describe() # summary statistics (use for numeric columns)
@liannewriting
liannewriting / add_new_features_home.py
Created January 18, 2020 23:19
sports_betting202001
# Encode the home team's result as two 0/1 columns derived from goal_difference:
# home_win is 1 when it is positive, home_loss is 1 when it is negative.
# A goal_difference of exactly 0 leaves both columns at 0.
goal_diff = df['goal_difference']
home_won = goal_diff > 0
home_lost = goal_diff < 0
df['home_win'] = np.where(home_won, 1, 0)
df['home_loss'] = np.where(home_lost, 1, 0)
@liannewriting
liannewriting / transform_the_data.py
Created January 18, 2020 23:20
sports_betting202001
# One-hot encode the 'visitor' and 'home' columns into int64 indicator frames,
# one column per distinct value; visitor is encoded first, then home.
df_visitor, df_home = (
    pd.get_dummies(df[side], dtype=np.int64) for side in ('visitor', 'home')
)
@liannewriting
liannewriting / display_result.py
Created January 18, 2020 23:23
sports_betting202001
# Pair each column label of X with the matching entry of lr.coef_
# and display the resulting two-column table.
ratings_data = {'team': X.columns, 'rating': lr.coef_}
df_ratings = pd.DataFrame(ratings_data)
df_ratings
@liannewriting
liannewriting / get_final_dataset.py
Last active January 20, 2020 16:44
sports_betting202001
# Subtract the visitor indicators from the home indicators (home - visitor).
# Assuming df_home and df_visitor are 0/1 indicator frames with one column per
# team, each row ends up with +1 for the home team and -1 for the visitor.
# fill_value=0 treats a team column that exists in only one of the two frames
# as all zeros in the other, instead of producing an all-NaN column.
df_model = df_home.sub(df_visitor, fill_value=0)
# Carry the target over from the original frame (aligned on the row index).
df_model['goal_difference'] = df['goal_difference']
@liannewriting
liannewriting / missing_data_find1.py
Last active January 21, 2020 15:41
data_cleaning_202001
cols = df.columns[:30]  # restrict the plot to the first 30 columns
# Two-colour palette: blue (#000099) = value present, yellow (#ffff00) = value missing.
colours = ['#000099', '#ffff00']
missing_mask = df[cols].isnull()
two_tone = sns.color_palette(colours)
sns.heatmap(missing_mask, cmap=two_tone)
@liannewriting
liannewriting / missing_data_find2.py
Created January 21, 2020 15:42
data_cleaning_202001
# For a larger dataset where a visualization takes too long, print the
# percentage of missing values in every column instead.
for col in df.columns:
    # The mean of the boolean null-mask is the fraction of missing entries.
    pct_missing = np.mean(df[col].isnull())
    print(f'{col} - {round(pct_missing * 100)}%')
@liannewriting
liannewriting / missing_data_indicator1.py
Created January 21, 2020 15:43
data_cleaning_202001
# life_sq has a lot of missing values.
# life_sq: living area in square meters, excluding loggias, balconies and other non-residential areas
# Count each distinct life_sq value; dropna=False keeps NaN as its own entry so the missing count is visible.
df['life_sq'].value_counts(dropna=False)
@liannewriting
liannewriting / missing_data_indicator2.py
Created January 21, 2020 15:45
data_cleaning_202001
# Summary statistics for the life_sq column.
df['life_sq'].describe()
@liannewriting
liannewriting / missing_data_indicator3.py
Created January 21, 2020 15:45
data_cleaning_202001
# Add a boolean flag column: True where life_sq is missing, False otherwise.
life_sq = df['life_sq']
df['life_sq_ismissing'] = life_sq.isna()
# Tally the flag values to see how many rows were missing.
df['life_sq_ismissing'].value_counts(dropna=False)