Skip to content

Instantly share code, notes, and snippets.

View liannewriting's full-sized avatar

Lianne & Justin @ Just into Data liannewriting

View GitHub Profile
@liannewriting
liannewriting / missing_data_find2.py
Created January 21, 2020 15:42
data_cleaning_202001
# Text alternative to a missing-data visualization, for when the dataset is
# large and plotting takes too long: print, for every column of `df`, the
# percentage of rows whose value is null (rounded to a whole number).
for col in df.columns:
    # The mean of a boolean mask is the fraction of True (i.e. null) entries.
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))
@liannewriting
liannewriting / missing_data_indicator1.py
Created January 21, 2020 15:43
data_cleaning_202001
# life_sq is the living area in square meters, excluding loggias, balconies
# and other non-residential areas; the author notes it has a lot of missing
# values. Tabulate its values with dropna=False so the NaN bucket is counted
# alongside the observed values.
df.loc[:, 'life_sq'].value_counts(dropna=False)
@liannewriting
liannewriting / missing_data_indicator2.py
Created January 21, 2020 15:45
data_cleaning_202001
# Summary statistics for the life_sq column, as produced by Series.describe().
df.loc[:, 'life_sq'].describe()
@liannewriting
liannewriting / missing_data_indicator3.py
Created January 21, 2020 15:45
data_cleaning_202001
# Add a boolean column flagging the rows where life_sq is null, then tally
# the flags to see how many rows are affected.
df['life_sq_ismissing'] = df['life_sq'].isna()
df.loc[:, 'life_sq_ismissing'].value_counts(dropna=False)
@liannewriting
liannewriting / missing_data_indicator4.py
Created January 21, 2020 15:46
data_cleaning_202001
# Show life_sq side by side with its missing-value indicator column.
df.loc[:, ['life_sq', 'life_sq_ismissing']]
@liannewriting
liannewriting / missing_data_imputation1.py
Last active January 21, 2020 19:48
data_cleaning_202001
# Median imputation for life_sq: compute the column median, print it, and
# write it into every null entry of the column.
med = df.loc[:, 'life_sq'].median()
print(med)
df['life_sq'] = df['life_sq'].fillna(value=med)
@liannewriting
liannewriting / missing_data_imputation2.py
Last active January 21, 2020 15:54
data_cleaning_202001
# Median-impute every numeric column of `df` that contains missing values,
# and record which rows were imputed in a companion boolean
# '<col>_ismissing' column.
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values

for col in numeric_cols:
    missing = df[col].isnull()
    num_missing = np.sum(missing)

    if num_missing > 0:  # only do the imputation for the columns that have missing values.
        print('imputing missing values for: {}'.format(col))
        # Store the indicator before filling: once the nulls are replaced,
        # the information about which rows were missing is gone.
        df['{}_ismissing'.format(col)] = missing
        # median() skips nulls by default. If the column is entirely null the
        # median is NaN and fillna leaves the column unchanged.
        med = df[col].median()
        df[col] = df[col].fillna(med)
@liannewriting
liannewriting / missing_data_imputation3.py
Last active January 21, 2020 15:54
data_cleaning_202001
# Impute every non-numeric column of `df` that contains missing values with
# its most frequent observed value, and record which rows were imputed in a
# companion boolean '<col>_ismissing' column.
df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values

for col in non_numeric_cols:
    missing = df[col].isnull()
    num_missing = np.sum(missing)

    if num_missing > 0:  # only do the imputation for the columns that have missing values.
        print('imputing missing values for: {}'.format(col))
        # Store the indicator before filling: once the nulls are replaced,
        # the information about which rows were missing is gone.
        df['{}_ismissing'.format(col)] = missing
        observed = df[col].dropna()
        if not observed.empty:  # an all-null column has nothing to impute with
            # value_counts() sorts by descending frequency, so the first
            # index entry is the most frequent observed value.
            top = observed.value_counts().index[0]
            df[col] = df[col].fillna(top)
@liannewriting
liannewriting / missing_data_dropping1.py
Last active January 27, 2020 16:33
data_cleaning_202001
# First step before dropping anything: for each column of `df` that has
# missing data, add a boolean '<col>_ismissing' column marking the affected
# rows.
# Iterate over a snapshot of the column labels, because the loop body adds
# new columns to `df`.
for col in list(df.columns):
    missing = df[col].isnull()
    num_missing = np.sum(missing)

    if num_missing > 0:
        print('created missing indicator for: {}'.format(col))
        df['{}_ismissing'.format(col)] = missing
@liannewriting
liannewriting / missing_data_dropping2.py
Last active January 21, 2020 15:48
data_cleaning_202001
# Drop rows with a lot of missing values: every row whose 'num_missing'
# count is greater than 35 is removed, and the result is stored in
# df_less_missing_rows (`df` itself keeps all of its rows).
#
# The per-row count is read from df['num_missing']. When that column is
# absent it is built here, so the snippet does not fail with a KeyError.
if 'num_missing' not in df.columns:
    ismissing_cols = [col for col in df.columns if str(col).endswith('_ismissing')]
    if ismissing_cols:
        # Prefer the '<col>_ismissing' indicator columns: they still record
        # the gaps even if the underlying values have since been imputed.
        df['num_missing'] = df[ismissing_cols].sum(axis=1)
    else:
        # No indicators available: count the nulls currently in each row.
        df['num_missing'] = df.isnull().sum(axis=1)

ind_missing = df[df['num_missing'] > 35].index
df_less_missing_rows = df.drop(ind_missing, axis=0)