Skip to content

Instantly share code, notes, and snippets.

View liannewriting's full-sized avatar

Lianne & Justin @ Just into Data liannewriting

View GitHub Profile
@liannewriting
liannewriting / missing_data_indicator4.py
Created January 21, 2020 15:46
data_cleaning_202001
# Inspect the original column side by side with its missing-value indicator.
# NOTE(review): 'life_sq_ismissing' is presumably created in an earlier gist
# in this series -- confirm it exists before running this cell.
df[['life_sq', 'life_sq_ismissing']]
@liannewriting
liannewriting / missing_data_dropping2.py
Last active January 21, 2020 15:48
data_cleaning_202001
# Drop observations that are mostly empty: any row with more than 35
# missing fields is removed from the dataset.
rows_to_drop = df.index[df['num_missing'] > 35]
df_less_missing_rows = df.drop(index=rows_to_drop)
@liannewriting
liannewriting / missing_data_imputation2.py
Last active January 21, 2020 15:54
data_cleaning_202001
# Impute the missing values and create the missing value indicator variables
# for each numeric column.
# NOTE(review): this gist preview is truncated -- the actual imputation
# statements that should follow the print() are not visible in this chunk.
df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values
for col in numeric_cols:
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:  # only do the imputation for the columns that have missing values.
        print('imputing missing values for: {}'.format(col))
@liannewriting
liannewriting / missing_data_imputation3.py
Last active January 21, 2020 15:54
data_cleaning_202001
# Impute the missing values and create the missing value indicator variables
# for each non-numeric column.
# NOTE(review): this gist preview is truncated -- the actual imputation
# statements that should follow the print() are not visible in this chunk.
df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values
for col in non_numeric_cols:
    missing = df[col].isnull()
    num_missing = np.sum(missing)
    if num_missing > 0:  # only do the imputation for the columns that have missing values.
        print('imputing missing values for: {}'.format(col))
@liannewriting
liannewriting / duplicate_data_rows.py
Created January 21, 2020 19:24
data_cleaning_202001
# The 'id' column is known to be unique, so it masks duplicates.
# Exclude it and deduplicate on all remaining columns.
df_dedupped = df.drop(columns='id').drop_duplicates()

# Comparing the shapes reveals how many duplicate rows were present.
print(df.shape)
print(df_dedupped.shape)
@liannewriting
liannewriting / duplicate_data_drop.py
Created January 21, 2020 19:24
data_cleaning_202001
# Deduplicate based on a subset of identifying variables rather than
# the full set of columns.
key = [
    'timestamp',
    'full_sq',
    'life_sq',
    'floor',
    'build_year',
    'num_room',
    'price_doc',
]
df_dedupped2 = df.drop_duplicates(subset=key)

# Shape comparison shows how many records were considered duplicates.
print(df.shape)
print(df_dedupped2.shape)
@liannewriting
liannewriting / outlier_barchart.py
Created January 21, 2020 19:29
data_cleaning_202001
# Bar chart: visualize the frequency distribution of the categorical
# 'ecology' variable.
category_counts = df['ecology'].value_counts()
category_counts.plot(kind='bar')
@liannewriting
liannewriting / string_lower_case1.py
Created January 21, 2020 19:30
data_cleaning_202001
# Frequency table for 'sub_area'; dropna=False includes missing values
# in the counts so they are not silently hidden.
df['sub_area'].value_counts(dropna=False)
@liannewriting
liannewriting / string_lower_case2.py
Created January 21, 2020 19:31
data_cleaning_202001
# Normalize 'sub_area' to lower case so categories that differ only in
# letter casing collapse into a single category.
lowered = df['sub_area'].str.lower()
df['sub_area_lower'] = lowered
df['sub_area_lower'].value_counts(dropna=False)
@liannewriting
liannewriting / categories_group1.py
Last active January 21, 2020 19:34
data_cleaning_202001
# Group some categories together.
# NOTE(review): this gist preview is truncated -- only the pre-grouping
# frequency table is visible here; the grouping logic itself is not shown.
df['ecology'].value_counts(dropna=False)