Skip to content

Instantly share code, notes, and snippets.

View terrah27's full-sized avatar

Tara terrah27

  • Graham Healthcare Group
  • Pittsburgh, PA
  • X @terrah27
View GitHub Profile
# Impute with scikit-learn SimpleImputer
from sklearn.impute import SimpleImputer

# strategy='most_frequent' fills each missing entry with its column's mode
imputer = SimpleImputer(strategy='most_frequent')
# learn the per-column fill values and apply them in a single call
# NOTE(review): by default the transformer returns a NumPy array, so
# imputed_df presumably stops being a DataFrame here — confirm downstream use
imputed_df = imputer.fit_transform(imputed_df)
# check number of missing values
print(f"Missing Values Before Flagging: {df['OCCUPATION_TYPE'].isnull().sum()}")
# check values of OCCUPATION_TYPE feature
print(df['OCCUPATION_TYPE'].value_counts())
# replace values with a missing-data flag: 1=data missing, 0=data present
# (nulls satisfy the condition and therefore receive the "value if true")
# NOTE(review): if 1 is meant to mean "data present", swap the two values below
df['OCCUPATION_TYPE'] = np.where(
    df['OCCUPATION_TYPE'].isnull(),  # condition
    1,  # value if true
    0,  # value if false
)
# impute using fillna
# make a copy of dataframe for example purposes
imputed_df = df_threshold.copy()
# list of columns to impute
impute_cols = ['AMT_REQ_CREDIT_BUREAU_YEAR',
'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_MON',
# remove columns based on percentage of missing values
def drop_missing_values(dataframe, threshold):
    """Drop every column whose share of missing values reaches ``threshold``.

    Args:
        dataframe: pandas DataFrame to filter; it is not modified in place.
        threshold: Fraction of missing values (e.g. ``0.5`` for 50%) at or
            above which a column is dropped. The comparison is inclusive.

    Returns:
        A new DataFrame without the columns that met the threshold.
    """
    # create list of features with missing values over threshold
    to_drop = [
        col for col in dataframe
        if dataframe[col].isnull().sum() / len(dataframe) >= threshold
    ]
    print('Columns to drop: ' , (len(to_drop)))
    # drop() builds a new frame; return it so the caller actually receives
    # the filtered data instead of None
    return dataframe.drop(columns=to_drop)
# create a function to show missing value info
def get_missing_values_info(df):
# find missing values in each column
count_missing = df.isnull().sum()
# get missing values as percent
percent_missing = (100 * count_missing / df.shape[0]).round(1)
# Make dataframe with the results
# report how many columns have missing values relative to the total column count
# (presumably missing_values_list holds one entry per affected column — confirm)
cols_missing_vals = len(missing_values_list)
df_cols = len(df.columns)
print(f'Columns With Missing Values: {cols_missing_vals}')
print(f'Total Columns: {df_cols}')
print(f'Percent of Columns with Missing Values: {round(cols_missing_vals/df_cols*100,1)}%')
>>> Columns With Missing Values: 67
>>> Total Columns: 122
# rename columns using a mapping that may list labels absent from worker_df
column_renames = {
    'Worker Name': 'worker_name',
    'Worker DOB': 'worker_dob',
    'Termination Date': 'termination_date',
    'Team': 'team',
    'Worker Status': 'worker_status',
    'Hire Date': 'hire_date',
    'Worker ID': 'worker_id',
}
# errors='raise' makes rename fail when a mapped label is not found in the frame
worker_df_renamed = worker_df.rename(columns=column_renames, errors='raise')
worker_df_renamed.head()
# Headers are assigned by position: each name in the list replaces the
# existing column at the same index, so we have to be careful to give the
# names in the correct order.
worker_df.columns = ['worker name', 'worker id', 'hire date', 'worker status', 'team']
# preview the first rows to check the new headers
worker_df.head()
# assign column headers directly by overwriting the frame's column labels
# with a list of five snake_case names
worker_df.columns = ['worker_id', 'worker_name', 'hire_date', 'worker_status', 'team']
# preview the first rows to check the new headers
worker_df.head()
# rename columns using a mapping that may list labels absent from worker_df
column_renames = {
    'Worker Name': 'worker_name',
    'Worker DOB': 'worker_dob',
    'Termination Date': 'termination_date',
    'Team': 'team',
    'Worker Status': 'worker_status',
    'Hire Date': 'hire_date',
    'Worker ID': 'worker_id',
}
# no errors= argument: rename's default handling of unmatched labels applies
worker_df_renamed = worker_df.rename(columns=column_renames)
worker_df_renamed.head()