Tara terrah27

## missing_imputer.py
# Fill missing entries using scikit-learn's SimpleImputer ('most_frequent' strategy)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')
# fit_transform fits the imputer on imputed_df and transforms it in one call
imputed_df = imputer.fit_transform(imputed_df)

## missing_flag.py
# check number of missing values
print(f"Missing Values Before Flagging: {df['OCCUPATION_TYPE'].isnull().sum()}")

# check values of OCCUPATION_TYPE feature
print(df['OCCUPATION_TYPE'].value_counts())

# replace values with a missing-data flag: 1 = value was missing, 0 = value was present
# NOTE(review): this comment previously read "1=data present 0=data missing",
# which contradicted the np.where arguments; the code's behavior is kept as-is.
df['OCCUPATION_TYPE'] = np.where(df['OCCUPATION_TYPE'].isnull(),  # condition
                                 1,  # value if true (entry is null)
                                 0)  # value if false (entry is present)

## impute_mean.py
# impute using fillna
# make a copy of dataframe for example purposes
imputed_df = df_threshold.copy()

# list of columns to impute
# NOTE(review): the list literal below is never closed and no fillna call
# follows it in this snippet -- the remainder appears to be cut off and must
# be restored before this can run.
impute_cols = ['AMT_REQ_CREDIT_BUREAU_YEAR',
               'AMT_REQ_CREDIT_BUREAU_HOUR',
               'AMT_REQ_CREDIT_BUREAU_DAY',
               'AMT_REQ_CREDIT_BUREAU_WEEK',
               'AMT_REQ_CREDIT_BUREAU_MON',

## drop_missing.py
# remove columns based on percentage of missing values
def drop_missing_values(dataframe, threshold):
    """Return ``dataframe`` without the columns that are mostly missing.

    A column is dropped when its fraction of null entries is greater than
    or equal to ``threshold``. The number of dropped columns is printed.

    Args:
        dataframe: pandas DataFrame to filter; it is not modified.
        threshold: Fraction of missing values (0.0-1.0) at or above which
            a column is removed.

    Returns:
        A new DataFrame holding only the columns that were kept.
    """
    # mean of the boolean null mask == fraction of nulls in each column
    missing_fraction = dataframe.isnull().mean()

    # create list of features with missing values over threshold
    to_drop = list(missing_fraction[missing_fraction >= threshold].index)

    print('Columns to drop: ', len(to_drop))

    # drop() returns a new frame; hand it back so the caller can use it
    # (previously the result was bound to a local name and then discarded)
    return dataframe.drop(columns=to_drop)

## missing_vals_function.py
# create a function to show missing value info
def get_missing_values_info(df):
    """Compute per-column missing-value counts and percentages for ``df``.

    NOTE(review): the snippet stops before the results are assembled, so as
    written nothing is returned (the function yields None). The step
    announced by the final comment still has to be added.
    """

    # find missing values in each column
    count_missing = df.isnull().sum()

    # get missing values as percent
    # (null count relative to the number of rows, rounded to one decimal)
    percent_missing = (100 * count_missing / df.shape[0]).round(1)

    # Make dataframe with the results

## missing_values.py
# compare the number of columns that have missing values with the total
# number of columns, and report the ratio as a percentage
cols_missing_vals = len(missing_values_list)
df_cols = len(df.columns)

print(
    f'Columns With Missing Values: {cols_missing_vals}',
    f'Total Columns: {df_cols}',
    f'Percent of Columns with Missing Values: {round(cols_missing_vals/df_cols*100,1)}%',
    sep='\n',
)

# Sample output (the percentage line is not shown):
# >>> Columns With Missing Values: 67
# >>> Total Columns: 122

## rename4.py
# rename columns using a mapping that also lists extra column names;
# errors='raise' makes pandas raise a KeyError for any mapping key that is
# not an existing column of worker_df
worker_df_renamed = worker_df.rename(
    columns={
        'Worker Name': 'worker_name',
        'Worker DOB': 'worker_dob',
        'Termination Date': 'termination_date',
        'Team': 'team',
        'Worker Status': 'worker_status',
        'Hire Date': 'hire_date',
        'Worker ID': 'worker_id',
    },
    errors='raise',
)
worker_df_renamed.head()

## rename_direct_out_of_order.py
# direct assignment is positional: each name is applied to the column at the
# same position, so the list order must match the existing column order
worker_df.columns = [
    'worker name',
    'worker id',
    'hire date',
    'worker status',
    'team',
]
worker_df.head()

## rename_direct.py
# overwrite the column headers in place, one name per existing column
worker_df.columns = [
    'worker_id',
    'worker_name',
    'hire_date',
    'worker_status',
    'team',
]
worker_df.head()

## rename3.py
# rename columns using a mapping that also lists extra column names;
# no errors= argument is passed, so pandas' default ('ignore') applies and
# mapping keys that are not columns of worker_df are skipped
worker_df_renamed = worker_df.rename(
    columns={
        'Worker Name': 'worker_name',
        'Worker DOB': 'worker_dob',
        'Termination Date': 'termination_date',
        'Team': 'team',
        'Worker Status': 'worker_status',
        'Hire Date': 'hire_date',
        'Worker ID': 'worker_id',
    },
)
worker_df_renamed.head()
	# Fill missing entries using scikit-learn's SimpleImputer ('most_frequent' strategy)
	from sklearn.impute import SimpleImputer

	imputer = SimpleImputer(strategy='most_frequent')
	# fit_transform fits the imputer on imputed_df and transforms it in one call
	imputed_df = imputer.fit_transform(imputed_df)
	# check number of missing values
	print(f"Missing Values Before Flagging: {df['OCCUPATION_TYPE'].isnull().sum()}")

	# check values of OCCUPATION_TYPE feature
	print(df['OCCUPATION_TYPE'].value_counts())

	# replace values with a missing-data flag: 1 = value was missing, 0 = value was present
	# NOTE(review): this comment previously read "1=data present 0=data missing",
	# which contradicted the np.where arguments; the code's behavior is kept as-is.
	df['OCCUPATION_TYPE'] = np.where(
		df['OCCUPATION_TYPE'].isnull(),  # condition
		1,  # value if true (entry is null)
		0,  # value if false (entry is present)
	)
	# impute using fillna
	# make a copy of dataframe for example purposes
	imputed_df = df_threshold.copy()

	# list of columns to impute
	# NOTE(review): the list literal below is never closed and no fillna call
	# follows it in this snippet -- the remainder appears to be cut off and must
	# be restored before this can run.
	impute_cols = ['AMT_REQ_CREDIT_BUREAU_YEAR',
	'AMT_REQ_CREDIT_BUREAU_HOUR',
	'AMT_REQ_CREDIT_BUREAU_DAY',
	'AMT_REQ_CREDIT_BUREAU_WEEK',
	'AMT_REQ_CREDIT_BUREAU_MON',
	# remove columns based on percentage of missing values
	def drop_missing_values(dataframe, threshold):
		"""Return ``dataframe`` without the columns that are mostly missing.

		A column is dropped when its fraction of null entries is greater than
		or equal to ``threshold``. The number of dropped columns is printed.

		Args:
			dataframe: pandas DataFrame to filter; it is not modified.
			threshold: Fraction of missing values (0.0-1.0) at or above which
				a column is removed.

		Returns:
			A new DataFrame holding only the columns that were kept.
		"""
		# mean of the boolean null mask == fraction of nulls in each column
		missing_fraction = dataframe.isnull().mean()

		# create list of features with missing values over threshold
		to_drop = list(missing_fraction[missing_fraction >= threshold].index)

		print('Columns to drop: ', len(to_drop))

		# drop() returns a new frame; hand it back so the caller can use it
		# (previously the result was bound to a local name and then discarded)
		return dataframe.drop(columns=to_drop)
	# create a function to show missing value info
	def get_missing_values_info(df):
		"""Compute per-column missing-value counts and percentages for ``df``.

		NOTE(review): the snippet stops before the results are assembled, so as
		written nothing is returned (the function yields None). The step
		announced by the final comment still has to be added.
		"""
		# find missing values in each column
		count_missing = df.isnull().sum()

		# get missing values as percent
		# (null count relative to the number of rows, rounded to one decimal)
		percent_missing = (100 * count_missing / df.shape[0]).round(1)

		# Make dataframe with the results
	# compare the number of columns that have missing values with the total
	# number of columns, and report the ratio as a percentage
	cols_missing_vals = len(missing_values_list)
	df_cols = len(df.columns)

	print(
		f'Columns With Missing Values: {cols_missing_vals}',
		f'Total Columns: {df_cols}',
		f'Percent of Columns with Missing Values: {round(cols_missing_vals/df_cols*100,1)}%',
		sep='\n',
	)

	# Sample output (the percentage line is not shown):
	# >>> Columns With Missing Values: 67
	# >>> Total Columns: 122
	# rename columns using a mapping that also lists extra column names;
	# errors='raise' makes pandas raise a KeyError for any mapping key that is
	# not an existing column of worker_df
	worker_df_renamed = worker_df.rename(
		columns={
			'Worker Name': 'worker_name',
			'Worker DOB': 'worker_dob',
			'Termination Date': 'termination_date',
			'Team': 'team',
			'Worker Status': 'worker_status',
			'Hire Date': 'hire_date',
			'Worker ID': 'worker_id',
		},
		errors='raise',
	)
	worker_df_renamed.head()
	# direct assignment is positional: each name is applied to the column at the
	# same position, so the list order must match the existing column order
	worker_df.columns = [
		'worker name',
		'worker id',
		'hire date',
		'worker status',
		'team',
	]
	worker_df.head()
	# overwrite the column headers in place, one name per existing column
	worker_df.columns = [
		'worker_id',
		'worker_name',
		'hire_date',
		'worker_status',
		'team',
	]
	worker_df.head()