Skip to content

Instantly share code, notes, and snippets.

@netsatsawat
Created May 19, 2019 11:51
Show Gist options
  • Save netsatsawat/e449f6c426b7737f1cef42318928ebee to your computer and use it in GitHub Desktop.
Save netsatsawat/e449f6c426b7737f1cef42318928ebee to your computer and use it in GitHub Desktop.
Feature engineering function
def create_generation_feature(age_val: int) -> str:
    """
    Map an age value onto a generation label.

    @Args:
        age_val (int): the age value from data frame
    Return:
        String output specifying the generation:
        'Millenials' when age_val < 37, 'Generation X' when age_val < 54,
        'Boomers' when age_val < 73, and 'Silent' otherwise.
    """
    # Each threshold is an exclusive upper bound, tested youngest-first, so
    # the first matching bound decides the label.
    # NOTE: a NaN age compares False against every bound and therefore
    # falls through to 'Silent'.
    if age_val < 37:
        return 'Millenials'
    if age_val < 54:
        return 'Generation X'
    if age_val < 73:
        return 'Boomers'
    return 'Silent'
def create_job_hop_index(df: pd.DataFrame, total_exp_col: str,
                         num_prev_com_col: str) -> "tuple[np.ndarray, np.ndarray]":
    """
    Function to compute the job hopper index and identify the first job or not
    @Args:
        df (pd.DataFrame): pandas data frame
        total_exp_col (str): the name of the column containing total experience
        num_prev_com_col (str): the name of the column containing numbers of
            previous companies worked
    Return:
        A pair of arrays aligned with the rows of df:
        first_job_ind: 1 where the number of previous companies is 0, else 0
        job_hop_idx: total experience divided by the number of previous
            companies, or 0. where the number of previous companies is 0
    """
    # Rows with no previous company drive both outputs, so build the mask once.
    no_prev_company = df[num_prev_com_col] == 0

    first_job_ind = np.where(no_prev_company, 1, 0)

    # The division is evaluated for every row (np.where does not
    # short-circuit); the inf/NaN produced where the divisor is 0 is
    # discarded because those rows take the 0. branch.
    job_hop_idx = np.where(no_prev_company,
                           0.,
                           df[total_exp_col] / df[num_prev_com_col])
    return first_job_ind, job_hop_idx
def compute_compa_ratio_feature(df: pd.DataFrame, salary_col: str,
                                lookup_df: pd.DataFrame=_MEDIAN_INCOME_LOOKUP,
                                col_lookup_list: list=_LOOKUP_COL):
    """
    Function to compute the compa-ratio of each employee.
    This is computed as the employee's salary divided by the median income
    found for that employee in the lookup table.
    @Args:
        df: pandas data frame with the salary of each employee
        salary_col: the name of the column in df storing income
        lookup_df: pandas data frame storing the median salary; it must
            provide the join columns plus 'MedianIncome' and 'Count'
        col_lookup_list: A list of column names used as the join keys
    Return:
        pandas data frame: df with an added 'compa_ratio' column, indexed by
        the original index values of df. The helper columns 'MedianIncome'
        and 'Count' brought in from the lookup are removed.
    """
    # Use the lookup table and join keys that were passed in; the previous
    # body read the module-level defaults directly, so caller-supplied
    # arguments were silently ignored.
    # reset_index/set_index('index') carries the original index through the
    # merge, which would otherwise renumber the rows.
    # NOTE(review): this relies on df having an unnamed index and no column
    # already called 'index' -- confirm against callers.
    merged = (
        df.reset_index()
        .merge(lookup_df, on=col_lookup_list, how="left")
        .set_index('index')
    )

    # Rows without a match in the lookup get NaN (left join).
    merged['compa_ratio'] = merged[salary_col] / merged['MedianIncome']

    return merged.drop(['MedianIncome', 'Count'], axis=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment