Created
May 19, 2019 11:51
-
-
Save netsatsawat/e449f6c426b7737f1cef42318928ebee to your computer and use it in GitHub Desktop.
Feature engineering function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def create_generation_feature(age_val: int) -> str:
    """
    Function to convert an age value into a generation label
    @Args:
      age_val (int): the age value from data frame
    Return:
      String output specifying the generation:
        'Millenials'   for age_val < 37
        'Generation X' for 37 <= age_val < 54
        'Boomers'      for 54 <= age_val < 73
        'Silent'       for everything else
    """
    # Upper bounds are exclusive and must be tested in ascending order.
    if age_val < 37:
        # NOTE(review): label spelling is kept exactly as it was, since the
        # string is a runtime category value that other code may match on.
        return 'Millenials'
    if age_val < 54:
        return 'Generation X'
    if age_val < 73:
        return 'Boomers'
    # Fallback bucket: also reached by any value for which every comparison
    # above is False (e.g. a NaN age).
    return 'Silent'
def create_job_hop_index(df: pd.DataFrame, total_exp_col: str,
                         num_prev_com_col: str) -> 'tuple[np.ndarray, np.ndarray]':
    """
    Function to compute the job hopper index and identify the first job or not
    @Args:
      df (pd.DataFrame): pandas data frame
      total_exp_col (str): the name of the column containing total experience
      num_prev_com_col (str): the name of the column containing numbers of previous companies worked
    Return:
      A tuple (first_job_ind, job_hop_idx) of arrays with one entry per row of df:
        first_job_ind: 1 where the number of previous companies is 0, else 0
        job_hop_idx: total experience divided by the number of previous companies,
                     or 0. where the number of previous companies is 0
    """
    # Rows with no previous company; computed once and shared by both outputs.
    is_first_job = df[num_prev_com_col] == 0

    first_job_ind = np.where(is_first_job, 1, 0)
    # np.where evaluates both branches, so the division is also performed on
    # the zero-denominator rows; those results are discarded in favour of 0.
    job_hop_idx = np.where(is_first_job,
                           0.,
                           df[total_exp_col] / df[num_prev_com_col])
    return first_job_ind, job_hop_idx
def compute_compa_ratio_feature(df: pd.DataFrame, salary_col: str,
                                lookup_df: pd.DataFrame=_MEDIAN_INCOME_LOOKUP,
                                col_lookup_list: list=_LOOKUP_COL) -> pd.DataFrame:
    """
    Function to compute the compa-ratio of each employee.
    This is computed by each employee's salary divided by the median salary
    found for that employee in the lookup table
    @Args:
      df: pandas data frame with monthly salary of each employee
      salary_col: the name of the column in df storing income
      lookup_df: pandas data frame storing the median salary in a
                 'MedianIncome' column, keyed by the columns in col_lookup_list
      col_lookup_list: A list of column names used to join df with lookup_df
    Return:
      pandas data frame: df left-joined with the lookup, with a new
      'compa_ratio' column; the helper columns 'MedianIncome' and 'Count'
      are removed from the result
    """
    # Join on the caller-supplied lookup and key columns (previously the
    # module-level defaults were always used and these arguments were ignored).
    # reset_index/set_index carries the original row labels through the merge,
    # which would otherwise renumber the rows.
    # NOTE(review): this relies on reset_index() producing a column literally
    # named 'index', i.e. df has an unnamed, single-level index - confirm.
    merge_df = df.reset_index().merge(lookup_df,
                                      on=col_lookup_list,
                                      how="left").set_index('index')
    merge_df['compa_ratio'] = merge_df[salary_col] / merge_df['MedianIncome']
    # errors='ignore' lets a custom lookup without a 'Count' column be used.
    return merge_df.drop(['MedianIncome', 'Count'], axis=1, errors='ignore')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment