Created
January 8, 2019 03:37
-
-
Save finnqiao/dd3ab972d88629b50b108f0e9721271e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Label encode categorical variables.
label_encoder = LabelEncoder()
# Collects one {class label: integer} dict per label-encoded column
# (appended by the encoding loop further down).
mappings = []

# Desired label orders for categorical columns.
educ_order = [
    'unknown',
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
]
month_order = ['mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
day_order = ['mon', 'tue', 'wed', 'thu', 'fri']

# Encoding strategy: ordered cat.codes for columns with a natural order,
# one-hot (dummy) columns for job, and plain label encoding for the rest.
def ordered_labels(df, col, order):
    """Replace ``df[col]`` in place with integer codes that follow ``order``.

    Each value is mapped to its position in ``order`` (first label -> 0,
    second -> 1, ...). Labels listed in ``order`` but absent from the data
    are allowed, so the codes stay the same no matter which labels happen
    to occur in a given sample. Missing values (NaN) are encoded as -1.

    Args:
        df: DataFrame to modify in place.
        col: Name of the column to encode.
        order: Sequence of labels from lowest to highest rank.

    Returns:
        None. ``df[col]`` is overwritten with integer codes.

    Raises:
        ValueError: If the column contains a label that is not in ``order``.
    """
    as_category = df[col].astype('category')

    # Fail loudly on labels that have no rank instead of silently turning
    # them into -1 (set_categories would otherwise map them to NaN).
    unexpected = [label for label in as_category.cat.categories if label not in order]
    if unexpected:
        raise ValueError(
            'Column {!r} contains labels missing from the requested order: {!r}'.format(
                col, unexpected
            )
        )

    # set_categories, unlike reorder_categories, does not require every
    # label in ``order`` to be present in the data.
    as_category = as_category.cat.set_categories(order, ordered=True)
    df[col] = as_category.cat.codes.astype(int)
# Use dummy variables for occupation: one indicator column per job value
# replaces the original 'job' column.
X_df = pd.concat([X_df, pd.get_dummies(X_df['job'])], axis=1).drop('job', axis=1)

# Use ordered cat.codes for days, months, and education.
ordered_labels(X_df, 'education', educ_order)
ordered_labels(X_df, 'month', month_order)
ordered_labels(X_df, 'day_of_week', day_order)

# Same label encoding for the remaining object-dtype columns since they
# are low cardinality.
for col in X_df:
    if X_df[col].dtype == 'object':
        # astype(str) makes every value a string before encoding (a missing
        # value becomes a literal string label).
        X_df[col] = label_encoder.fit_transform(X_df[col].astype(str))
        # LabelEncoder assigns codes 0..n-1 in the order of classes_, so
        # record exactly those codes to keep the mapping in sync with the data.
        mappings.append(
            {label: code for code, label in enumerate(label_encoder.classes_)}
        )

X_df.head()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment