Skip to content

Instantly share code, notes, and snippets.

@finnqiao
Created January 8, 2019 03:37
Show Gist options
  • Save finnqiao/dd3ab972d88629b50b108f0e9721271e to your computer and use it in GitHub Desktop.
Save finnqiao/dd3ab972d88629b50b108f0e9721271e to your computer and use it in GitHub Desktop.
# Shared encoder for the label-encoded columns, plus a running list of the
# label -> code mapping recorded for each column it encodes.
label_encoder = LabelEncoder()
mappings = []

# Desired label orders for the ordinal categorical columns (lowest first).
educ_order = [
    'unknown',
    'illiterate',
    'basic.4y',
    'basic.6y',
    'basic.9y',
    'high.school',
    'professional.course',
    'university.degree',
]
# NOTE(review): 'jan' and 'feb' are not listed -- presumably they never occur
# in the data; confirm against the dataset.
month_order = [
    'mar', 'apr', 'may', 'jun', 'jul',
    'aug', 'sep', 'oct', 'nov', 'dec',
]
day_order = ['mon', 'tue', 'wed', 'thu', 'fri']

# Encoding strategy: ordered cat.codes for the ordinal columns, one-hot
# (dummy) columns for occupation, plain label encoding for the remaining
# low-cardinality columns.
def ordered_labels(df, col, order):
    """Replace ``df[col]`` in place with integer codes that follow ``order``.

    Each value is mapped to its position in ``order`` (first entry -> 0).
    Missing values (NaN/None) are encoded as -1, the code pandas reports
    for entries that belong to no category.

    Args:
        df: DataFrame to modify in place.
        col: Name of the column to encode.
        order: Category labels from lowest to highest. It may list labels
            that do not occur in ``df[col]`` (e.g. a rare level missing from
            one data split), so the same ``order`` always yields the same
            codes.

    Returns:
        None. The column is overwritten on ``df``.

    Raises:
        ValueError: If ``df[col]`` holds a non-null value that is not
            listed in ``order``.
    """
    column = df[col]
    unexpected = set(column.dropna().unique()) - set(order)
    if unexpected:
        # set_categories would silently turn these values into NaN (code -1),
        # so fail loudly instead.
        raise ValueError(
            'Column {!r} contains values missing from order: {}'.format(
                col, sorted(unexpected, key=str)
            )
        )
    # Unlike reorder_categories, set_categories tolerates labels in `order`
    # that are absent from the data.
    categorical = column.astype('category').cat.set_categories(order, ordered=True)
    df[col] = categorical.cat.codes.astype(int)
# One-hot encode occupation: 'job' is replaced by one indicator column per
# distinct job value.
X_df = pd.concat([X_df, pd.get_dummies(X_df['job'])], axis=1).drop('job', axis=1)

# Ordinal columns: integer codes that follow the desired label order.
ordered_labels(X_df, 'education', educ_order)
ordered_labels(X_df, 'month', month_order)
ordered_labels(X_df, 'day_of_week', day_order)

# Label-encode the remaining object-dtype columns (low cardinality).
# Existing columns are only overwritten, never added, so iterating X_df
# while assigning is safe.
for col in X_df:
    if X_df[col].dtype == 'object':
        X_df[col] = label_encoder.fit_transform(np.array(X_df[col].astype(str)))
        # LabelEncoder assigns codes 0..n_classes-1 in classes_ order, so the
        # recorded mapping must be 0-based to match the encoded column.
        mappings.append(
            {label: code for code, label in enumerate(label_encoder.classes_)}
        )

X_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment