Skip to content

Instantly share code, notes, and snippets.

@jurand71
Created June 16, 2022 10:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jurand71/46d7e641193a562bdbd6143f6b0dba19 to your computer and use it in GitHub Desktop.
Save jurand71/46d7e641193a562bdbd6143f6b0dba19 to your computer and use it in GitHub Desktop.
# Import libraries
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None
# Load the house-price data set directly from the GitHub repository
df = pd.read_csv(
    'https://github.com/jurand71/datasets/raw/master/HouseSalePriceCompetition/houseprice.csv'
)
# Determine categorical variables in the dataset: text columns are stored
# with the object dtype, or with a dedicated string dtype when pandas is
# configured to use one, so accept both.
categorical_variables = [
    var for var in df.columns
    if pd.api.types.is_object_dtype(df[var])
    or pd.api.types.is_string_dtype(df[var])
]
# Explore the cardinality: map every categorical column name to the list of
# distinct values found in that column.
categories = {
    cat_variable: list(df[cat_variable].unique())
    for cat_variable in categorical_variables
}
# Keep only the three categorical columns that will be one-hot encoded
usecols = ['HeatingQC', 'KitchenQual', 'CentralAir']
df = df.loc[:, usecols]
# Import OneHotEncoder class
from sklearn.preprocessing import OneHotEncoder

# Options that are spelled the same way in every scikit-learn release
encoder_options = dict(
    categories='auto',       # learn the categories of each column during fit()
    drop='first',            # return k-1 dummies per variable; drop=None returns k
    handle_unknown='error',  # raise if transform() meets a category unseen in fit()
)
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`
    enc = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    # Older releases only accept the original `sparse` keyword
    enc = OneHotEncoder(sparse=False, **encoder_options)

# Treat missing values as their own 'Missing' category; fill once and reuse
# the same frame for both fit and transform.
df_filled = df.fillna('Missing')
enc.fit(df_filled)
# Learned categories in dataset
enc.categories_
# Transform encoding for dataset
enc_data = enc.transform(df_filled)
# Convert array to dataframe, labelling the columns with the encoded feature
# names and keeping the original row index
pd.DataFrame(enc_data, columns=enc.get_feature_names_out(), index=df.index).head()
# Retrieve the feature names
enc.get_feature_names_out()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment