# Gist by @jurand71 (jurand71/597a48c0f28d404845934c824c58046f), created June 16, 2022.
# One-hot encoding of the most frequent categories of selected house-price variables.
# Import libraries
import numpy as np
import pandas as pd
# Display all columns
pd.set_option('display.max_columns', None)

# Import Houseprice data from GitHub
data = pd.read_csv('https://github.com/jurand71/datasets/raw/master/HouseSalePriceCompetition/houseprice.csv')

# Three variables were chosen from categorical variables for OneHotEncoder
usecols = ['Neighborhood', 'Exterior1st', 'Exterior2nd']
# Take an explicit copy: indicator columns are added to `data` further down,
# and assigning new columns into a bare column selection of another frame
# can trigger pandas' SettingWithCopyWarning.
data = data[usecols].copy()

# How many categories are in selected variables
for col in usecols:
    print(col, ': ', len(data[col].unique()))
# Find top categories and encode with one-hot encoding method
def find_top_categories(df: pd.DataFrame, variable: str, count: int = 10) -> list:
    """Return the most frequent values of a column, most frequent first.

    Args:
        df: Frame holding the column to inspect.
        variable: Name of the column whose values are counted.
        count: Maximum number of values to return.

    Returns:
        A list with at most ``count`` distinct values of ``df[variable]``,
        ordered by descending frequency. Missing values are not counted
        because ``value_counts`` drops them by default.
    """
    # value_counts() already sorts by descending frequency, so no extra
    # sort_values() pass is needed before taking the head.
    return list(df[variable].value_counts().head(count).index)
def onehot_encode(df: pd.DataFrame, variable: str, top_categories) -> None:
    """Add one 0/1 indicator column per category to ``df`` in place.

    For every label in ``top_categories`` a column named
    ``"<variable>_<label>"`` is created, holding 1 where ``df[variable]``
    equals the label and 0 elsewhere.

    Args:
        df: Frame to modify; new columns are appended to it.
        variable: Name of the categorical column to encode.
        top_categories: Iterable of category labels to encode.
    """
    for label in top_categories:
        # str() keeps the column-name concatenation working for
        # non-string labels (e.g. integer category codes).
        df[variable + '_' + str(label)] = np.where(df[variable] == label, 1, 0)
# Append indicator columns for the most frequent categories of each
# selected variable.
for var in usecols:
    top_categories = find_top_categories(df=data, variable=var)
    onehot_encode(df=data, variable=var, top_categories=top_categories)

# Bare expression: shows the encoded frame when run as a notebook cell.
data
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment