Skip to content

Instantly share code, notes, and snippets.

@spitz-dan-l
Created November 18, 2014 22:58
Show Gist options
  • Save spitz-dan-l/f494674b855eeea9f998 to your computer and use it in GitHub Desktop.
Save spitz-dan-l/f494674b855eeea9f998 to your computer and use it in GitHub Desktop.
One-hot encoding demo
from sklearn.feature_extraction import DictVectorizer
import pandas
def demo1(input_csv, new_csv):
    """Demonstrate one-hot encoding with scikit-learn's DictVectorizer.

    Fits an encoder on the rows of ``input_csv`` and prints the encoded
    array, then reuses the same fitted encoder on ``new_csv`` and prints
    that result as well.

    Args:
        input_csv: CSV source (anything ``pandas.read_csv`` accepts) used to
            fit the encoder.
        new_csv: CSV source holding later data; it is encoded with the
            already-fitted encoder, not used for fitting.

    Returns:
        None. Both encoded arrays are printed to stdout.
    """
    df = pandas.read_csv(input_csv)  # read csv into a DataFrame object

    # Create the one-hot encoder.
    one_hot_encoder = DictVectorizer(sparse=False)

    # The encoder builds its internal mapping from string -> column index
    # using the data in df. fit_transform() learns that mapping and encodes
    # the rows in a single pass; calling fit() and then transform() would
    # iterate the (slow, row-by-row) DataFrame generator twice for the same
    # result. This still takes a long time on large datasets.
    # Note: the output is a numpy array, NOT a DataFrame.
    one_hot_encoded_array = one_hot_encoder.fit_transform(df_to_dicts(df))
    print(one_hot_encoded_array)

    # ...later, we have new data never seen by our model before...
    df2 = pandas.read_csv(new_csv)

    # We reuse the same fitted encoder (transform only, no re-fit), so every
    # known value maps to the same column as before. Categorical values that
    # were not seen during fitting are silently ignored rather than raising.
    new_one_hot_encoded_array = one_hot_encoder.transform(df_to_dicts(df2))
    print(new_one_hot_encoded_array)
def df_to_dicts(df):
    """Yield each row of a DataFrame as a ``{column name: value}`` dict.

    Helper for feeding the data from a DataFrame into a DictVectorizer.
    Rows are produced lazily, one at a time, in the DataFrame's row order.

    Args:
        df: The pandas DataFrame to iterate over.

    Yields:
        One dict per row, keyed by column name.
    """
    # iterrows() yields (index label, row Series); only the row is needed.
    for _index, row in df.iterrows():
        yield row.to_dict()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment