Skip to content

Instantly share code, notes, and snippets.

@yijia2413
Created November 7, 2018 06:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yijia2413/088ad1fc70f8c3d9dd3e471953f06076 to your computer and use it in GitHub Desktop.
Save yijia2413/088ad1fc70f8c3d9dd3e471953f06076 to your computer and use it in GitHub Desktop.
onehot encoder & label encoder 使用
# labelencoder 和 onehotencoder 处理测试集中未出现的样本会报错,需要进行处理
import pickle
from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Load the raw CSV, silently skipping malformed rows.
# `on_bad_lines='skip'` (pandas >= 1.3) replaces the old
# `error_bad_lines=False, warn_bad_lines=False` pair, which was removed in
# pandas 2.0 and makes read_csv raise TypeError there.
df1 = pd.read_csv("data/xxx.csv", encoding='utf-8', on_bad_lines='skip')
# Discard the last column, then the named columns that are not needed.
df1 = df1.iloc[:, :-1]
df1 = df1.drop(['xxx', 'aaa'], axis=1)
# Fill missing values with the string '0' so that every cell holds a value
# before the frame is label-encoded.
df1 = df1.fillna('0')
# Label-encode the whole DataFrame in one pass. The defaultdict lazily
# creates one LabelEncoder per column name, so after the call `d` maps each
# column name to the encoder that was fitted on that column.
from collections import defaultdict
d = defaultdict(LabelEncoder)
# Cast every column to str before fitting: after fillna('0') a numeric
# column can hold both numbers and the string '0', and LabelEncoder needs
# uniformly typed input. This also matches the per-column encoding, which
# fits on `astype(str)` values.
fit = df1.apply(lambda x: d[x.name].fit_transform(x.astype(str)))
# The label encoders have to be persisted for later reuse, and they can only
# be saved one column at a time, so keep one encoder per column in a list.
# The last column is deliberately left out (presumably the target column --
# confirm against the data).
feature_columns = list(df1.columns)[:-1]
le_list = [LabelEncoder() for _ in feature_columns]
for encoder, feature in zip(le_list, feature_columns):
    # Fit on the string form of the values so that every column is encoded
    # from a single, uniform dtype.
    df1[feature] = encoder.fit_transform(df1[feature].astype(str).values)
# Save: write each fitted encoder to its own pickle file, numbered by the
# position of the encoder in `le_list`.
for i, encoder in enumerate(le_list):
    dst = 'model/tmp-model/proc_le{}.pkl'.format(i)
    # The context manager closes (and flushes) the file even if dump fails.
    with open(dst, 'wb') as fh:
        pickle.dump(encoder, fh)
# Load: read the pickled encoders back in the same order they were saved.
# NOTE: the count (4) is hard-coded and must equal the number of
# `proc_le{i}.pkl` files written by the save step.
# Only unpickle files you created yourself; pickle can execute arbitrary
# code while loading.
le_load = []
for i in range(4):
    dst = 'model/tmp-model/proc_le{}.pkl'.format(i)
    with open(dst, 'rb') as fh:
        le_load.append(pickle.load(fh))
# Encode a new DataFrame with the loaded encoders; encoder i is applied to
# the i-th column of df3, so the column order must match the training frame.
for i, col in enumerate(df3.columns):
    # The encoders were fitted on `astype(str)` values, so the new values
    # must be cast the same way or they will not match the learned classes.
    # LabelEncoder.transform still raises ValueError for labels that were
    # never seen during fitting.
    df3[col] = le_load[i].transform(df3[col].astype(str).values)
# Split the frame into features (all but the last column) and the target
# (last column, kept two-dimensional by the `-1:` slice).
# NOTE(review): `df` is not defined in this snippet -- presumably the
# label-encoded frame from the previous steps; confirm.
X = df.iloc[:, :-1]
y = df.iloc[:, -1:]
# Save the encoding
import pickle
# declare OneHotEncoder from sklearn; handle_unknown='ignore' makes
# transform() tolerate categories that were not present during fit
oneHot = OneHotEncoder(categories='auto', handle_unknown='ignore')
# fit our X to oneHot encoder
oneHot.fit(X)
# Persist the encoder fitted on X *before* it is re-fitted on y below.
# The file name matches the 'proc_onehot.pkl' path the loading step reads
# (it was previously written as 'onehot.pkl', which the loader never found).
with open('model/tmp-model/proc_onehot.pkl', 'wb') as fh:
    pickle.dump(oneHot, fh)
# transform
X = oneHot.transform(X).toarray()
# fit our y to oneHot encoder (this re-fits the same object, so from here on
# `oneHot` encodes the target, not the features)
oneHot.fit(y)
# transform
y = oneHot.transform(y).toarray()
# Load the pickled one-hot encoder.
# Only unpickle files you created yourself; pickle can execute arbitrary
# code while loading.
with open('model/tmp-model/proc_onehot.pkl', 'rb') as fh:
    enc = pickle.load(fh)
# Split the new frame the same way as the training data: every column but
# the last is a feature, the last column is the target.
testX = newdf.iloc[:, :-1]
testy = newdf.iloc[:, -1:]
# Fill missing values with the string '0' before encoding.
testX = testX.fillna('0')
# transform() returns a sparse matrix; densify it for downstream use.
testX_enc = enc.transform(testX)
testX_enc = testX_enc.toarray()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment