Created
November 7, 2018 06:50
-
-
Save yijia2413/088ad1fc70f8c3d9dd3e471953f06076 to your computer and use it in GitHub Desktop.
onehot encoder & label encoder 使用
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# LabelEncoder usage: fit one encoder per column, persist the encoders with
# pickle, reload them and apply them to a new DataFrame.
#
# NOTE: LabelEncoder.transform raises on values that were not seen during
# fit (e.g. categories that only appear in the test set); such values must
# be handled before calling transform.
import pickle
from collections import defaultdict

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# NOTE(review): error_bad_lines / warn_bad_lines were deprecated in
# pandas 1.3 and removed in pandas 2.0; on newer pandas pass
# on_bad_lines='skip' instead.
df1 = pd.read_csv("data/xxx.csv", encoding='utf-8', error_bad_lines=False, warn_bad_lines=False)
df1 = df1.iloc[:, :-1]  # drop the last column
df1 = df1.drop(['xxx', 'aaa'], axis=1)
df1 = df1.fillna('0')  # missing values become the string '0'

# Variant 1: label-encode the whole DataFrame in one go.
# defaultdict creates a separate LabelEncoder per column name on first use.
d = defaultdict(LabelEncoder)
fit = df1.apply(lambda x: d[x.name].fit_transform(x))

# Variant 2: the encoders have to be persisted, and that can only be done
# column by column, so keep one encoder per feature column in a list.
# Every column except the last one is encoded.
feature_columns = list(df1.columns)[:-1]
le_list = [LabelEncoder() for _ in feature_columns]
for encoder, feature in zip(le_list, feature_columns):
    # Values are cast to str so each column holds a single comparable type.
    df1[feature] = encoder.fit_transform(df1[feature].astype(str).values)

# Save: one pickle file per encoder, indexed by column position.
for i, encoder in enumerate(le_list):
    dst = 'model/tmp-model/proc_le{}.pkl'.format(i)
    with open(dst, 'wb') as fh:
        pickle.dump(encoder, fh)

# Load: read back exactly as many encoders as were saved above.
# SECURITY: pickle.load executes arbitrary code; only load trusted files.
le_load = []
for i in range(len(le_list)):
    dst = 'model/tmp-model/proc_le{}.pkl'.format(i)
    with open(dst, 'rb') as fh:
        le_load.append(pickle.load(fh))

# Encode a new DataFrame with the loaded encoders.
# df3 is not defined in this snippet; it is expected to list its feature
# columns in the same order as the columns the encoders were fit on.
# Columns beyond the number of loaded encoders are left untouched.
for col, encoder in zip(df3.columns, le_load):
    # Cast to str to match how the encoders were fit.
    df3[col] = encoder.transform(df3[col].astype(str).values)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# OneHotEncoder usage: fit on the training features, persist the fitted
# encoder with pickle, then reload it to encode a new DataFrame.
import pickle

from sklearn.preprocessing import OneHotEncoder

# Single location for the persisted feature encoder, shared by the save and
# load steps so the two can never point at different files.
ONEHOT_PATH = 'model/tmp-model/proc_onehot.pkl'

# df is not defined in this snippet; its last column is treated as the
# target and every other column as a feature.
X = df.iloc[:, :-1]
y = df.iloc[:, -1:]

# --- Save the encoding ---
# handle_unknown='ignore' makes transform tolerate categories that were not
# present when the encoder was fit.
oneHot = OneHotEncoder(categories='auto', handle_unknown='ignore')
# Fit the feature encoder on X and persist it.
oneHot.fit(X)
with open(ONEHOT_PATH, 'wb') as fh:
    pickle.dump(oneHot, fh)
# Transform the features into a dense array.
X = oneHot.transform(X).toarray()

# Encode the target with its own encoder so that `oneHot` keeps matching the
# feature encoder that was written to disk.
y_encoder = OneHotEncoder(categories='auto', handle_unknown='ignore')
y_encoder.fit(y)
y = y_encoder.transform(y).toarray()

# --- Load the encoding ---
# SECURITY: pickle.load executes arbitrary code; only load trusted files.
with open(ONEHOT_PATH, 'rb') as fh:
    enc = pickle.load(fh)

# newdf is not defined in this snippet; it is split the same way as df.
testX = newdf.iloc[:, :-1]
testy = newdf.iloc[:, -1:]
# NOTE(review): missing values become the string '0' here; confirm the
# training frame `df` was filled the same way before fitting.
testX = testX.fillna('0')
testX_enc = enc.transform(testX).toarray()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment