Skip to content

Instantly share code, notes, and snippets.

@Coldsp33d
Created June 27, 2019 21:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Coldsp33d/eef541de023f8d47bc41b2210ed8564e to your computer and use it in GitHub Desktop.
Save Coldsp33d/eef541de023f8d47bc41b2210ed8564e to your computer and use it in GitHub Desktop.
def load_data(datafile, encoder=None):
    """Load a CSV dataset and one-hot encode its object-dtype columns.

    The file is read with its first row as the header. The
    ``job_performance`` column is split off as the target; every other
    column is a feature. Infinite feature values are treated as missing,
    missing numeric values are filled with their column mean, and
    object-dtype columns are replaced by their one-hot encoding.

    Args:
        datafile: Path or file-like object accepted by ``pandas.read_csv``.
        encoder: An already-fitted encoder exposing ``transform`` (for
            example the one returned by an earlier call to this function).
            When ``None``, a new ``OneHotEncoder`` is created and fitted on
            the object columns of this file.

    Returns:
        A ``(data_x, data_y, encoder)`` tuple: the feature frame (non-object
        columns followed by integer-labelled one-hot columns), the
        single-column target frame, and the encoder that was used.
    """
    # Cap on how many object columns are fed to the encoder; value kept
    # from the original implementation.
    max_encoded_columns = 287

    data = pd.read_csv(datafile, header=0, low_memory=False)
    data_y = data[['job_performance']]
    data_x = data.drop(['job_performance'], axis=1)

    # Treat +/-inf as missing so they are imputed together with real NaNs.
    data_x = data_x.replace([np.inf, -np.inf], np.nan)
    # numeric_only=True: object columns are still present at this point and
    # pandas >= 2.0 raises TypeError when asked for their mean.
    data_x = data_x.fillna(data_x.mean(numeric_only=True))

    # Compare against None explicitly so a supplied encoder is never
    # discarded just because it evaluates as falsy.
    if encoder is None:
        encoder = OneHotEncoder(handle_unknown='ignore', categories='auto')
        encode = encoder.fit_transform
    else:
        encode = encoder.transform

    categorical = data_x.select_dtypes(object)
    remaining = data_x.drop(categorical.columns, axis=1)

    # Missing categorical values become the literal category "0"; everything
    # is cast to str so the encoder sees one uniform type per column.
    encoder_input = (
        categorical
        .iloc[:, :max_encoded_columns]
        .fillna(0)
        .astype(str)
    )
    dummies = pd.DataFrame(
        encode(encoder_input).toarray(),
        index=data_x.index,
        dtype=int,
    )

    data_x = pd.concat([remaining, dummies], axis=1)
    return (data_x, data_y, encoder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment