Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save korkridake/bb45698edea69906ee22bd985090bb0f to your computer and use it in GitHub Desktop.
Save korkridake/bb45698edea69906ee22bd985090bb0f to your computer and use it in GitHub Desktop.
MLOps Ep.4 Productionizing Training Script (Load and preprocess data)
# Materialise `dataset` as a pandas DataFrame and report its dimensions.
df = dataset.to_pandas_dataframe()
print(df.shape)
df.head()  # bare expression: only shown when an interactive session echoes it

# Basic cleaning: remove duplicate rows, then drop the listed columns.
columns_to_drop = [
    "dateCrawled",
    "dateCreated",
    "lastSeen",
    "seller",
    "name",
    "postalCode",
]
df = df.drop_duplicates()
df = df.drop(columns=columns_to_drop)

# Missing values in these columns are replaced by a fixed fallback value.
fallback_values = {
    "notRepairedDamage": "nein",
    "fuelType": "benzin",
    "model": "golf",
}
for column, fallback in fallback_values.items():
    df[column] = df[column].fillna(fallback)
def _fill_with_group_mode(frame, key, target):
    """Return ``frame[target]`` with gaps filled by the per-``key`` most common value.

    For every distinct value of ``frame[key]`` the most frequent non-null
    ``frame[target]`` value among rows with that key is determined, and rows
    whose ``target`` is missing receive that value.

    Args:
        frame: DataFrame holding both columns.
        key: Name of the grouping column.
        target: Name of the column whose missing values are filled.

    Returns:
        A Series aligned with ``frame``. Rows whose key is missing, or whose
        key never occurs together with a non-null target, are left missing
        instead of raising ``KeyError``.
    """
    # Size of every observed (key, target) combination; rows where either
    # column is missing are excluded by groupby.
    pair_sizes = frame.groupby([key, target], observed=True).size()
    if pair_sizes.empty:
        # Nothing to learn from: leave the column untouched.
        return frame[target]

    counts = pair_sizes.unstack(fill_value=0)

    # Put candidate values in order of first appearance so that ties resolve
    # to the first value encountered (idxmax returns the first maximum).
    appearance_order = frame[target].dropna().unique()
    counts = counts.reindex(columns=appearance_order, fill_value=0)
    most_common = counts.idxmax(axis=1)

    # Keys absent from `most_common` map to NaN, so those rows stay missing.
    return frame[target].fillna(frame[key].map(most_common))


# Missing vehicle types take the most common vehicle type of the same fuel type.
df["vehicleType"] = _fill_with_group_mode(df, key="fuelType", target="vehicleType")

# Missing gearboxes take the most common gearbox of the same brand.
df["gearbox"] = _fill_with_group_mode(df, key="brand", target="gearbox")
# Discard every row that still contains a missing value.
df = df.dropna()

# Replace each listed column with its LabelEncoder codes; a fresh encoder is
# fitted per column and not kept afterwards.
label_encoded_columns = (
    "offerType",
    "vehicleType",
    "fuelType",
    "gearbox",
    "notRepairedDamage",
    "brand",
    "model",
    "abtest",
)
for column in label_encoded_columns:
    df[column] = LabelEncoder().fit_transform(df[column])
# Keep registrations strictly between 1950 and 2017.
registration_in_range = (df["yearOfRegistration"] > 1950) & (df["yearOfRegistration"] < 2017)
df = df[registration_in_range]

# Keep prices strictly between 100 and 200000.
price_in_range = (df["price"] > 100) & (df["price"] < 200000)
df = df[price_in_range]

# Target is the price column; features are all remaining columns.
y = df["price"]
X = df.drop(columns="price")
# Partition features and target into training and validation sets. The seed is
# fixed for reproducibility; the split ratio is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
)

# Bundle both partitions under "train" / "test" keys.
data = {
    "train": {"X": X_train, "y": y_train},
    "test": {"X": X_test, "y": y_test},
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment