This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Wrap the pre-split parquet files as NVTabular datasets.
train_dataset = nvt.Dataset([os.path.join(WORKING_DIR, "train.parquet")])
valid_dataset = nvt.Dataset([os.path.join(WORKING_DIR, "valid.parquet")])

# Fit the workflow (compute per-column statistics such as category
# mappings) on the training split only, so validation data never
# leaks into the fitted stats.
workflow.fit(train_dataset)

# Output dtypes: categorical id columns as int64, the label as float32.
CATEGORICAL_COLUMNS = ["userId", "movieId"]
LABEL_COLUMNS = ["rating"]
dict_dtypes = {
    **{col: np.int64 for col in CATEGORICAL_COLUMNS},
    **{col: np.float32 for col in LABEL_COLUMNS},
}

# Apply the fitted workflow to the training data and write the result
# back to parquet, shuffling rows within each partition so downstream
# training sees mixed examples.
# NOTE(review): "genres" appears in `cats` but not in CATEGORICAL_COLUMNS,
# so it gets no explicit entry in dict_dtypes — confirm this is intended.
workflow.transform(train_dataset).to_parquet(
    output_path=os.path.join(WORKING_DIR, "train"),
    shuffle=nvt.io.Shuffle.PER_PARTITION,
    cats=["userId", "movieId", "genres"],
    labels=["rating"],
    dtypes=dict_dtypes,
)

# Persist the fitted workflow so inference/serving can reuse the exact
# same preprocessing statistics.
workflow.save(os.path.join(WORKING_DIR, "workflow"))
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.