krsnewwave/nvtabular_movielens_demo.py Secret

## nvtabular_movielens_demo.py
# Data used by this workflow (per the original author's notes):
#   movies        - movie id and genres
#   train / valid - ratings with columns userId, movieId, rating (1-5)
# All of it is stored in parquet format.

# Start from the userId and movieId columns and join in the external
# `movies` table on "movieId". No dataset is passed in at this point; the
# "train" and "valid" datasets are supplied later.
joined = ["userId", "movieId"] >> nvt.ops.JoinExternal(movies, on=["movieId"])

# Apply Categorify to everything coming out of the join.
# NOTE(review): that presumably includes the columns contributed by `movies`
# (e.g. genres), not just the user and movie ids - confirm.
cat_features = joined >> nvt.ops.Categorify()

# Turn explicit ratings into an implicit signal: values above 3 (4 and 5 on
# the 1-5 scale) become 1, everything else 0, cast to int8.
ratings = nvt.ColumnGroup(["rating"]) >> nvt.ops.LambdaOp(lambda col: (col > 3).astype("int8"))

# Final output of the graph: the categorical features plus the binarized rating.
output = cat_features + ratings

# A Workflow is comparable to a pipeline in sklearn.
workflow = nvt.Workflow(output)
# Bare expression whose value is not stored; presumably meant to display the
# operator graph as notebook cell output - confirm.
output.graph
	# NOTE(review): this tab-indented section repeats the unindented statements
	# above it line for line; it looks like a duplicated paste and would not
	# parse as plain Python in this position - confirm which copy is intended.
	#
	# Data used by this workflow (per the original author's notes):
	#   movies        - movie id and genres
	#   train / valid - ratings with columns userId, movieId, rating (1-5)
	# All of it is stored in parquet format.

	# Start from the userId and movieId columns and join in the external
	# `movies` table on "movieId". No dataset is passed in at this point; the
	# "train" and "valid" datasets are supplied later.
	joined = ["userId", "movieId"] >> nvt.ops.JoinExternal(movies, on=["movieId"])

	# Apply Categorify to everything coming out of the join.
	# NOTE(review): that presumably includes the columns contributed by `movies`
	# (e.g. genres), not just the user and movie ids - confirm.
	cat_features = joined >> nvt.ops.Categorify()

	# Turn explicit ratings into an implicit signal: values above 3 (4 and 5 on
	# the 1-5 scale) become 1, everything else 0, cast to int8.
	ratings = nvt.ColumnGroup(["rating"]) >> nvt.ops.LambdaOp(lambda col: (col > 3).astype("int8"))

	# Final output of the graph: the categorical features plus the binarized rating.
	output = cat_features + ratings

	# A Workflow is comparable to a pipeline in sklearn.
	workflow = nvt.Workflow(output)
	# Bare expression whose value is not stored; presumably meant to display the
	# operator graph as notebook cell output - confirm.
	output.graph