# This is a sample config file that you can use as a reference for building your own.
# The "features" section describes how to map your input events into ML features that Metarank understands.
# See doc/feature_extractors.md for a general overview of possible things you can do, and doc/features
# for detailed per-extractor configuration options.
# These features can be shared between multiple models, so if you have a model A using features 1-2-3 and
# a model B using features 1-2, then all three features will be computed only once.
# You need to explicitly include a feature in the model configuration for Metarank to use it.
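#
# For illustration only: a rough sketch of the kind of item metadata event these extractors read.
# The exact JSON schema is defined in Metarank's event docs; treat the field names below as assumptions:
# {
#   "event": "item",
#   "id": "event-1",
#   "item": "movie-42",
#   "timestamp": "1654110540000",
#   "fields": [
#     {"name": "popularity", "value": 85.0},
#     {"name": "genres", "value": ["drama", "comedy"]}
#   ]
# }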
features:
  - name: popularity
    type: number
    scope: item
    source: item.popularity
    # TTL and refresh fields are part of every feature extractor that Metarank supports.
    # TTL sets the data retention period: if a feature receives no updates for a long time,
    # its value will eventually be dropped.
    ttl: 60d
    # The refresh parameter downsamples how often feature updates are emitted. For example, the
    # window_counter extractor counts the number of clicks that happened for an item. Incrementing
    # such a counter for a single day is an extremely lightweight operation, but computing the
    # window sums is not. As ML models don't always need perfectly up-to-date counter values, the
    # window sums can be recomputed only periodically (e.g. once per hour), which improves
    # throughput a lot at the cost of slightly stale data during inference.
    # See the commented sketch after this list.
    refresh: 1h
  - name: genre
    type: string
    scope: item
    source: item.genres
    values:
      - drama
      - comedy
      - thriller
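# For illustration only: the window counter mentioned in the refresh comment above could be
# configured roughly like this. The type name and keys below are assumptions, not copied from
# the docs; see doc/features for the real per-extractor options.
# - name: click_count
#   type: window_counter     # assumed type name
#   interaction: click       # which interaction type to count
#   scope: item
#   bucket: 24h              # assumed key: size of a single counting bucket
#   periods: [7, 30]         # assumed key: windows (in buckets) to sum over
#   refresh: 1h              # window sums are recomputed at most once per hour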
models:
  # see supported-ranking-models.md for more details on supported ML models
  default: # name of the model, used in the inference process as part of the path, like /rank/default
    type: lambdamart # for now, only lambdamart is supported
    path: /tmp/xgboost.model # URI schemes like s3:// are also supported
    backend:
      type: xgboost # supported values: xgboost, lightgbm
      iterations: 100 # optional (default 100), number of iterations while training the model
      seed: 0 # optional (default = random), a seed to make training deterministic
    weights: # types and weights of the interactions used in model training
      click: 1 # you can increase the weight of some events to hint the model to optimize more for them
    features: # features from the previous section used in the model
      - popularity
      - genre
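# For illustration only: once trained, the "default" model above can be queried via the inference
# API, roughly like this. The request body here is an assumption; check Metarank's API docs for
# the real ranking request schema:
# curl -X POST http://localhost:8080/rank/default -d '{
#   "event": "ranking", "id": "rank-1", "user": "u1", "session": "s1",
#   "timestamp": "1654110540000",
#   "items": [{"id": "movie-42"}, {"id": "movie-43"}, {"id": "movie-44"}]
# }'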
# Where to take events from, and where to write intermediate data.
bootstrap:
  workdir: file:///tmp/bootstrap
  # how many CPUs to use to process the data
  # parallelism: 1
  # inject synthetic "user examined these items in ranking" impression events,
  # enabled by default; see doc/click-models.md for details
  # syntheticImpression:
  #   enabled: true
  #   eventName: impression # can be customized
  source:
    type: file
    path: file:///ranklens/events/
    # possible options: file, kafka, pulsar
    # type: kafka
    # brokers: [broker1, broker2]
    # topic: events
    # groupId: metarank
    # offset: earliest|id=<numerical offset id>|latest|ts=<unixtime>|last=<duration>
    # type: pulsar
    # serviceUrl: <pulsar service URL>
    # adminUrl: <pulsar service HTTP admin URL>
    # topic: events
    # subscriptionName: metarank
    # subscriptionType: exclusive # options are exclusive, shared, failover
    # offset: earliest|id=<numerical offset id>|latest|ts=<unixtime>|last=<duration>
    # offset options are:
    # earliest - start from the first stored message in the topic
    # id=<offset id> - start from a specific offset id
    # latest - consume only events that came recently (after we connected)
    # ts=<timestamp> - start from a specific absolute timestamp in the past
    # last=<duration> - consume only events that happened within the given relative duration
    # duration supports the following patterns: 1s, 1m, 1h, 1d
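# For illustration only: the event files read above contain JSON events. A click interaction
# matching the "click" weight in the model section might look roughly like this; the field
# names are assumptions, see Metarank's event schema docs for the real format:
# {
#   "event": "interaction",
#   "id": "event-2",
#   "ranking": "rank-1",
#   "user": "u1",
#   "session": "s1",
#   "type": "click",
#   "item": "movie-42",
#   "timestamp": "1654110550000"
# }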
# options to use for the inference process
inference:
  port: 8080
  host: "0.0.0.0"
  source: # a source of events to be processed in real-time
    # possible options are: kafka, pulsar, rest
    # kafka and pulsar definitions are the same as in the bootstrap.source section
    type: rest
    bufferSize: 1000 # optional (default 10000), size of an internal buffer for feedback events
    host: localhost # hostname of the Metarank API
    port: 8080 # its port
  state: # a place to store the feature values for ML inference
    # Local memory
    # A node-local in-memory storage without any persistence. It loads the latest feature values
    # from the bootstrap process automatically, so you don't need to run the Upload job with this
    # type of storage. Suitable only for local testing: on restart it loses all accumulated data
    # and starts again from the bootstrap-stage values.
    type: memory
    format: json # Metarank supports json and protobuf: json is human-readable, protobuf is much more compact
    # Remote Redis, with persistence.
    # Requires the Upload job to be run to upload all the historical feature values
    # computed during the bootstrap into Redis.
    # type: redis
    # host: localhost
    # format: json
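# For illustration only: with the rest source above, real-time events (interactions, rankings,
# item metadata updates) can be POSTed to the running API. The endpoint path below is an
# assumption; check Metarank's API docs for the exact routes:
# curl -X POST http://localhost:8080/feedback -d '{
#   "event": "interaction", "id": "event-3", "ranking": "rank-1",
#   "user": "u1", "session": "s1", "type": "click", "item": "movie-42",
#   "timestamp": "1654110560000"
# }'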