Dashbase Configurations
@javasoze · Last active July 25, 2017

# base directory for storage
baseDir: myDataDir # e.g. /data/index

# how to handle old time-slices
retention:
  numDays: number_days # number of days before expiring time-slices; can be a fraction, e.g. 0.5 days
  expirationHandler:
    type: purge # purge the expired time-slices

# cluster configuration
cluster:
  name: clusterName # name of the cluster, e.g. myNginxLogs
  url: zookeeper_urls # zookeeper urls, e.g. zk1:2181,zk2:2181

# indexer configuration
# we suggest keeping the default indexing values
indexer:
  # the following parameters configure how often to flush the time-slice;
  # the indexer triggers a flush when either condition is met
  maxDocsPerSegment: number_events # maximum number of events to gather before flushing the time-slice, default 1,000,000
  maxIndexingDelayInSec: number_seconds # how long to wait before flushing the time-slice, default 45 seconds
  numIndexingThreads: number_threads # number of indexing threads, default 2
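
# as a concrete illustration (all values below are hypothetical), a deployment
# that keeps half a day of data and flushes more aggressively than the defaults:
baseDir: /data/index
retention:
  numDays: 0.5 # expire time-slices after 12 hours
  expirationHandler:
    type: purge
cluster:
  name: myNginxLogs
  url: zk1:2181,zk2:2181
indexer:
  maxDocsPerSegment: 500000  # flush after 500,000 events...
  maxIndexingDelayInSec: 30  # ...or after 30 seconds, whichever comes first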
# aws cloudwatch firehose that reads from a logGroup and logStream
type: cloudwatch
params:
  group: myLogGroup # log group
  stream: myLogStream # log stream
# recursively streams in all files matching a given pattern under a specified directory, where each file is line-delimited
type: dir
params:
  baseDir: mydir # directory name, e.g. /var/logs
  pattern: filePattern # regex pattern for file names, e.g. ^(.*)nginx(.*).log
# streams from a line-delimited file
type: file
params:
  file: myFile # file name
# push-based firehose listening on the /v1/firehose/http/insert end-point of a Dashbase server
type: http
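# events can be pushed to the end-point over plain http; a hypothetical smoke
# test with curl (host, port, and the line-delimited request body format are
# assumptions, not documented here):
#   curl -XPOST http://localhost:7888/v1/firehose/http/insert -d 'my log line'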
# Kafka 0.10.x firehose (https://kafka.apache.org/)
type: kafka10
params:
  hosts: brokerHosts # broker hosts
  groupId: myGroupId # consumer group id
  topic: myTopic # kafka topic
  pollIntervalMs: interval # poll interval in ms, default 1000ms
  partitions: partitionList # list of partitions to stream; if empty, partitions are managed automatically
  kafkaProps: props # additional kafka properties, see: https://kafka.apache.org/documentation/#consumerconfigs
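
# a filled-in kafka10 firehose (hypothetical hosts, group id, and topic);
# auto.offset.reset is a standard kafka consumer config, but the nested map
# form of kafkaProps shown here is an assumption:
type: kafka10
params:
  hosts: kafka1:9092,kafka2:9092
  groupId: dashbase-nginx
  topic: nginx-logs
  kafkaProps:
    auto.offset.reset: earliest # start from the beginning of the topic on first run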
# firehose that streams from an NSQ queue
type: nsq
params:
  lookupAddress: host # nsq lookup host, default: localhost
  lookupPort: port # nsq lookup port, default: 4161
  topic: myTopic # topic to read from
  channel: myChannel # channel
  queueSize: 1000000 # queue buffer size, default 1,000,000
# aws s3 firehose that reads an s3 bucket
type: s3
params:
  bucket: myBucket # bucket name
  prefix: myService/logs # optional prefix of sub-buckets
  start-after: 2017-04-09/ # optional key to start after in an ordered listing, useful for files organized by date
  pattern: "((19|20)\\d\\d)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])" # only match file names matching the pattern
  gzipped: true # optional flag to apply gzip decoding to the files
# syslog firehose that listens on a port with the provided protocol
type: syslog
params:
  port: 32376 # port to listen on, default 32376
  protocol: udp # udp/tcp, default udp
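# a running syslog firehose can be smoke-tested from the same host with the
# util-linux logger utility (hypothetical invocation; -d selects udp):
#   logger -d -n localhost -P 32376 "test message"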
# parses a line of csv (comma-separated values), with the schema defined in the pattern section
# e.g.
# 127.0.0.1,blah blah,2017/07/30,this is a message,200,Safari Webkit
type: csv
params:
  pattern:
    timeFormat: yyyy/MM/dd
    schema:
      - name: host
        type: meta
      - name: -
      - name: time
        type: time
      - name: message
        type: text
      - name: responseCode
        type: meta
      - name: agent
        type: text
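
# with the schema above, the example line maps column by column to
# (illustrative):
#   host         = 127.0.0.1          (meta)
#   -            = blah blah          (ignored)
#   time         = 2017/07/30         (parsed with yyyy/MM/dd)
#   message      = this is a message  (text)
#   responseCode = 200                (meta)
#   agent        = Safari Webkit      (text)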
# parses a json string
type: json
params:
  flatten: # if configured, json will be flattened; for details visit: https://github.com/wnameless/json-flattener
    separator: '.' # separator char, default: '.'
    keepArray: false
    unicode: false
  pattern: # by default, string values are configured as text, and numbers as numeric
    schema: # the schema overrides the defaults on a field
      - name: timestamp # look at the "timestamp" field for the time value;
                        # if not specified, auto time is configured, which takes the current system time
        type: time
      - name: myMeta # overrides a field to a different type
        type: meta
    timeFormat: epoch # time format if provided in the data, e.g. epoch, iso, or a specific format string
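
# with flatten enabled and separator '.', a nested document such as
#   {"request": {"path": "/index", "code": 200}}
# is indexed as the flat fields (illustrative):
#   request.path = /index  (text)
#   request.code = 200     (numeric)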
# parses a line of key/value pairs delimited by the specified delimiter, where each key and value are separated by the separator
# e.g. key1:val1,key2:val2, if the separator is ":" and the delimiter is ","
type: keyvalue
params:
  separator: ":"
  delimiter: ","
  pattern:
    timeFormat: epoch
    schema:
      - name: type
        type: meta
      - name: time
        type: time
      - name: message
        type: text
      - name: number
        type: numeric
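
# a hypothetical input line for the schema above, given separator ":" and
# delimiter ",":
#   type:nginx,time:1501399200,message:hello world,number:42
# which yields type=nginx (meta), time parsed as epoch, message as text,
# and number as numeric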
# parses an arbitrary string with regular expressions
# this is an example of parsing nginx access and error logs
type: regex
params:
  patterns: # a list of patterns; the parser evaluates them in order until one matches
    - regex: "^([\\d.]+) (\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (.+?)\" (\\d{3}) (\\d+) (\\d+) \"([^\"]+)\" \"([^\"]+)\""
      schema:
        - name: host
          type: meta
        - type: '-' # '-' tells the parser to ignore this group
        - type: '-'
        - type: '-'
        - type: time
        - name: request
          type: text
        - name: response
          type: meta
        - name: bytesSent
          type: numeric
        - type: '-'
        - type: '-'
        - name: agent
          type: text
      timeFormat: "dd/MMM/yyyy:HH:mm:ss Z" # time format
    - regex: "^(\\d{4}/\\d{2}/\\d{2} \\d{2}:\\d{2}:\\d{2}) \\[(\\S+)\\] (\\d+)\\#(\\d+)\\: \\*(\\d+) (.+) client: (.+), server: (.+)"
      schema:
        - type: time
        - name: level
          type: meta
        - type: '-'
        - type: '-'
        - type: '-'
        - name: message
          type: text
        - name: host
          type: meta
      timeFormat: "yyyy/MM/dd HH:mm:ss"
      additionalColumns: # additional manual columns to create when this pattern matches
        - name: error
          type: meta
          value: true
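
# hypothetical nginx lines the two patterns above are meant to match: an
# access-log line for the first pattern, an error-log line for the second:
#   127.0.0.1 - - upstream [30/Jul/2017:10:00:00 +0000] "GET /index.html" 200 512 7 "http://example.com" "Mozilla/5.0"
#   2017/07/30 10:00:00 [error] 1234#0: *5678 open() failed (2: No such file or directory), client: 127.0.0.1, server: example.com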
# dashbase configuration
dashbase:
  # cluster configuration
  cluster:
    name: my_logs
    url: localhost:2181
  # base directory
  baseDir: /data/index
  # firehose configuration
  firehose:
    type: dir
    params:
      baseDir: /data/input/nginx
      pattern: ^(.*)nginx(.*).log
  # indexing parameters (optional)
  indexer:
    maxDocsPerSegment: 1000000
  # retention parameters
  retention:
    expirationHandler:
      params:
        type: purge
    numDays: 14
  # parser configuration
  parser:
    flatten:
      separator: '.'
    pattern: # by default, string values are configured as text, and numbers as numeric
      schema: # the schema overrides the defaults on a field
        - name: timestamp # look at the "timestamp" field for the time value;
                          # if not specified, auto time is configured, which takes the current system time
          type: time
        - name: myMeta # overrides a field to a different type
          type: meta
      timeFormat: epoch

# dropwizard server configuration
server:
  applicationConnectors:
    - type: http
      port: 7888
  applicationContextPath: /
  adminContextPath: /admin
  adminConnectors:
    - type: http
      port: 7887

# dropwizard logging configuration
logging:
  level: INFO
  appenders:
    - type: file
      currentLogFilename: logs/dashbase.log
      archive: true
      archivedLogFilenamePattern: logs/dashbase-%d.log
      archivedFileCount: 9
      timeZone: UTC
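
# the server and logging sections are standard dropwizard configuration
# (https://www.dropwizard.io/), so the service starts dropwizard-style with
# the config file as the last argument; a hypothetical invocation (the jar
# name is an assumption):
#   java -jar dashbase-server.jar server dashbase.yml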