Skip to content

Instantly share code, notes, and snippets.

@lgoldstein
Created May 17, 2015 10:10
Show Gist options
  • Save lgoldstein/898b1a92a9e435c29116 to your computer and use it in GitHub Desktop.
Logstash configuration causing 1.5.0 failure
# version=1.0.12
# Read raw log lines and tag them "syslog-raw" so the first filter block
# knows to run the RFC5424 parsing chain on them.
# Only stdin is active (handy for testing by pasting sample lines); the
# commented-out syslog/tcp/udp inputs below are alternative network
# listeners, all on port 5151, that apply the same tag.
input {
stdin {
tags => [ "syslog-raw" ]
}
# syslog {
# tags => [ "syslog-raw" ]
# use_labels => false
# port => 5151
# }
# tcp {
# codec => "plain"
# data_timeout => -1
# host => "0.0.0.0"
# mode => "server"
# port => 5151
# tags => [ "syslog-raw" ]
# }
# udp {
# buffer_size => 8192
# codec => "plain"
# host => "0.0.0.0"
# port => 5151
# queue_size => 2000
# tags => [ "syslog-raw" ]
# workers => 4
# }
}
# 1st take care of any raw syslog messages.
# Each stage below adds a marker tag only when it succeeds; the conditional
# at the end requires ALL marker tags before canonicalizing the event.
filter {
  if ("syslog-raw" in [tags]) {
    # From RFC5424 (https://tools.ietf.org/html/rfc5424#section-6), abridged ABNF:
    #
    #   SYSLOG-MSG      = HEADER SP STRUCTURED-DATA [SP MSG]
    #   HEADER          = PRI VERSION SP TIMESTAMP SP HOSTNAME SP APP-NAME SP PROCID SP MSGID
    #   PRI             = "<" PRIVAL ">"        ; PRIVAL = 1*3DIGIT, range 0..191
    #   TIMESTAMP       = NILVALUE / FULL-DATE "T" FULL-TIME  ; RFC3339-style
    #   STRUCTURED-DATA = NILVALUE / 1*SD-ELEMENT
    #   SD-ELEMENT      = "[" SD-ID *(SP SD-PARAM) "]"
    #   SD-PARAM        = PARAM-NAME "=" %d34 PARAM-VALUE %d34
    #   MSG             = MSG-ANY / MSG-UTF8    ; the free-text payload (optional)
    #   NILVALUE        = "-"
    #
    # The stock SYSLOG5424LINE pattern captures the header into the
    # syslog5424_* fields consumed below.
    grok {
      # see <logstash-install-dir>/patterns/linux-syslog
      match => { "message" => "%{SYSLOG5424LINE}" }
      add_tag => [ "syslog5424-compliant" ]
    }
    date {
      # season to taste for your own syslog format(s); the tag is only
      # added when one of the patterns actually matched
      match => [ "syslog5424_ts",
                 "MMM d HH:mm:ss",
                 "MMM dd HH:mm:ss",
                 "ISO8601"
               ]
      add_tag => [ "syslog5424-timestamp" ]
    }
    # Extract tenant@enterprise from the SD-ID and keep the remaining
    # structured-data parameters for the per-parameter grokkers below
    grok {
      match => { "syslog5424_sd" => "%{USERNAME:tenant}@%{POSINT:enterprise} +%{GREEDYDATA:syslog5424_sd_elements}]" }
      add_tag => [ "syslog5424-tenant-and-enterprise" ]
    }
    # TODO need an improved pattern to allow for extra elements
    # NOTE: we use separate grokkers in order not to depend on order of elements
    grok {
      match => { "syslog5424_sd_elements" => "access=\"%{DATA:access}\"" }
      add_tag => [ "syslog5424-access" ]
    }
    grok {
      match => { "syslog5424_sd_elements" => "origin=\"%{PROG:objecttype}\"" }
      add_tag => [ "syslog5424-origin" ]
    }
    grok {
      match => { "syslog5424_sd_elements" => "agentid=\"%{DATA:agentid}\"" }
      add_tag => [ "syslog5424-agentid" ]
    }
    # Only canonicalize when every parsing stage above succeeded
    if ("syslog5424-compliant" in [tags])
    and ("syslog5424-timestamp" in [tags])
    and ("syslog5424-tenant-and-enterprise" in [tags])
    and ("syslog5424-origin" in [tags])
    and ("syslog5424-agentid" in [tags])
    and ("syslog5424-access" in [tags]) {
      # Uncomment if want to carry it over
      #mutate {
      #  # Normalize values of some known fields
      #  convert => [
      #    "syslog5424_pri", "integer",
      #    "syslog5424_ver", "integer"
      #  ]
      #}
      mutate {
        # replace the field data since we parsed the message header.
        # NOTE(review): if the (optional) MSG part is absent this leaves the
        # literal text "%{syslog5424_msg}" in [message] — confirm inputs
        # always carry a MSG part
        replace => [ "message", "%{syslog5424_msg}" ]
        # remove the fields we don't carry over to the next filter.
        # 'syslog5424_app' is deliberately NOT listed: mutate performs
        # 'rename' before the common 'remove_field' option is applied, so
        # it is renamed to 'application' below (listing it here as well, as
        # the original config did, was a confusing no-op)
        remove_field => [
          "syslog5424_pri",
          "syslog5424_ver",
          "syslog5424_ts",
          "syslog5424_sd",
          "syslog5424_msg",
          "syslog5424_sd_elements",
          "syslog5424_proc",
          "priority",
          "severity",
          "facility",
          "facility_label",
          "severity_label"
        ]
        # canonicalize some fields
        rename => [
          "syslog5424_app", "application",
          "syslog5424_host", "agenthost"
        ]
        # update the tags to include the origin in order to trigger the next filter in chain
        add_tag => [ "%{objecttype}" ]
        # mark the fact that it is no longer a raw syslog line + remove the tags we used to detect OK parsing
        remove_tag => [ "syslog-raw",
                        "syslog5424-compliant",
                        "syslog5424-timestamp",
                        "syslog5424-tenant-and-enterprise",
                        "syslog5424-access",
                        "syslog5424-origin",
                        "syslog5424-agentid",
                        "_grokparsefailure"
                      ]
      }
    } else {
      mutate {
        add_tag => [ "syslog5424-parse-error" ]
      }
    }
  }
}
# Now use conditional per-flavor filters.
# Handles nginx/apache combined-format access logs routed here by the
# "%{objecttype}" tag added in the syslog filter.
filter {
  if ("nginx-access" in [tags]) or ("apache2-access" in [tags]) {
    grok {
      match => { "message" => "%{COMBINEDAPACHELOG}" }
      add_tag => [ "http-access-combined-log" ]
    }
    date {
      match => [ "timestamp", "dd/MMM/yyyy:HH:mm:ss Z" ]
      add_tag => [ "http-access-timestamp" ]
    }
    if ("http-access-combined-log" in [tags])
    and ("http-access-timestamp" in [tags]) {
      # COMBINEDAPACHELOG leaves the user-agent quoted; re-grok it into an
      # unquoted 'useragent' field that we keep instead of 'agent'
      grok {
        match => { "agent" => "^\"%{GREEDYDATA:useragent}\"$" }
      }
      mutate {
        # Add some fields we need
        add_field => [
          "protocol", "HTTP"
        ]
        # Remove fields we don't need ('agent' is superseded by the
        # unquoted 'useragent' extracted above)
        remove_field => [
          "ident",
          "agent",
          "referrer",
          "host",
          "timestamp"
        ]
        # NOTE: the original config also gsub-ed a leading quote off 'agent'
        # here; that was dead work, because 'agent' is deleted by the
        # remove_field above (applied after gsub runs), so it was dropped
        # Canonicalize some existing fields - TODO add a 'timestamp' field with milliseconds value
        rename => [
          "clientip", "remoteaddr",
          "verb", "method",
          "request", "uri",
          "auth", "remoteuser",
          "httpversion", "version",
          "bytes", "responsesize",
          "response", "statuscode"
        ]
        convert => [
          "responsesize", "integer",
          "statuscode", "integer"
        ]
        uppercase => [ "method" ]
        # remove helper tags
        remove_tag => [ "%{objecttype}", "http-access-combined-log", "http-access-timestamp" ]
        # mark the message as complete
        add_tag => [ "http-access-ready" ]
      }
    } else {
      mutate {
        add_tag => [ "http-access-parse-error" ]
      }
    }
  }
}
# see https://docs.aws.amazon.com/ElasticLoadBalancing/latest/DeveloperGuide/access-log-collection.html
# Parses AWS ELB access-log lines: timestamp elb client:port backend:port
# request_processing_time backend_processing_time response_processing_time
# elb_status_code backend_status_code received_bytes sent_bytes "request"
filter {
  if ("aws-lb" in [tags]) {
    grok {
      match => { "message" => "^%{TIMESTAMP_ISO8601:timestamp} %{HOSTNAME:lb-name} %{IPORHOST:remoteaddr}:%{POSINT} %{IPORHOST:correlation}:%{POSINT} %{BASE10NUM:req-time} %{BASE10NUM:backend-time} %{BASE10NUM:rsp-time} %{NUMBER:lb-code} %{NUMBER:statuscode} %{NUMBER:requestsize} %{NUMBER:responsesize} \"%{WORD:method} %{URIPROTO:protocol}://(?:%{USER}(?::[^@]*)?@)?(?:%{IPORHOST:signature}(?::%{POSINT})?)?(?:%{NOTSPACE:uri}) HTTP/%{NUMBER:version}\"%{SPACE}$" }
      add_tag => [ "aws-lb-entry" ]
    }
    date {
      match => [ "timestamp", "ISO8601" ]
      add_tag => [ "aws-lb-access-timestamp" ]
      remove_field => [ "timestamp" ]
    }
    if ("aws-lb-entry" in [tags])
    and ("aws-lb-access-timestamp" in [tags]) {
      mutate {
        convert => [
          "requestsize", "integer",
          "responsesize", "integer",
          "statuscode", "integer",
          "lb-code", "integer",
          "req-time", "float",
          "backend-time", "float",
          "rsp-time", "float"
        ]
        uppercase => [ "method", "protocol" ]
        # The actual agent host is not the syslog agent but rather the LB itself
        update => [ "agenthost", "%{lb-name}" ]
      }
      # total latency = request + backend + response processing times
      # (Logstash 1.x event API — event['field'] access)
      ruby {
        code => "event['nanoseconds'] = ((event['req-time'] + event['backend-time'] + event['rsp-time']) * 1000000000).ceil;
event['duration'] = (event['nanoseconds'] / 1000000).ceil;"
      }
      mutate {
        # remove no longer needed fields
        remove_field => [ "req-time", "backend-time", "rsp-time", "lb-code", "host", "lb-name" ]
        # remove helper tags.
        # NOTE: the original config also listed "timestamp" here, but that
        # was a field (already removed by the date filter above), never a
        # tag — the bogus entry was dropped
        remove_tag => [ "aws-lb", "aws-lb-entry", "aws-lb-access-timestamp" ]
        # mark the message as complete
        add_tag => [ "http-access-ready" ]
      }
    } else {
      mutate {
        add_tag => [ "aws-lb-access-parse-error" ]
      }
    }
  }
}
# deconstruct the URI into path, filename, filetype and query
# Runs only on events already marked complete ("http-access-ready") that
# carry a non-empty [uri] field.
filter {
if ("http-access-ready" in [tags]) and ([uri] =~ /.+/) {
# Split "path?query" — NOTE(review): GREEDYDATA is greedy, so a literal
# '?' inside the query string would end up in [path]; confirm acceptable
grok {
match => { "uri" => "^%{GREEDYDATA:path}\?%{GREEDYDATA:query}$" }
add_tag => [ "uri-path-and-query" ]
}
# if don't have both then assume no query
if ("uri-path-and-query" not in [tags]) {
mutate {
add_field => [
"path", "%{uri}"
]
remove_tag => [ "_grokparsefailure" ]
}
} else {
mutate {
remove_tag => [ "uri-path-and-query" ]
}
}
# Split the path into directory part, filename and extension (filetype)
grok {
match => { "path" => "^%{GREEDYDATA:pathPart}/%{GREEDYDATA:filename}\.(?<filetype>[a-zA-Z0-9]+)$" }
# NOTE(review): "uri-path-and-query" was already removed by both branches
# above, so this remove_tag is redundant (harmless no-op)
remove_tag => [ "uri-path-and-query" ]
add_tag => [ "file-name-and-type" ]
}
if ("file-name-and-type" in [tags]) {
# pathPart may be empty when the file lives at the root ("/index.html")
if [pathPart] {
mutate {
replace => [ "path", "%{pathPart}" ]
remove_field => [ "pathPart" ]
remove_tag => [ "file-name-and-type" ]
}
} else {
mutate {
replace => [ "path", "/" ]
remove_tag => [ "file-name-and-type" ]
}
}
} else {
# no filename/extension in the path — not an error, just clear the grok failure
mutate {
remove_tag => [ "_grokparsefailure" ]
}
}
}
}
# resolve GeoIP information
# Looks up [remoteaddr] and stores only lat/lon under [location], renamed
# to match the Elasticsearch geo_point convention ({lat, lon}).
filter {
if [remoteaddr] {
geoip {
source => "remoteaddr"
target => "location"
fields => [ "latitude", "longitude" ]
add_tag => [ "geoip-resolved" ]
}
}
# the tag is only present when the lookup above succeeded
if ("geoip-resolved" in [tags]) {
mutate {
# for some reason the 'geoip' filter adds this field even though we have not asked for it
remove_field => [ "[location][location]" ]
# normalize names to match Elasticsearch
rename => [
"[location][latitude]" , "[location][lat]",
"[location][longitude]" , "[location][lon]"
]
remove_tag => [ "geoip-resolved" ]
}
}
}
# if all ready then remove all non-essential fields and create canonical form
filter {
  if ("http-access-ready" in [tags]) {
    mutate {
      add_field => [
        "origin", "accesslog",
        "flavor", "http"
      ]
      remove_field => [
        "@version",
        "enterprise",
        "access"
      ]
      rename => [
        "message", "label"
      ]
      # sometimes an OK match can leave this as a leftover
      remove_tag => [ "_grokparsefailure" ]
    }
    # NOTE: the commented-out ruby filters below were originally nested
    # INSIDE the mutate block above; a ruby filter cannot appear inside
    # mutate, so they were moved here (filter level) where they would be
    # valid if ever uncommented
    #ruby {
    #  code => "event['timevaluemsec'] = (event['@timestamp'].to_f * 1000.0).to_i;"
    #}
    # TODO re-format '@timestamp' field to contain GMT offset instead of 'Z'
    #ruby {
    #  code => "event['@timestamp'] = event['@timestamp'].local('-08:00')"
    #}
  }
}
# Fully-parsed events ("http-access-ready") go to stdout in rubydebug form;
# everything else (parse errors, unmatched flavors) is dumped as JSON lines
# so failures remain visible. An elasticsearch_http output (the pre-2.0
# HTTP-protocol ES plugin) is sketched out but disabled.
output {
if ("http-access-ready" in [tags]) {
stdout {
# codec => json_lines {}
codec => rubydebug {}
}
# elasticsearch_http {
# host => "localhost"
# # default, but bears re-listing just so we remember it
# port => 9200
# # default, but bears re-listing just so we remember it
# flush_size => 100
# idle_flush_time => 15
# index => "%{tenant}"
# index_type => "%{flavor}"
# manage_template => false
# template_name => "logstash"
# # default, but bears re-listing just so we remember it
# template_overwrite => false
# }
# } else if ("auth-access" in [tags]) {
# file {
# codec => "plain"
# flush_interval => 5
# gzip => false
# message_format => "%{message}"
# path => "/var/log/f2bauth.log"
# }
} else {
stdout {
# codec => rubydebug {}
codec => json_lines {}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment