ChristianOellers/RegEx-Snippets - Log analysis + Filtering.sh

## RegEx-Snippets - Log analysis + Filtering.sh
# Exemplary snippets I've been using in my own projects. Use for inspiration and take what you need.
# 'sh' file format is for syntax highlighting only: The RegEx parts should work in many scenarios.
#
# Run all snippets to normalize varying strings by replacing or removing characters (as you need).
# Once all strings are aligned, duplicates can be filtered and lines sorted.
# This leaves the log with a few distinct, unique errors that are to be considered for development.
# RegEx design aims to leave the log syntax intact, e.g., delimiters (,"').
# This might be relevant for proper syntax highlighting or use with advanced analyzers.

# ---

# Date + Time formats
# = 2000-01-01 00:00:00
# = [2000-01-01 00:00:00]
# Three digit runs joined by '-', one whitespace, three digit runs joined by ':'.
# The separators are mandatory, so a bare number pair such as '123 456' is not matched.
\[?\d+(?:-\d+){2}\s\d+(?::\d+){2}\]?

# Serial numbers: three 4-character blocks joined by hyphens
# (\w also accepts digits and '_', not only letters)
# = XXXX-XXXX-XXXX
\w{4}(?:-\w{4}){2}

# Request URI from JSON part
# = "request_uri":"http://..."
# The value is matched up to its closing quote. A greedy '.*' would run on to the
# last comma of the line and swallow every following JSON field.
("request_uri"\s*:\s*"[^"]*"),

# API errors - Domain part
# = Authentication unsuccessful [xxx.xxx.xxx.xxx.COM] ". [] []
# [^\]]+ keeps the match inside a single bracket pair (a greedy '.+' could span
# several). The lookahead requires the following ' "' without consuming it.
(unsuccessful\s\[[^\]]+\])(?=\s")

# SQL errors - Number part
# = ", 111]:  SQLSTATE[23000]
# Matches one whitespace character plus the digits that sit between '",' and ']'.
# Both delimiters are lookarounds: they are not part of the match and therefore
# survive a replace.
(?<=",)\s\d+(?=\])

# Symfony framework views - Number part
# = ::viewExample","id":"1"},"method"
# Only the digits right after the lookbehind are matched. A leading greedy '.*'
# would extend the match to the last digit found anywhere on the line.
(?<=viewExample","id":")\d+

# Filters / Severities
# e.g. for highlighting, prioritizing
# Notes are kept on their own lines: without the extended (x) flag, text that
# follows a pattern on the same line becomes part of the pattern.
\.(debug|info)
\.warn(ing)?
\.critical
# Errors: only lines containing 'XXX', ignoring '404s'
\.error(?=.*XXX)(?!.*404)
# Errors: ignore '404s'
\.error(?!.*404)


## Text - Duplicate word finder (variants).sh
# \1 is a back-reference: it matches the text captured by the 1st group again, which finds duplicated words.

# Simple: \1 repeats whatever group 1 captured (a word plus one whitespace character).
# The bare \N form becomes ambiguous for group numbers above 9.
(\w+\s)\1

# Better: the braced \g{N} form is unambiguous for any group number.
(\w+\s)\g{1}

# Named group: \g{1} and \k<NAME> both refer to the same group here, so this
# only matches three consecutive occurrences of the same word.
(?<NAME>\w+\s)\g{1}\k<NAME>

# ---

# Modifiers (flags): g = global, m = multi-line, i = case-insensitive
gmi

# ---

# Test strings

# = This iS is some text TEXT text with a lot lot of double douBLE words. These should should be removed be BE BE be removed.
# = Testing <B><I>bold italic</I></B> text
# = Testing <B attr="test"><I>bold italic</I></B> text

# Back-references repeat the text captured by the 1st group, which finds duplicated words.
# VARIANTS
# - three occurrences in a row (numbered + named back-reference to the same group)
(?<NAME>\w+\s)\g{1}\k<NAME>
# - two occurrences in a row
(?<CAP>\w+\s)\g{1}


## Text - Token matching algorithms.sh
# Token matching regular expressions - Variants.
# - A trailing $ anchors the match to the end of the string (the last input, i.e. the caret position).

# Finds 'A->BB' in 'Lorem ipsum A->BB dolor'.
# Group 1 is the word before the arrow, group 2 the word after it.
(\w+)->(\w+)

# Matches 'AA->BB' in 'Lorem AA->BB': a word, the arrow, then an optional run of
# word characters, '-' or '+' (group 1) that must end the string.
\w+->([-+\w]*)$

# \B is a non-word-boundary assertion: the ':' must not directly follow a word
# character, so ' :BB' qualifies while 'a:BB' does not.
# - Captures 'BB' (group 1) in 'Lorem :BB'.
\B:([-+\w]*)$

# 1. Find any non-whitespace characters at end.
# 2. Find whitespace chars at end (check after word ends).
# NOTE(review): the slashes look like pattern delimiters (JavaScript/PHP style)
# rather than literal characters - confirm for your engine.
/\S+$/
/\s+$/
	# Exemplary snippets I've been using in my own projects. Use for inspiration and take what you need.
	# 'sh' file format is for syntax highlighting only: The RegEx parts should work in many scenarios.
	#
	# Run all snippets to normalize varying strings by replacing or removing characters (as you need).
	# Once all strings are aligned, duplicates can be filtered and lines sorted.
	# This leaves the log with a few distinct, unique errors that are to be considered for development.
	# RegEx design aims to leave the log syntax intact, e.g., delimiters (,"').
	# This might be relevant for proper syntax highlighting or use with advanced analyzers.

	# ---

	# Date + Time formats
	# = 2000-01-01 00:00:00
	# = [2000-01-01 00:00:00]
	# Three digit runs joined by '-', one whitespace, three digit runs joined by ':'.
	# The separators are mandatory, so a bare number pair such as '123 456' is not matched.
	\[?\d+(?:-\d+){2}\s\d+(?::\d+){2}\]?

	# Serial numbers: three 4-character blocks joined by hyphens
	# (\w also accepts digits and '_', not only letters)
	# = XXXX-XXXX-XXXX
	\w{4}(?:-\w{4}){2}

	# Request URI from JSON part
	# = "request_uri":"http://..."
	# The value is matched up to its closing quote. A greedy '.*' would run on to the
	# last comma of the line and swallow every following JSON field.
	("request_uri"\s*:\s*"[^"]*"),

	# API errors - Domain part
	# = Authentication unsuccessful [xxx.xxx.xxx.xxx.COM] ". [] []
	# [^\]]+ keeps the match inside a single bracket pair (a greedy '.+' could span
	# several). The lookahead requires the following ' "' without consuming it.
	(unsuccessful\s\[[^\]]+\])(?=\s")

	# SQL errors - Number part
	# = ", 111]: SQLSTATE[23000]
	# Matches one whitespace character plus the digits that sit between '",' and ']'.
	# Both delimiters are lookarounds: they are not part of the match and therefore
	# survive a replace.
	(?<=",)\s\d+(?=\])

	# Symfony framework views - Number part
	# = ::viewExample","id":"1"},"method"
	# Only the digits right after the lookbehind are matched. A leading greedy '.*'
	# would extend the match to the last digit found anywhere on the line.
	(?<=viewExample","id":")\d+

	# Filters / Severities
	# e.g. for highlighting, prioritizing
	# Notes are kept on their own lines: without the extended (x) flag, text that
	# follows a pattern on the same line becomes part of the pattern.
	# The alternation uses an unescaped '|' ('\|' would match a literal pipe).
	\.(debug|info)
	\.warn(ing)?
	\.critical
	# Errors: only lines containing 'XXX', ignoring '404s'
	# ('.*' lets the lookaheads scan the rest of the line)
	\.error(?=.*XXX)(?!.*404)
	# Errors: ignore '404s'
	\.error(?!.*404)
	# \1 is a back-reference: it matches the text captured by the 1st group again, which finds duplicated words.

	# Simple: \1 repeats whatever group 1 captured (a word plus one whitespace character).
	# The bare \N form becomes ambiguous for group numbers above 9.
	(\w+\s)\1

	# Better: the braced \g{N} form is unambiguous for any group number.
	(\w+\s)\g{1}

	# Named group: \g{1} and \k<NAME> both refer to the same group here, so this
	# only matches three consecutive occurrences of the same word.
	(?<NAME>\w+\s)\g{1}\k<NAME>

	# ---

	# Modifiers (flags): g = global, m = multi-line, i = case-insensitive
	gmi

	# ---

	# Test strings

	# = This iS is some text TEXT text with a lot lot of double douBLE words. These should should be removed be BE BE be removed.
	# = Testing <B><I>bold italic</I></B> text
	# = Testing <B attr="test"><I>bold italic</I></B> text

	# Back-references repeat the text captured by the 1st group, which finds duplicated words.
	# VARIANTS
	# - three occurrences in a row (numbered + named back-reference to the same group)
	(?<NAME>\w+\s)\g{1}\k<NAME>
	# - two occurrences in a row
	(?<CAP>\w+\s)\g{1}
	# Token matching regular expressions - Variants.
	# - A trailing $ anchors the match to the end of the string (the last input, i.e. the caret position).

	# Finds 'A->BB' in 'Lorem ipsum A->BB dolor'.
	# Group 1 is the word before the arrow, group 2 the word after it.
	(\w+)->(\w+)

	# Matches 'AA->BB' in 'Lorem AA->BB': a word, the arrow, then an optional run of
	# word characters, '-' or '+' (group 1) that must end the string.
	\w+->([-+\w]*)$

	# \B is a non-word-boundary assertion: the ':' must not directly follow a word
	# character, so ' :BB' qualifies while 'a:BB' does not.
	# - Captures 'BB' (group 1) in 'Lorem :BB'.
	\B:([-+\w]*)$

	# 1. Find any non-whitespace characters at end.
	# 2. Find whitespace chars at end (check after word ends).
	# NOTE(review): the slashes look like pattern delimiters (JavaScript/PHP style)
	# rather than literal characters - confirm for your engine.
	/\S+$/
	/\s+$/