p0pr0ck5/parse_modsec.pl

## parse_modsec.pl
#!/usr/bin/perl

use strict;
use warnings;

use JSON;

my @valid_directives = qw(SecRule SecAction SecDefaultAction);

sub valid_line {
	my ($line) = @_;

	# the directive must be the first element in the line
	# so if this does not match our whitelist we can't process it
	return if ! grep { $line =~ m/^$_ / } @valid_directives;

	return 1;
}

sub clean_input {
	my ($fh) = @_;

	my (@lines, @line_buf, $multi_line);

	while (<$fh>) {
		my $line = $_;
		chomp $line;

		# ignore comments and blank lines
		next if $line =~ m/^\s*$/;
		next if $line =~ m/^\s*#/;

		# trim whitespace
		$line =~ s/^\s*|\s*$//;

		# merge multi-line directives
		# ex.
		#
		# SecRule \
		#   ARGS \
		#   "foo" \
		#   "t:none, \
		#   block, \
		#   phase:1, \
		#   setvar:tx.foo=bar, \
		#   expirevar:tx.foo=60"
		#
		if ($line =~ m/\s*\\\s*$/) {
			# strip the multi-line ecape and surrounding whitespace
			$line =~ s/\s*\\\s*$//;

			push @line_buf, $line;
		} else {
			# either the end of a multi line directive or a standalone line
			# push the buffer to the return array and clear the buffer
			push @line_buf, $line;

			my $final_line = join ' ', @line_buf;

			push @lines, $final_line if valid_line($final_line);
			@line_buf = ();
		}
	}

	return @lines;
}

# take a line and return an array of tokens representing various rule parts
sub tokenize {
	my ($line) = @_;

	my @tokens;

	# so... this sucks
	# we have to make a few assumptions about our line
	# - tokens are whitespace separated
	# - tokens must be quoted with " if they contain spaces
	# - " chars within tokens must be escaped with \
	my $re_quoted   = qr/^"(.*?(?<!\\))"/;
	my $re_unquoted = qr/([^ ]+)/;

	# walk the given string and grab the next token
	# which may be either quoted or unquoted
	# from there, push the token to our list of fields
	# and strip it from the input line
	while ($line =~ $re_quoted || $line =~ $re_unquoted) {
		my $token = $1;
		push @tokens, $token;
		$line =~ s/"?\Q$token\E"?//;
		$line =~ s/^\s*//;
	}

	return @tokens;
}

# take an array of rule parts and return a hashref of parsed tokens
sub parse_tokens {
	my (@tokens) = @_;

	my ($entry, $directive, $vars, $operator, $opts);
	$entry = {};

	$directive = shift @tokens;
	if ($directive eq 'SecRule') {
		$vars     = shift @tokens;
		$operator = shift @tokens;
	}
	$opts = shift @tokens;

	die "Uh oh! We shouldn't have any fields left but we still have @tokens\n"
		if @tokens;

	$entry->{directive} = $directive;
	$entry->{vars}      = parse_vars($vars) if $vars;
	$entry->{operator}  = parse_operator($operator) if $operator;
	$entry->{opts}      = parse_options($opts) if $opts;

	return $entry;
}

sub parse_vars {
	my ($raw_var) = @_;

	my @vars = split '\|', $raw_var;

	my @parsed_vars;

	for my $var (@vars) {
		# variables may take a few forms
		# ! and & are optional metacharacters (mutually exclusive)
		# an optional ':foo' element may also exist

		my ($var, @rest) = split ':', $var;

		my $modifier = substr $var, 0, 1;
		my $specific = join ':', @rest;

		my $parsed = {};

		$parsed->{variable} = $var;
		$parsed->{specific} = $specific if $specific;

		if ($modifier =~ m/[&!]/) {
			$parsed->{modifier} = $modifier;
		}

		push @parsed_vars, $parsed;
	}

	return \@parsed_vars;
}

sub parse_operator {
	my ($raw_operator) = @_;

	# operators may be defined by the @ symbol
	# if one isnt' defined, 'rx' is the default
	# everything following in this token is the pattern
	#
	# using a regex here makes the parser a little more flexible
	# we could split on space, but that breaks if the operator
	# is not single space separated from the pattern, and splitting
	# on \s+ isn't possible because that could break the pattern
	# when joining back together
	$raw_operator =~ m/^\s*(?:(\!)?\@([a-zA-Z]+)\s+)?(.*)$/;

	my $negated  = $1;
	my $operator = $2 ? $2 : 'rx';
	my $pattern  = $3;

	my $parsed = {};

	$parsed->{negated}  = $negated if $negated;
	$parsed->{operator} = $operator;
	$parsed->{pattern}  = $pattern;

	return $parsed;
}

sub parse_options {
	my ($raw_options) = @_;

	my (@tokens, @parsed_options, @opt_buf, $sentinal);
	my @split_options = split ',', $raw_options;

	# options may take one of a few forms
	# standalone: deny
	# express a value: phase:1
	# express a quoted value: msg:'foo bar'
	#
	# because the quoted value in something like msg or logdata
	# may have commas, we cant simply split on comma
	# so we need to loop through and piece together tokens
	while (@split_options) {
		# take a chunk and add it the buffer array
		# once we know we've reached the end of an
		# option, we'll put the buffer elements
		# back together and add it to the final array
		my $chunk = shift @split_options;
		push @opt_buf, $chunk;

		# we're done chaining together chunks if:
		#
		# - we didnt have the potential to split
		#   meaning that the chunk didnt have a : or '
		#   and that the first member of the buffer
		#   did contain a '
		#
		# OR
		#
		# - we could have split but we know we're done
		# (we know this if the last member of the chunk is a ')
		$sentinal = 1 if (($chunk !~ m/'/ || $chunk !~ m/:/) &&
			! (scalar @opt_buf > 1 && $opt_buf[0] =~ m/'/)) ||
			$chunk =~ m/'$/;

		if ($sentinal) {
			push @tokens, join ',', @opt_buf;
			@opt_buf  = ();
			$sentinal = 0;
		}
	}

	# great, now that we have proper tokens
	# we can split any potential key value pairs
	# and add them to the final array
	for my $token (@tokens) {
		my ($opt, @value) = split /:/, $token;

		my $parsed = {};

		$parsed->{opt}   = $opt;
		$parsed->{value} = strip_encap_quotes(join ':', @value) if @value;

		push @parsed_options, $parsed;
	}

	return \@parsed_options;
}

sub strip_encap_quotes {
	my ($line) = @_;

	$line =~ s/^(['"])(.*)\1$/$2/;

	return $line;
}

sub main {
	my $fh = *STDIN;

	my @cleaned_lines = clean_input($fh);
	my @parsed_lines = map { parse_tokens(tokenize($_)) } @cleaned_lines;

	print to_json(\@parsed_lines);
}

main();
	#!/usr/bin/perl

	use strict;
	use warnings;

	use JSON;

	my @valid_directives = qw(SecRule SecAction SecDefaultAction);

	sub valid_line {
	my ($line) = @_;

	# the directive must be the first element in the line
	# so if this does not match our whitelist we can't process it
	return if ! grep { $line =~ m/^$_ / } @valid_directives;

	return 1;
	}

	sub clean_input {
	my ($fh) = @_;

	my (@lines, @line_buf, $multi_line);

	while (<$fh>) {
	my $line = $_;
	chomp $line;

	# ignore comments and blank lines
	next if $line =~ m/^\s*$/;
	next if $line =~ m/^\s*#/;

	# trim whitespace
	$line =~ s/^\s\|\s$//;

	# merge multi-line directives
	# ex.
	#
	# SecRule \
	# ARGS \
	# "foo" \
	# "t:none, \
	# block, \
	# phase:1, \
	# setvar:tx.foo=bar, \
	# expirevar:tx.foo=60"
	#
	if ($line =~ m/\s\\\s$/) {
	# strip the multi-line ecape and surrounding whitespace
	$line =~ s/\s\\\s$//;

	push @line_buf, $line;
	} else {
	# either the end of a multi line directive or a standalone line
	# push the buffer to the return array and clear the buffer
	push @line_buf, $line;

	my $final_line = join ' ', @line_buf;

	push @lines, $final_line if valid_line($final_line);
	@line_buf = ();
	}
	}

	return @lines;
	}

	# take a line and return an array of tokens representing various rule parts
	sub tokenize {
	my ($line) = @_;

	my @tokens;

	# so... this sucks
	# we have to make a few assumptions about our line
	# - tokens are whitespace separated
	# - tokens must be quoted with " if they contain spaces
	# - " chars within tokens must be escaped with \
	my $re_quoted = qr/^"(.*?(?<!\\))"/;
	my $re_unquoted = qr/([^ ]+)/;

	# walk the given string and grab the next token
	# which may be either quoted or unquoted
	# from there, push the token to our list of fields
	# and strip it from the input line
	while ($line =~ $re_quoted \|\| $line =~ $re_unquoted) {
	my $token = $1;
	push @tokens, $token;
	$line =~ s/"?\Q$token\E"?//;
	$line =~ s/^\s*//;
	}

	return @tokens;
	}

	# take an array of rule parts and return a hashref of parsed tokens
	sub parse_tokens {
	my (@tokens) = @_;

	my ($entry, $directive, $vars, $operator, $opts);
	$entry = {};

	$directive = shift @tokens;
	if ($directive eq 'SecRule') {
	$vars = shift @tokens;
	$operator = shift @tokens;
	}
	$opts = shift @tokens;

	die "Uh oh! We shouldn't have any fields left but we still have @tokens\n"
	if @tokens;

	$entry->{directive} = $directive;
	$entry->{vars} = parse_vars($vars) if $vars;
	$entry->{operator} = parse_operator($operator) if $operator;
	$entry->{opts} = parse_options($opts) if $opts;

	return $entry;
	}

	sub parse_vars {
	my ($raw_var) = @_;

	my @vars = split '\\|', $raw_var;

	my @parsed_vars;

	for my $var (@vars) {
	# variables may take a few forms
	# ! and & are optional metacharacters (mutually exclusive)
	# an optional ':foo' element may also exist

	my ($var, @rest) = split ':', $var;

	my $modifier = substr $var, 0, 1;
	my $specific = join ':', @rest;

	my $parsed = {};

	$parsed->{variable} = $var;
	$parsed->{specific} = $specific if $specific;

	if ($modifier =~ m/[&!]/) {
	$parsed->{modifier} = $modifier;
	}

	push @parsed_vars, $parsed;
	}

	return \@parsed_vars;
	}

	sub parse_operator {
	my ($raw_operator) = @_;

	# operators may be defined by the @ symbol
	# if one isnt' defined, 'rx' is the default
	# everything following in this token is the pattern
	#
	# using a regex here makes the parser a little more flexible
	# we could split on space, but that breaks if the operator
	# is not single space separated from the pattern, and splitting
	# on \s+ isn't possible because that could break the pattern
	# when joining back together
	$raw_operator =~ m/^\s(?:(\!)?\@([a-zA-Z]+)\s+)?(.)$/;

	my $negated = $1;
	my $operator = $2 ? $2 : 'rx';
	my $pattern = $3;

	my $parsed = {};

	$parsed->{negated} = $negated if $negated;
	$parsed->{operator} = $operator;
	$parsed->{pattern} = $pattern;

	return $parsed;
	}

	sub parse_options {
	my ($raw_options) = @_;

	my (@tokens, @parsed_options, @opt_buf, $sentinal);
	my @split_options = split ',', $raw_options;

	# options may take one of a few forms
	# standalone: deny
	# express a value: phase:1
	# express a quoted value: msg:'foo bar'
	#
	# because the quoted value in something like msg or logdata
	# may have commas, we cant simply split on comma
	# so we need to loop through and piece together tokens
	while (@split_options) {
	# take a chunk and add it the buffer array
	# once we know we've reached the end of an
	# option, we'll put the buffer elements
	# back together and add it to the final array
	my $chunk = shift @split_options;
	push @opt_buf, $chunk;

	# we're done chaining together chunks if:
	#
	# - we didnt have the potential to split
	# meaning that the chunk didnt have a : or '
	# and that the first member of the buffer
	# did contain a '
	#
	# OR
	#
	# - we could have split but we know we're done
	# (we know this if the last member of the chunk is a ')
	$sentinal = 1 if (($chunk !~ m/'/ \|\| $chunk !~ m/:/) &&
	! (scalar @opt_buf > 1 && $opt_buf[0] =~ m/'/)) \|\|
	$chunk =~ m/'$/;

	if ($sentinal) {
	push @tokens, join ',', @opt_buf;
	@opt_buf = ();
	$sentinal = 0;
	}
	}

	# great, now that we have proper tokens
	# we can split any potential key value pairs
	# and add them to the final array
	for my $token (@tokens) {
	my ($opt, @value) = split /:/, $token;

	my $parsed = {};

	$parsed->{opt} = $opt;
	$parsed->{value} = strip_encap_quotes(join ':', @value) if @value;

	push @parsed_options, $parsed;
	}

	return \@parsed_options;
	}

	sub strip_encap_quotes {
	my ($line) = @_;

	$line =~ s/^(['"])(.*)\1$/$2/;

	return $line;
	}

	sub main {
	my $fh = *STDIN;

	my @cleaned_lines = clean_input($fh);
	my @parsed_lines = map { parse_tokens(tokenize($_)) } @cleaned_lines;

	print to_json(\@parsed_lines);
	}

	main();