Skip to content

Instantly share code, notes, and snippets.

@sahandevs
Created October 19, 2021 14:10
Show Gist options
  • Save sahandevs/13851a26bbf68aa68543642972d0b81e to your computer and use it in GitHub Desktop.
Save sahandevs/13851a26bbf68aa68543642972d0b81e to your computer and use it in GitHub Desktop.
use super::{ast, parser};
use lexgen::lexer;
use std::iter::Peekable;
/// Mutable state threaded through the generated [`_Lexer`], used to pass
/// information between lexer rules (line continuations, quoting context,
/// and format-string exit conditions).
#[derive(Default, Clone)]
pub struct LexerState {
/// Rule to resume after a `\`-newline line continuation is consumed.
linebreak_next_rule: Option<_LexerRule>,
/// setvar:'abc=+1' <= true
/// setvar:abc=+1 <= false
wrapped_in_single_quotation: bool,
/// Nesting depth of `%{...}` macro expansions inside a format string.
macro_expansion_depth: usize,
// FormatString states:
/// chars that break the format string
/// second parameter is the rule to switch to
fstr_exit_points: Vec<(Token, _LexerRule)>,
/// is the next exit point escaped or not
fstr_skip_next_exit_point: bool,
}
lexer! {
// Generated lexer `_Lexer`: carries a `LexerState` and yields `Token`s.
pub _Lexer(LexerState) -> Token;
let ident = ($$alphanumeric | ['_'])+;
let whitespace = [' ' '\t'];
// Top level: dispatch on the directive keyword starting a line.
rule Init {
// whitespace skipping
($whitespace | '\n')+,
// comment
"#" => |lexer| { lexer.switch(_LexerRule::Comment) },
"SecMarker" => |lexer| { lexer.switch_and_return(_LexerRule::SecMarkerName, Token::SecMarker) },
"SecDefaultAction" => |lexer| { lexer.switch_and_return(_LexerRule::Actions, Token::SecDefaultAction) },
"SecAction" => |lexer| { lexer.switch_and_return(_LexerRule::Actions, Token::SecAction) },
"SecRule" => |lexer| { lexer.switch_and_return(_LexerRule::RuleVarsStart, Token::SecRule) },
_, // TODO: ignore unknown directives
}
// First variable of a `SecRule` directive (e.g. `SecRule ARGS ...`).
rule RuleVarsStart {
$whitespace+,
"!" = Token::EXC,
"&" = Token::AMP,
// XML gets its own rule because `XML:/...` is path-like, not a regex.
"XML" => |lexer| {
lexer.switch_and_return(_LexerRule::RuleVarsXMLStart, Token::Ident("XML".into()))
},
$ident => |lexer| {
let s = lexer.match_();
lexer.switch_and_return(_LexerRule::RuleVars, Token::Ident(s.into()))
},
}
// Remaining variables of a `SecRule` (pipe-separated list with modifiers).
rule RuleVars {
"!" = Token::EXC,
"&" = Token::AMP,
"|" = Token::PIPE,
":" = Token::COLON,
// only XML:/a is special -_-.
// isn't modsec lovely?
"XML" => |lexer| {
lexer.switch_and_return(_LexerRule::RuleVarsXMLStart, Token::Ident("XML".into()))
},
// `/regex/` selector: lex the body as a format string that ends at `/`.
"/" => |mut lexer| {
lexer.state().fstr_exit_points.push((Token::FSLASH, _LexerRule::RuleVars));
lexer.switch_and_return(_LexerRule::FormatString, Token::FSLASH)
},
"\"" => |lexer| {
lexer.switch_and_return(_LexerRule::Actions, Token::DQ)
},
$ident => |lexer| {
let s = lexer.match_();
lexer.return_(Token::Ident(s.into()))
},
// A space ends the variable list; the operator part follows.
" " => |lexer| {
lexer.switch_and_reset_match(_LexerRule::SecRuleOperator)
},
}
// After `XML`, expect `:` then `/`, e.g. `XML:/*`.
rule RuleVarsXMLStart {
":" = Token::COLON,
"/" => |lexer| {
lexer.switch_and_return(_LexerRule::RuleVarsXML, Token::FSLASH)
},
}
// Body of an `XML:/...` selector, consumed verbatim until `|` or space.
rule RuleVarsXML {
"|" => |lexer| {
lexer.switch_and_return(_LexerRule::RuleVars, Token::PIPE)
},
" " => |lexer| {
lexer.switch_and_reset_match(_LexerRule::SecRuleOperator)
},
_ => |lexer| {
let s = lexer.match_();
lexer.return_(Token::StringLiteralPart(s.into()))
},
}
// Operator part of a `SecRule`: either a quoted operator or a bare rule ident.
rule SecRuleOperator {
$whitespace+ => |lexer| {
lexer.switch_and_reset_match(_LexerRule::SecRuleOperator)
},
// `\` at end of line: continue the operator on the next line.
"\\" => |mut lexer| {
lexer.state().linebreak_next_rule = Some(_LexerRule::SecRuleOperator);
lexer.switch_and_reset_match(_LexerRule::LineBreak)
},
$ident => |lexer| {
let s = lexer.match_();
lexer.switch_and_return(_LexerRule::Actions, Token::RuleIdent(s.into()))
},
"\"" => |lexer| {
lexer.switch_and_return(_LexerRule::SecRuleOperatorInner, Token::DQ)
},
}
// Inside a quoted operator, e.g. `"!@pm foo bar"`.
rule SecRuleOperatorInner {
"!" = Token::EXC,
"@" = Token::AT,
$ident => |lexer| {
let s = lexer.match_();
lexer.return_(Token::Ident(s.into()))
},
// After the operator name, the rest (up to the closing `"`) is a format string.
" " => |mut lexer| {
lexer.state().fstr_exit_points.push((Token::DQ, _LexerRule::Actions));
lexer.switch_and_reset_match(_LexerRule::FormatString)
},
_ => |mut lexer| {
lexer.state().fstr_exit_points.push((Token::DQ, _LexerRule::Actions));
// we should not reset the match here
lexer.switch(_LexerRule::FormatString)
},
}
// SecDefaultAction "id:1234"
// ^^^^^^^^^^
rule Actions {
$whitespace,
// Line continuation before the opening quote.
['\\'] => |mut lexer| {
lexer.state().linebreak_next_rule = Some(_LexerRule::Actions);
lexer.switch(_LexerRule::LineBreak)
},
'"' => |lexer| { lexer.switch_and_return(_LexerRule::ActionsInner, Token::DQ) },
}
// Inside the quoted action list: keyword actions and `name:arg` actions,
// each `name:` switching to a rule that knows how to lex its argument.
rule ActionsInner {
"capture" = Token::capture,
"block" = Token::block,
"chain" = Token::chain,
"deny" = Token::deny,
"log" = Token::log,
"nolog" = Token::nolog,
"id:" => |lexer| { lexer.switch_and_return(_LexerRule::NumberActionArg, Token::id) },
"rev:" => |lexer| { lexer.switch_and_return(_LexerRule::NumberActionArg, Token::rev) },
"severity:" => |lexer| { lexer.switch_and_return(_LexerRule::NumberActionArg, Token::severity) },
"phase:" => |lexer| { lexer.switch_and_return(_LexerRule::PhaseAction, Token::phase) },
"setvar:" => |lexer| { lexer.switch_and_return(_LexerRule::AssignActionArg, Token::setvar) },
"ctl:" => |lexer| { lexer.switch_and_return(_LexerRule::AssignActionArg, Token::ctl) },
"tag:" => |lexer| { lexer.switch_and_return(_LexerRule::FormatStringActionArg, Token::tag) },
"msg:" => |lexer| { lexer.switch_and_return(_LexerRule::FormatStringActionArg, Token::msg) },
"logdata:" => |lexer| { lexer.switch_and_return(_LexerRule::FormatStringActionArg, Token::logdata) },
"t:" => |lexer| { lexer.switch_and_return(_LexerRule::IdentActionArg, Token::t) },
// next action
// NOTE: before next action starts, we can have whitespace and linebreaks
"," = Token::COMMA,
$whitespace* '\\' => |mut lexer| {
lexer.state().linebreak_next_rule = Some(_LexerRule::ActionsInner);
lexer.switch_and_reset_match(_LexerRule::LineBreak)
},
// after the action part of a directive, directive is finished
'"' => |lexer| { lexer.switch_and_return(_LexerRule::Init, Token::DQ) },
}
// Argument of `t:` — a single transformation identifier.
rule IdentActionArg {
$ident => |lexer| {
let ident = lexer.match_().to_string();
lexer.switch_and_return(_LexerRule::ActionsInner, Token::Ident(ident))
},
}
// Argument of `setvar:!...` — the variable path being removed.
rule RemoveVarArg {
"%{" = Token::VarOpen,
"}" = Token::VarClose,
"." = Token::DOT,
$ident => |lexer| {
let ident = lexer.match_().to_string();
lexer.return_(Token::Ident(ident))
},
// Closing `"` ends the whole action list / directive.
"\"" => |lexer| {
lexer.switch_and_return(_LexerRule::Init, Token::DQ)
},
"," => |lexer| {
lexer.switch_and_return(_LexerRule::ActionsInner, Token::COMMA)
},
}
// Argument of `tag:` / `msg:` / `logdata:` — optionally single-quoted text.
rule FormatStringActionArg {
"'" => |mut lexer| {
lexer.state().wrapped_in_single_quotation = true;
lexer.state().fstr_exit_points.push((Token::SQ, _LexerRule::ActionsInner));
lexer.switch_and_return(_LexerRule::FormatString, Token::SQ)
},
_ => |mut lexer| {
// It's important here not to reset the match so the [`_LexerRule::FormatString`]
// can capture it.
lexer.state().fstr_exit_points.push((Token::DQ, _LexerRule::Init));
if !lexer.state().wrapped_in_single_quotation {
lexer.state().fstr_exit_points.push((Token::COMMA, _LexerRule::ActionsInner));
}
lexer.switch(_LexerRule::FormatString)
},
}
// Argument of `setvar:` / `ctl:` — `[!]var.path[=value]` with optional quotes.
rule AssignActionArg {
"!" $whitespace* => |lexer| {
lexer.switch_and_return(_LexerRule::RemoveVarArg, Token::EXC)
},
"'" => |mut lexer| {
lexer.state().wrapped_in_single_quotation = true;
lexer.state().fstr_exit_points.push((Token::SQ, _LexerRule::ActionsInner));
lexer.return_(Token::SQ)
},
// Assignment operator: the value that follows is a format string whose
// terminators depend on whether the whole arg was single-quoted.
"=" | "=+" | "=-" => |mut lexer| {
let s = lexer.match_();
let token = match s {
"=" => Token::Assign,
"=+" => Token::IncAssign,
"=-" => Token::DecAssign,
_ => unreachable!()
};
lexer.state().fstr_exit_points.push((Token::DQ, _LexerRule::Init));
if !lexer.state().wrapped_in_single_quotation {
lexer.state().fstr_exit_points.push((Token::COMMA, _LexerRule::ActionsInner));
}
lexer.switch_and_return(_LexerRule::FormatString, token)
},
"%{" = Token::VarOpen,
"}" = Token::VarClose,
"." = Token::DOT,
$ident => |lexer| {
let ident = lexer.match_().to_string();
lexer.return_(Token::Ident(ident))
},
}
// Free text with `%{...}` macro expansions; which character terminates it is
// driven by `fstr_exit_points` pushed by the rule that entered this one.
rule FormatString {
// Common escape sequences pass through verbatim.
("\\n" | "\\t" | "\\\\") => |lexer| {
let s = lexer.match_();
lexer.return_(Token::StringLiteralPart(s.into()))
},
// A backslash may only escape one of the registered exit characters.
"\\" => |mut lexer| {
if let Some(next_char) = lexer.peek() {
let next_char = next_char.to_string();
let exits = &lexer.state().fstr_exit_points;
for (exit, _) in exits {
if exit == &next_char.as_str() {
lexer.state().fstr_skip_next_exit_point = true;
return lexer.switch_and_reset_match(_LexerRule::FormatString);
}
}
lexer.return_(Token::__Failed(format!("cannot escape'{}'", next_char)))
} else {
lexer.return_(Token::__Failed("cannot escape <EOF>".into()))
}
},
"%{" => |lexer| {
lexer.switch_and_return(_LexerRule::FormatStringMacroExpansion, Token::VarOpen)
},
"\n" = Token::__Failed("newline is not allowed here".into()),
_ => |mut lexer| {
let s = lexer.match_();
let token = Token::StringLiteralPart(s.into());
// An escaped exit character is emitted as a literal instead of exiting.
let skip_exit_point = &mut lexer.state().fstr_skip_next_exit_point;
if *skip_exit_point {
*skip_exit_point = false;
return lexer.return_(token);
}
// check exit points
let exits = &lexer.state().fstr_exit_points;
let mut is_exit = None;
for (exit, exit_rule) in exits {
if exit == &s {
is_exit = Some((exit_rule.clone(), exit.clone()));
break;
}
}
if let Some((exit_rule, token)) = is_exit {
lexer.state().fstr_exit_points.clear();
lexer.state().wrapped_in_single_quotation = false;
return lexer.switch_and_return(exit_rule, token);
} else {
lexer.return_(Token::StringLiteralPart(s.into()))
}
},
}
// Inside `%{...}`; `macro_expansion_depth` tracks nested `%{` so only the
// matching outermost `}` returns to FormatString.
rule FormatStringMacroExpansion {
"%{" => |mut lexer| {
lexer.state().macro_expansion_depth += 1;
lexer.return_(Token::VarOpen)
},
"}" => |mut lexer| {
if lexer.state().macro_expansion_depth == 0 {
lexer.switch_and_return(_LexerRule::FormatString, Token::VarClose)
} else {
lexer.state().macro_expansion_depth -= 1;
lexer.return_(Token::VarClose)
}
},
"." = Token::DOT,
$ident => |lexer| {
let ident = lexer.match_().to_string();
lexer.return_(Token::Ident(ident))
},
}
// Argument of `phase:` — a single digit 1-6.
rule PhaseAction {
['1'-'6'] => |lexer| {
let value: ast::ActionPhase = lexer.match_().into();
lexer.switch_and_return(_LexerRule::ActionsInner, Token::PhaseNumber(value))
},
}
// Argument of `id:` / `rev:` / `severity:` — a decimal integer.
rule NumberActionArg {
$$numeric+ => |lexer| {
let value_str = lexer.match_();
let value = value_str.parse();
if let Ok(value) = value {
lexer.switch_and_return(_LexerRule::ActionsInner, Token::Number(value))
} else {
lexer.return_(Token::__Failed(format!("cannot parse {} into isize", value_str)))
}
},
}
// SecMarker test
// ^^^^
// SecMarker "test"
// ^^^^
rule SecMarkerName {
// wait for first character
$whitespace => |lexer| {
lexer.switch_and_reset_match(_LexerRule::SecMarkerName)
},
// it's a marker name with '"'
"\"" => |lexer| {
let _ = lexer.match_();
lexer.switch_and_return(_LexerRule::SecMarkerNameDQ, Token::DQ)
},
$ident => |lexer| {
let ident = lexer.match_().into();
lexer.switch_and_return(_LexerRule::Init, Token::Ident(ident))
},
}
// SecMarker "ident"
// ^^^^^
rule SecMarkerNameDQ {
"\"" => |lexer| { lexer.switch_and_return(_LexerRule::Init, Token::DQ) },
$ident => |lexer| {
let ident = lexer.match_().into();
lexer.return_(Token::Ident(ident))
},
}
// After a `\` line continuation: skip trailing whitespace, the newline, and
// leading whitespace of the next line, then resume `linebreak_next_rule`.
rule LineBreak {
$whitespace => |lexer| { lexer.switch_and_reset_match(_LexerRule::LineBreak) },
'\n' $whitespace* => |mut lexer| {
if let Some(target) = lexer.state().linebreak_next_rule.take() {
lexer.switch_and_reset_match(target)
} else {
lexer.return_(Token::__Failed("linebreak_next_rule cannot be empty in this state".into()))
}
},
}
// `#` comment: swallow everything up to the end of the line.
rule Comment {
"\n" => |lexer| {
lexer.switch(_LexerRule::Init)
},
_,
}
}
impl PartialEq<&str> for Token {
    /// Compares a punctuation token against its literal spelling; used when
    /// checking format-string exit points. Only `DQ`, `SQ`, `FSLASH`, and
    /// `COMMA` have a spelling; every other variant compares unequal.
    fn eq(&self, other: &&str) -> bool {
        matches!(
            (self, *other),
            (Self::DQ, "\"") | (Self::SQ, "'") | (Self::FSLASH, "/") | (Self::COMMA, ",")
        )
    }
}
// Action variants are intentionally lowercase so they mirror the ModSecurity
// action names verbatim.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Token {
Ident(String),
// special token for situations like SecRule VAR test "log"
RuleIdent(String),
// one piece of free text; consecutive parts are merged by [`Lexer`]
StringLiteralPart(String),
Number(isize),
// keywords
SecMarker,
SecDefaultAction,
SecAction,
SecRule,
// actions
chain,
setvar,
deny,
log,
nolog,
phase,
PhaseNumber(ast::ActionPhase),
pass,
id,
msg,
capture,
block,
logdata,
ctl,
t,
rev,
severity,
tag,
// special chars
SQ, // '
DQ, // "
AT, // @
EXC, // !
DOT, // .
COMMA, // ,
PIPE, // |
AMP, // &
COLON, // :
FSLASH, // /
// others
VarOpen, // %{
VarClose, // }
Assign, // =
IncAssign, // =+
DecAssign, // =-
// error
__Failed(String),
}
impl Token {
    /// Returns the owned text carried by [`Token::Ident`], [`Token::RuleIdent`],
    /// or [`Token::StringLiteralPart`]; any other variant yields `""`.
    pub fn to_string(&self) -> String {
        match self {
            Token::Ident(x) | Token::RuleIdent(x) | Token::StringLiteralPart(x) => x.clone(),
            _ => String::new(),
        }
    }
    /// Returns the value carried by [`Token::Number`], defaulting to `0`.
    pub fn to_isize(&self) -> isize {
        if let Token::Number(x) = self {
            *x
        } else {
            0
        }
    }
    /// Returns the phase carried by [`Token::PhaseNumber`], defaulting to
    /// [`ast::ActionPhase::ReqBody`].
    pub fn to_action_phase(&self) -> ast::ActionPhase {
        if let Token::PhaseNumber(x) = self {
            x.clone()
        } else {
            ast::ActionPhase::ReqBody
        }
    }
}
/// util type for merging the [`Token::StringLiteralPart`]s to a single [`Token::StringLiteralPart`]
pub struct Lexer<'a> {
// peekable so `next` can look ahead for further literal parts to absorb
lexer: Peekable<_Lexer<'a>>,
}
impl<'a> Lexer<'a> {
    /// Builds a merging lexer over `input`, wrapping the generated `_Lexer`
    /// in a [`Peekable`] so adjacent string-literal parts can be coalesced.
    pub fn new(input: &'a str) -> Lexer<'a> {
        let inner = _Lexer::new(input).peekable();
        Lexer { lexer: inner }
    }
}
impl<'a> Iterator for Lexer<'a> {
    type Item = Result<(usize, Token, usize), parser::UserParseError>;

    /// Yields the next token, converting lexer errors into
    /// [`parser::UserParseError::LexerError`] and merging runs of consecutive
    /// [`Token::StringLiteralPart`]s into a single part whose span runs from
    /// the first part's start to the last part's end.
    fn next(&mut self) -> Option<Self::Item> {
        // `map_err` replaces the original `if let Err … else unwrap` dance.
        let next = self
            .lexer
            .next()?
            .map_err(parser::UserParseError::LexerError);
        if let Ok((start, Token::StringLiteralPart(part), end)) = next {
            let mut end = end;
            let mut part = part;
            // `peek` guarantees the following `next` is `Some(Ok(StringLiteralPart(..)))`.
            // Deliberately NOT using `?` here: propagating `None` mid-merge would
            // silently drop the token accumulated so far.
            while let Some(Ok((_, Token::StringLiteralPart(_), _))) = self.lexer.peek() {
                if let Some(Ok((_, Token::StringLiteralPart(next_part), next_end))) =
                    self.lexer.next()
                {
                    end = next_end;
                    part.push_str(&next_part);
                }
            }
            Some(Ok((start, Token::StringLiteralPart(part), end)))
        } else {
            Some(next)
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::modsec::parser::ast::ActionPhase;
use pretty_assertions::assert_eq;
fn test(input: &str, expected: &[Token]) {
let tokens: Vec<_> = Lexer::new(input)
.map(|e| {
if let Err(parser::UserParseError::LexerError(e)) = e {
let before = &input[..e.char_idx - 1];
let chars = input.chars().collect::<Vec<_>>();
let char = chars.get(e.char_idx - 1).unwrap();
let after = &input[e.char_idx..];
panic!("\n{:#?}\nchar:{}<{}>{}", e, before, char, after);
} else {
e.unwrap().1
}
})
.collect();
assert_eq!(tokens, expected);
}
#[test]
fn test_lexer() {
use Token::*;
test(
r#"
# comment
# comment
# a b
SecMarker test
SecMarker "b"
"#,
&[
SecMarker,
Ident("test".into()),
SecMarker,
DQ,
Ident("b".into()),
DQ,
],
);
test(
r#"
SecDefaultAction "phase:1,deny,log"
SecDefaultAction \
"phase:2,\
deny,log"
"#,
&[
SecDefaultAction,
DQ,
phase,
PhaseNumber(ActionPhase::ReqHeader),
COMMA,
deny,
COMMA,
log,
DQ,
SecDefaultAction,
DQ,
phase,
PhaseNumber(ActionPhase::ReqBody),
COMMA,
deny,
COMMA,
log,
DQ,
],
);
test(
r#"
SecDefaultAction "setvar:a=tests',setvar:'b.a=+hello,',log"
"#,
&[
SecDefaultAction,
DQ,
setvar,
Ident("a".into()),
Assign,
StringLiteralPart("tests'".into()),
COMMA,
setvar,
SQ,
Ident("b".into()),
DOT,
Ident("a".into()),
IncAssign,
StringLiteralPart("hello,".into()),
SQ,
COMMA,
log,
DQ,
],
);
test(
r#"
SecDefaultAction "setvar:a.b.%{c.%{d.e}.f.g=tests"
"#,
&[
SecDefaultAction,
DQ,
setvar,
Ident("a".into()),
DOT,
Ident("b".into()),
DOT,
VarOpen,
Ident("c".into()),
DOT,
VarOpen,
Ident("d".into()),
DOT,
Ident("e".into()),
VarClose,
DOT,
Ident("f".into()),
DOT,
Ident("g".into()),
Assign,
StringLiteralPart("tests".into()),
DQ,
],
);
test(
r#"
SecAction \
"id:212340,\
msg:'COMODO WAF: Cross-site Scripting (XSS) Attack||%{tx.domain}|%{tx.mode}|2',\
msg:test,\
phase:2,capture,block,\
setvar:'tx.xss_points=+%{tx.points_limit4}',\
setvar:'tx.points=+%{tx.points_limit4}',\
logdata:'Matched Data: %{TX.0} found within %{MATCHED_VAR_NAME}: %{MATCHED_VAR}',\
ctl:auditLogParts=+E,\
t:none,t:urlDecodeUni,t:htmlEntityDecode,t:compressWhiteSpace,\
rev:4,severity:2,tag:'CWAF',tag:'XSS',setvar:!tx.a"
"#,
&[
SecAction,
DQ,
id,
Number(212340),
COMMA, //
msg,
SQ,
StringLiteralPart("COMODO WAF: Cross-site Scripting (XSS) Attack||".into()),
VarOpen,
Ident("tx".into()),
DOT,
Ident("domain".into()),
VarClose,
StringLiteralPart("|".into()),
VarOpen,
Ident("tx".into()),
DOT,
Ident("mode".into()),
VarClose,
StringLiteralPart("|2".into()),
SQ,
COMMA, //
msg,
StringLiteralPart("test".into()),
COMMA, //
phase,
PhaseNumber(ActionPhase::ReqBody),
COMMA, //
capture,
COMMA, //
block,
COMMA, //
setvar,
SQ,
Ident("tx".into()),
DOT,
Ident("xss_points".into()),
IncAssign,
VarOpen,
Ident("tx".into()),
DOT,
Ident("points_limit4".into()),
VarClose,
SQ,
COMMA, //
setvar,
SQ,
Ident("tx".into()),
DOT,
Ident("points".into()),
IncAssign,
VarOpen,
Ident("tx".into()),
DOT,
Ident("points_limit4".into()),
VarClose,
SQ,
COMMA,
logdata,
SQ,
StringLiteralPart("Matched Data: ".into()),
VarOpen,
Ident("TX".into()),
DOT,
Ident("0".into()),
VarClose,
StringLiteralPart(" found within ".into()),
VarOpen,
Ident("MATCHED_VAR_NAME".into()),
VarClose,
StringLiteralPart(": ".into()),
VarOpen,
Ident("MATCHED_VAR".into()),
VarClose,
SQ,
COMMA,
ctl,
Ident("auditLogParts".into()),
IncAssign,
StringLiteralPart("E".into()),
COMMA,
t,
Ident("none".into()),
COMMA,
t,
Ident("urlDecodeUni".into()),
COMMA,
t,
Ident("htmlEntityDecode".into()),
COMMA,
t,
Ident("compressWhiteSpace".into()),
COMMA,
rev,
Number(4),
COMMA,
severity,
Number(2),
COMMA,
tag,
SQ,
StringLiteralPart("CWAF".into()),
SQ,
COMMA,
tag,
SQ,
StringLiteralPart("XSS".into()),
SQ,
COMMA,
setvar,
EXC,
Ident("tx".into()),
DOT,
Ident("a".into()),
DQ,
],
);
test(
r#"
SecRule &ARGS|ARGS|ARGS_NAMES|XML:/*|!ARGS:/body\n\t/|!ARGS:/content\/ %{ab}/:test|!XML:/a \
"!@pm document.cookie .parentnode \" \\' \t.innerhtml <!-- --> <![cdata[" \
"log"
"#,
&[
SecRule,
AMP,
Ident("ARGS".into()),
PIPE,
Ident("ARGS".into()),
PIPE,
Ident("ARGS_NAMES".into()),
PIPE,
Ident("XML".into()),
COLON,
FSLASH,
StringLiteralPart("*".into()),
PIPE,
EXC,
Ident("ARGS".into()),
COLON,
FSLASH,
StringLiteralPart("body\\n\\t".into()),
FSLASH,
PIPE,
EXC,
Ident("ARGS".into()),
COLON,
FSLASH,
StringLiteralPart("content/ ".into()),
VarOpen,
Ident("ab".into()),
VarClose,
FSLASH,
COLON,
Ident("test".into()),
PIPE,
EXC,
Ident("XML".into()),
COLON,
FSLASH,
StringLiteralPart("a".into()),
DQ,
EXC,
AT,
Ident("pm".into()),
StringLiteralPart(
" document.cookie .parentnode \" \\\\' \\t.innerhtml <!-- --> <![cdata[".into(),
),
DQ,
DQ,
log,
DQ,
],
);
test(
r#"
SecRule XML:/* test "log"
"#,
&[
SecRule,
Ident("XML".into()),
COLON,
FSLASH,
StringLiteralPart("*".into()),
RuleIdent("test".into()),
DQ,
log,
DQ,
],
);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment