Created
August 27, 2023 21:15
-
-
Save mpenick/bb27a64ff1e637c64ada7c9463a935db to your computer and use it in GitHub Desktop.
A quick port of the Python Drain3 log-template miner to Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[package]
name = "drain"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
# Bounded LRU cache used to cap the number of retained log clusters.
lru = "0.11"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright 2023 Michael A. Penick | |
// Permission is hereby granted, free of charge, to any person obtaining a copy of | |
// this software and associated documentation files (the “Software”), to deal in | |
// the Software without restriction, including without limitation the rights to | |
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of | |
// the Software, and to permit persons to whom the Software is furnished to do so, | |
// subject to the following conditions: | |
// | |
// The above copyright notice and this permission notice shall be included in all | |
// copies or substantial portions of the Software. | |
// | |
// THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS | |
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR | |
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER | |
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
use std::{borrow::Cow, collections::HashMap, fmt::Display, num::NonZeroUsize}; | |
/// A cluster of structurally similar log lines, represented by a token
/// template in which variable positions have been replaced by wildcards.
#[derive(Debug)]
pub struct LogCluster<'a> {
    // The template: literal `Token::Value`s plus `Token::Wildcard` at
    // positions that have been observed to vary between matched lines.
    template_tokens: Vec<Token<'a>>,
    // Number of log lines matched into this cluster (starts at 1 on creation).
    match_count: usize,
    // Monotonically increasing id assigned by `LogParser`.
    cluster_id: usize,
}
impl<'a> LogCluster<'a> { | |
pub fn cluster_id(&self) -> usize { | |
self.cluster_id | |
} | |
pub fn match_count(&self) -> usize { | |
self.match_count | |
} | |
fn new(cluster_id: usize, tokens: Vec<&str>) -> Self { | |
Self { | |
template_tokens: tokens | |
.iter() | |
.map(|token| Token::Value(Cow::Owned(token.to_string()))) | |
.collect(), | |
match_count: 1, | |
cluster_id, | |
} | |
} | |
fn seq_dist(&self, tokens: &Tokens) -> (f64, usize) { | |
assert!(self.template_tokens.len() == tokens.len()); | |
if tokens.len() == 0 { | |
return (1.0, 0); | |
} | |
let mut sim_count = 0; | |
let mut param_count = 0; | |
for (token1, token2) in self.template_tokens.iter().zip(tokens.iter()) { | |
match token1 { | |
Token::Wildcard => { | |
param_count += 1; | |
} | |
Token::Value(token1) => { | |
if token1 == token2 { | |
sim_count += 1; | |
} | |
} | |
} | |
} | |
return ( | |
sim_count as f64 / self.template_tokens.len() as f64, | |
param_count, | |
); | |
} | |
fn maybe_update(&mut self, tokens: &Tokens, values: &mut Vec<String>) -> bool { | |
assert!(self.template_tokens.len() == tokens.len()); | |
let mut updated = false; | |
for (template_token1, token2) in self.template_tokens.iter_mut().zip(tokens.iter()) { | |
match template_token1 { | |
Token::Wildcard => values.push(token2.to_string()), | |
Token::Value(token1) => { | |
if token1 != token2 { | |
values.push(token2.to_string()); | |
*template_token1 = Token::Wildcard; | |
updated = true; | |
} | |
} | |
} | |
} | |
updated | |
} | |
} | |
impl<'a> Display for LogCluster<'a> { | |
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | |
let mut first = true; | |
for token in &self.template_tokens { | |
if !first { | |
write!(f, " ")?; | |
} | |
write!(f, "{ }", token)?; | |
first = false; | |
} | |
Ok(()) | |
} | |
} | |
/// One position in a log template: either a literal token value or the
/// wildcard marking a parameter slot.
#[derive(Hash, PartialEq, Eq, Clone, Debug)]
enum Token<'a> {
    // Matches any token at this position; displayed as "<*>".
    Wildcard,
    // A literal token. `Cow` lets tree lookups borrow from the input line
    // while stored templates own their strings.
    Value(Cow<'a, str>),
}
impl<'a> Token<'a> { | |
fn has_numbers(&self) -> bool { | |
match self { | |
Token::Wildcard => false, | |
Token::Value(s) => s.chars().any(char::is_numeric), | |
} | |
} | |
} | |
impl<'a> Display for Token<'a> { | |
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | |
match self { | |
Token::Wildcard => write!(f, "<*>"), | |
Token::Value(value) => write!(f, "{}", value), | |
} | |
} | |
} | |
/// An interior node of the parse prefix tree. Children are keyed by token
/// (or the wildcard); nodes reached at maximum depth carry candidate
/// cluster ids.
#[derive(Debug)]
struct Node<'a> {
    children: HashMap<Token<'a>, Node<'a>>,
    // Ids of clusters attached at this node; may contain ids of clusters
    // that have since been evicted from the LRU cache (pruned lazily).
    cluster_ids: Vec<usize>,
}
impl<'a> Node<'a> { | |
fn new() -> Self { | |
Self { | |
children: HashMap::new(), | |
cluster_ids: Vec::new(), | |
} | |
} | |
} | |
/// A tokenized log line: borrowed slices of the original input string.
type Tokens<'a> = Vec<&'a str>;
/// Outcome of feeding one log line to `LogParser::add_log_line`.
pub enum LogUpdateStatus {
    // The line matched no existing cluster; a new one was created.
    CreatedCluster,
    // The line matched a cluster and forced template positions to wildcards.
    ChangedClusterTemplate,
    // The line matched a cluster without changing its template.
    None,
}
/// A streaming log-template miner (a port of the Python Drain3 algorithm).
pub struct LogParser<'a> {
    // First tree level, keyed by the token count of the log line.
    first_level: HashMap<usize, Node<'a>>,
    // LRU-bounded store of clusters; ids of evicted clusters are pruned
    // lazily from the tree.
    clusters: lru::LruCache<usize, LogCluster<'a>>,
    // Total clusters ever created; also the source of new cluster ids.
    clusters_count: usize,
    // Minimum similarity for a line to join an existing cluster.
    sim_threshold: f64,
    // Maximum descent depth in the prefix tree.
    max_node_depth: usize,
    // Maximum children per tree node before falling back to the wildcard child.
    max_children: usize,
    // If true, tokens containing digits become wildcards in the tree.
    parameterize_numeric_tokens: bool,
    // Token delimiters used in addition to the space character.
    extra_delimiters: Vec<char>,
}
impl<'a> LogParser<'a> { | |
pub fn new(max_clusters: NonZeroUsize) -> Self { | |
Self { | |
first_level: HashMap::new(), | |
clusters: lru::LruCache::new(max_clusters), | |
clusters_count: 0, | |
sim_threshold: 0.4, | |
max_node_depth: 4, | |
max_children: 100, | |
parameterize_numeric_tokens: true, | |
extra_delimiters: Vec::new(), | |
} | |
} | |
pub fn sim_threshold(mut self, value: f64) -> Self { | |
self.sim_threshold = value; | |
self | |
} | |
pub fn max_children(mut self, value: usize) -> Self { | |
self.max_children = value; | |
self | |
} | |
pub fn parameterize_numeric_tokens(mut self, value: bool) -> Self { | |
self.parameterize_numeric_tokens = value; | |
self | |
} | |
pub fn extra_delimiters(mut self, value: Vec<char>) -> Self { | |
self.extra_delimiters = value; | |
self | |
} | |
pub fn add_log_line( | |
&mut self, | |
line: &str, | |
values: &mut Vec<String>, | |
) -> (&LogCluster, LogUpdateStatus) { | |
let tokens = self.tokenize(line); | |
values.clear(); | |
match self.tree_search(&tokens) { | |
None => { | |
self.clusters_count += 1; | |
let cluster_id = self.clusters_count; | |
let cluster = LogCluster::new(cluster_id, tokens); | |
self.add_seq_to_prefix_tree(&cluster); | |
( | |
self.clusters.get_or_insert(cluster_id, || cluster), | |
LogUpdateStatus::CreatedCluster, | |
) | |
} | |
Some(cluster_id) => { | |
let cluster = self.clusters.get_mut(&cluster_id).unwrap(); | |
cluster.match_count += 1; | |
let updated = cluster.maybe_update(&tokens, values); | |
return ( | |
cluster, | |
if updated { | |
LogUpdateStatus::ChangedClusterTemplate | |
} else { | |
LogUpdateStatus::None | |
}, | |
); | |
} | |
} | |
} | |
fn tokenize<'b>(&self, s: &'b str) -> Tokens<'b> { | |
let is_delimiter = |c| -> bool { c == ' ' || self.extra_delimiters.contains(&c) }; | |
let s = s.trim_matches(is_delimiter); | |
s.split(is_delimiter).collect::<Tokens<'b>>() | |
} | |
fn add_seq_to_prefix_tree(&mut self, cluster: &LogCluster<'a>) { | |
let token_count = cluster.template_tokens.len(); | |
let mut curr_node = self.first_level.entry(token_count).or_insert(Node::new()); | |
if token_count == 0 { | |
curr_node.cluster_ids.push(cluster.cluster_id); | |
return; | |
} | |
let mut curr_node_depth = 0; | |
for token in &cluster.template_tokens { | |
if curr_node_depth >= self.max_node_depth || curr_node_depth >= token_count { | |
break; | |
} | |
if curr_node.children.contains_key(token) { | |
curr_node = curr_node.children.get_mut(token).unwrap(); | |
} else { | |
if self.parameterize_numeric_tokens && token.has_numbers() { | |
curr_node = curr_node | |
.children | |
.entry(Token::Wildcard) | |
.or_insert(Node::new()); | |
} else { | |
if curr_node.children.contains_key(&Token::Wildcard) { | |
if curr_node.children.len() < self.max_children { | |
curr_node = curr_node | |
.children | |
.entry(token.clone()) | |
.or_insert(Node::new()); | |
} else { | |
curr_node.children.get_mut(&Token::Wildcard).unwrap(); | |
} | |
} else { | |
if curr_node.children.len() + 1 < self.max_children { | |
curr_node = curr_node | |
.children | |
.entry(token.clone()) | |
.or_insert(Node::new()); | |
} else if curr_node.children.len() + 1 == self.max_children { | |
curr_node = curr_node | |
.children | |
.entry(Token::Wildcard) | |
.or_insert(Node::new()); | |
} else { | |
assert!(false) | |
} | |
} | |
} | |
} | |
curr_node_depth += 1; | |
} | |
// Add new cluster to the leaf node | |
let cluster_id = cluster.cluster_id; | |
let mut new_cluster_ids = Vec::new(); | |
for cluster_id in &curr_node.cluster_ids { | |
if self.clusters.contains(cluster_id) { | |
new_cluster_ids.push(*cluster_id); | |
} | |
} | |
new_cluster_ids.push(cluster_id); | |
curr_node.cluster_ids = new_cluster_ids; | |
} | |
fn tree_search(&mut self, tokens: &Tokens) -> Option<usize> { | |
let token_count = tokens.len(); | |
let mut curr_node = self.first_level.get(&token_count); | |
let mut curr_node_depth = 0; | |
for token in tokens { | |
if curr_node_depth >= self.max_node_depth { | |
break; | |
} | |
if curr_node_depth == token_count { | |
break; | |
} | |
match curr_node { | |
None => break, | |
Some(node) => { | |
curr_node = node.children.get(&Token::Value(Cow::Borrowed(&token))); | |
if curr_node.is_none() { | |
curr_node = node.children.get(&Token::Wildcard); | |
} | |
} | |
} | |
curr_node_depth += 1 | |
} | |
match curr_node { | |
None => None, | |
Some(node) => { | |
let mut max_sim = 0.0; | |
let mut max_param_count = 0; | |
let mut max_cluster_id: Option<usize> = None; | |
for cluster_id in &node.cluster_ids { | |
let cluster = self.clusters.get(cluster_id); | |
match cluster { | |
None => continue, | |
Some(cluster) => { | |
let (sim, param_count) = cluster.seq_dist(tokens); | |
if sim > max_sim || (sim == max_sim && param_count > max_param_count) { | |
max_sim = sim; | |
max_param_count = param_count; | |
max_cluster_id = Some(*cluster_id); | |
} | |
} | |
} | |
} | |
if max_sim >= self.sim_threshold { | |
max_cluster_id | |
} else { | |
None | |
} | |
} | |
} | |
} | |
} | |
#[cfg(test)]
mod tests {
    use std::{num::NonZeroUsize, vec};

    use super::LogParser;

    /// End-to-end check: feeds SSH log lines through the parser and verifies
    /// both the evolving cluster templates and the parameter values
    /// extracted for each line.
    #[test]
    fn add_log_line() {
        let lines = vec![
            "Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]",
            "Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth]",
            "Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2",
            "Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2",
            "Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2",
            "Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth]",
        ];
        // Expected template AFTER each line is processed: the first line of a
        // new cluster echoes back verbatim; later lines generalize the
        // varying positions (time, pid, user, port) to "<*>".
        let expected = vec![
            "Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]",
            "Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth]",
            "Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2",
            "Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2",
            "Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2",
            "Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth]",
        ];
        // Parameter values extracted per line; empty when the line created a
        // new cluster (no wildcard positions existed yet to fill).
        let expected_values = vec![
            vec![],
            vec!["07:08:28", "sshd[24208]:", "webmaster"],
            vec![],
            vec!["09:12:35", "sshd[24492]:", "pi", "49289"],
            vec!["09:12:44", "sshd[24501]:", "ftpuser", "60836"],
            vec!["07:28:03", "sshd[24245]:", "pgadmin"],
        ];
        let mut parser = LogParser::new(NonZeroUsize::new(1000).unwrap());
        for ((line, expected), expected_values) in lines
            .iter()
            .zip(expected.iter())
            .zip(expected_values.iter())
        {
            let mut values = Vec::new();
            let (group, _) = parser.add_log_line(line, &mut values);
            let actual = format!("{}", group);
            assert_eq!(expected.to_string(), actual);
            assert_eq!(
                expected_values
                    .iter()
                    .map(|v| v.to_string())
                    .collect::<Vec<String>>(),
                values
            );
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment