@mpenick
Created August 27, 2023 21:15
A quick port of the Python Drain3 to Rust
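The parser is driven one line at a time: add_log_line tokenizes the line, walks a prefix tree to find the closest existing cluster, and either creates a new cluster or generalizes the matched template, returning the cluster plus any parameter values it extracted. A minimal usage sketch (assuming the Rust file below is built as the library crate "drain" named in the Cargo.toml; the log lines are made up for illustration):

use std::num::NonZeroUsize;

use drain::LogParser;

fn main() {
    // Keep at most 1000 clusters; least-recently-used clusters are evicted.
    let mut parser = LogParser::new(NonZeroUsize::new(1000).unwrap());
    let mut values = Vec::new();
    for line in [
        "connected to 10.0.0.1 port 9042",
        "connected to 10.0.0.2 port 9042",
    ] {
        let (cluster, _status) = parser.add_log_line(line, &mut values);
        // The second line generalizes the template to
        // "connected to <*> port 9042" and extracts ["10.0.0.2"].
        println!("template: {} values: {:?}", cluster, values);
    }
}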
[package]
name = "drain"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
lru = "0.11"
// Copyright 2023 Michael A. Penick
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the “Software”), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
use std::{borrow::Cow, collections::HashMap, fmt::Display, num::NonZeroUsize};
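/// A cluster of log lines that share the same token template. Tokens that
/// differ between matched lines are replaced with the `<*>` wildcard.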
#[derive(Debug)]
pub struct LogCluster<'a> {
    template_tokens: Vec<Token<'a>>,
    match_count: usize,
    cluster_id: usize,
}

impl<'a> LogCluster<'a> {
    pub fn cluster_id(&self) -> usize {
        self.cluster_id
    }

    pub fn match_count(&self) -> usize {
        self.match_count
    }

    fn new(cluster_id: usize, tokens: Vec<&str>) -> Self {
        Self {
            template_tokens: tokens
                .iter()
                .map(|token| Token::Value(Cow::Owned(token.to_string())))
                .collect(),
            match_count: 1,
            cluster_id,
        }
    }
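    /// Returns the similarity between this cluster's template and `tokens`
    /// (the fraction of positions whose tokens match exactly) together with
    /// the number of wildcard positions in the template.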
    fn seq_dist(&self, tokens: &Tokens) -> (f64, usize) {
        assert!(self.template_tokens.len() == tokens.len());
        if tokens.len() == 0 {
            return (1.0, 0);
        }
        let mut sim_count = 0;
        let mut param_count = 0;
        for (token1, token2) in self.template_tokens.iter().zip(tokens.iter()) {
            match token1 {
                Token::Wildcard => {
                    param_count += 1;
                }
                Token::Value(token1) => {
                    if token1 == token2 {
                        sim_count += 1;
                    }
                }
            }
        }
        return (
            sim_count as f64 / self.template_tokens.len() as f64,
            param_count,
        );
    }
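    /// Merges `tokens` into the template: positions that no longer agree are
    /// rewritten to wildcards, and the concrete values at wildcard positions
    /// are appended to `values`. Returns true if the template changed.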
    fn maybe_update(&mut self, tokens: &Tokens, values: &mut Vec<String>) -> bool {
        assert!(self.template_tokens.len() == tokens.len());
        let mut updated = false;
        for (template_token1, token2) in self.template_tokens.iter_mut().zip(tokens.iter()) {
            match template_token1 {
                Token::Wildcard => values.push(token2.to_string()),
                Token::Value(token1) => {
                    if token1 != token2 {
                        values.push(token2.to_string());
                        *template_token1 = Token::Wildcard;
                        updated = true;
                    }
                }
            }
        }
        updated
    }
}
impl<'a> Display for LogCluster<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut first = true;
        for token in &self.template_tokens {
            if !first {
                write!(f, " ")?;
            }
            write!(f, "{}", token)?;
            first = false;
        }
        Ok(())
    }
}
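/// A single template token: either a literal value or the `<*>` wildcard,
/// which matches any token in that position.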
#[derive(Hash, PartialEq, Eq, Clone, Debug)]
enum Token<'a> {
    Wildcard,
    Value(Cow<'a, str>),
}

impl<'a> Token<'a> {
    fn has_numbers(&self) -> bool {
        match self {
            Token::Wildcard => false,
            Token::Value(s) => s.chars().any(char::is_numeric),
        }
    }
}

impl<'a> Display for Token<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Token::Wildcard => write!(f, "<*>"),
            Token::Value(value) => write!(f, "{}", value),
        }
    }
}
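/// An internal node of the fixed-depth prefix tree used to narrow the set of
/// candidate clusters before the similarity comparison.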
#[derive(Debug)]
struct Node<'a> {
    children: HashMap<Token<'a>, Node<'a>>,
    cluster_ids: Vec<usize>,
}

impl<'a> Node<'a> {
    fn new() -> Self {
        Self {
            children: HashMap::new(),
            cluster_ids: Vec::new(),
        }
    }
}
type Tokens<'a> = Vec<&'a str>;
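/// The effect a log line had on the set of clusters.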
pub enum LogUpdateStatus {
    CreatedCluster,
    ChangedClusterTemplate,
    None,
}
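/// A streaming log template miner: a Rust port of the Drain algorithm as
/// implemented by the Python Drain3 library. Clusters are kept in an LRU
/// cache and located through a prefix tree keyed first by token count.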
pub struct LogParser<'a> {
    first_level: HashMap<usize, Node<'a>>,
    clusters: lru::LruCache<usize, LogCluster<'a>>,
    clusters_count: usize,
    sim_threshold: f64,
    max_node_depth: usize,
    max_children: usize,
    parameterize_numeric_tokens: bool,
    extra_delimiters: Vec<char>,
}
impl<'a> LogParser<'a> {
    pub fn new(max_clusters: NonZeroUsize) -> Self {
        Self {
            first_level: HashMap::new(),
            clusters: lru::LruCache::new(max_clusters),
            clusters_count: 0,
            sim_threshold: 0.4,
            max_node_depth: 4,
            max_children: 100,
            parameterize_numeric_tokens: true,
            extra_delimiters: Vec::new(),
        }
    }

    pub fn sim_threshold(mut self, value: f64) -> Self {
        self.sim_threshold = value;
        self
    }

    pub fn max_children(mut self, value: usize) -> Self {
        self.max_children = value;
        self
    }

    pub fn parameterize_numeric_tokens(mut self, value: bool) -> Self {
        self.parameterize_numeric_tokens = value;
        self
    }

    pub fn extra_delimiters(mut self, value: Vec<char>) -> Self {
        self.extra_delimiters = value;
        self
    }
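    /// Adds a log line: finds the best-matching cluster (creating one if none
    /// is similar enough), refills `values` with the parameters extracted at
    /// wildcard positions, and returns the cluster together with what changed.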
    pub fn add_log_line(
        &mut self,
        line: &str,
        values: &mut Vec<String>,
    ) -> (&LogCluster, LogUpdateStatus) {
        let tokens = self.tokenize(line);
        values.clear();
        match self.tree_search(&tokens) {
            None => {
                self.clusters_count += 1;
                let cluster_id = self.clusters_count;
                let cluster = LogCluster::new(cluster_id, tokens);
                self.add_seq_to_prefix_tree(&cluster);
                (
                    self.clusters.get_or_insert(cluster_id, || cluster),
                    LogUpdateStatus::CreatedCluster,
                )
            }
            Some(cluster_id) => {
                let cluster = self.clusters.get_mut(&cluster_id).unwrap();
                cluster.match_count += 1;
                let updated = cluster.maybe_update(&tokens, values);
                return (
                    cluster,
                    if updated {
                        LogUpdateStatus::ChangedClusterTemplate
                    } else {
                        LogUpdateStatus::None
                    },
                );
            }
        }
    }
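    /// Splits a line into tokens on spaces and any configured extra delimiters,
    /// trimming those delimiters from both ends first.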
    fn tokenize<'b>(&self, s: &'b str) -> Tokens<'b> {
        let is_delimiter = |c| -> bool { c == ' ' || self.extra_delimiters.contains(&c) };
        let s = s.trim_matches(is_delimiter);
        s.split(is_delimiter).collect::<Tokens<'b>>()
    }
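    /// Registers a new cluster in the prefix tree, descending (and creating)
    /// nodes for up to `max_node_depth` leading tokens. Tokens containing
    /// digits (when `parameterize_numeric_tokens` is set) and overflow beyond
    /// `max_children` are routed through wildcard nodes.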
    fn add_seq_to_prefix_tree(&mut self, cluster: &LogCluster<'a>) {
        let token_count = cluster.template_tokens.len();
        let mut curr_node = self.first_level.entry(token_count).or_insert(Node::new());
        if token_count == 0 {
            curr_node.cluster_ids.push(cluster.cluster_id);
            return;
        }
        let mut curr_node_depth = 0;
        for token in &cluster.template_tokens {
            if curr_node_depth >= self.max_node_depth || curr_node_depth >= token_count {
                break;
            }
            if curr_node.children.contains_key(token) {
                curr_node = curr_node.children.get_mut(token).unwrap();
            } else {
                if self.parameterize_numeric_tokens && token.has_numbers() {
                    curr_node = curr_node
                        .children
                        .entry(Token::Wildcard)
                        .or_insert(Node::new());
                } else {
                    if curr_node.children.contains_key(&Token::Wildcard) {
                        if curr_node.children.len() < self.max_children {
                            curr_node = curr_node
                                .children
                                .entry(token.clone())
                                .or_insert(Node::new());
                        } else {
                            // At capacity: descend into the existing wildcard node.
                            curr_node = curr_node.children.get_mut(&Token::Wildcard).unwrap();
                        }
                    } else {
                        if curr_node.children.len() + 1 < self.max_children {
                            curr_node = curr_node
                                .children
                                .entry(token.clone())
                                .or_insert(Node::new());
                        } else if curr_node.children.len() + 1 == self.max_children {
                            curr_node = curr_node
                                .children
                                .entry(Token::Wildcard)
                                .or_insert(Node::new());
                        } else {
                            unreachable!()
                        }
                    }
                }
            }
            curr_node_depth += 1;
        }
        // Add the new cluster to the leaf node, dropping ids of clusters that
        // have since been evicted from the LRU cache.
        let cluster_id = cluster.cluster_id;
        let mut new_cluster_ids = Vec::new();
        for cluster_id in &curr_node.cluster_ids {
            if self.clusters.contains(cluster_id) {
                new_cluster_ids.push(*cluster_id);
            }
        }
        new_cluster_ids.push(cluster_id);
        curr_node.cluster_ids = new_cluster_ids;
    }
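    /// Walks the prefix tree for the given tokens and returns the id of the
    /// most similar live cluster at the reached node, provided its similarity
    /// reaches `sim_threshold`.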
    fn tree_search(&mut self, tokens: &Tokens) -> Option<usize> {
        let token_count = tokens.len();
        let mut curr_node = self.first_level.get(&token_count);
        let mut curr_node_depth = 0;
        for token in tokens {
            if curr_node_depth >= self.max_node_depth {
                break;
            }
            if curr_node_depth == token_count {
                break;
            }
            match curr_node {
                None => break,
                Some(node) => {
                    curr_node = node.children.get(&Token::Value(Cow::Borrowed(&token)));
                    if curr_node.is_none() {
                        curr_node = node.children.get(&Token::Wildcard);
                    }
                }
            }
            curr_node_depth += 1;
        }
        match curr_node {
            None => None,
            Some(node) => {
                let mut max_sim = 0.0;
                let mut max_param_count = 0;
                let mut max_cluster_id: Option<usize> = None;
                for cluster_id in &node.cluster_ids {
                    let cluster = self.clusters.get(cluster_id);
                    match cluster {
                        None => continue,
                        Some(cluster) => {
                            let (sim, param_count) = cluster.seq_dist(tokens);
                            if sim > max_sim || (sim == max_sim && param_count > max_param_count) {
                                max_sim = sim;
                                max_param_count = param_count;
                                max_cluster_id = Some(*cluster_id);
                            }
                        }
                    }
                }
                if max_sim >= self.sim_threshold {
                    max_cluster_id
                } else {
                    None
                }
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use std::{num::NonZeroUsize, vec};

    use super::LogParser;

    #[test]
    fn add_log_line() {
        let lines = vec![
            "Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]",
            "Dec 10 07:08:28 LabSZ sshd[24208]: input_userauth_request: invalid user webmaster [preauth]",
            "Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2",
            "Dec 10 09:12:35 LabSZ sshd[24492]: Failed password for invalid user pi from 0.0.0.0 port 49289 ssh2",
            "Dec 10 09:12:44 LabSZ sshd[24501]: Failed password for invalid user ftpuser from 0.0.0.0 port 60836 ssh2",
            "Dec 10 07:28:03 LabSZ sshd[24245]: input_userauth_request: invalid user pgadmin [preauth]",
        ];
        let expected = vec![
            "Dec 10 07:07:38 LabSZ sshd[24206]: input_userauth_request: invalid user test9 [preauth]",
            "Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth]",
            "Dec 10 09:12:32 LabSZ sshd[24490]: Failed password for invalid user ftpuser from 0.0.0.0 port 62891 ssh2",
            "Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2",
            "Dec 10 <*> LabSZ <*> Failed password for invalid user <*> from 0.0.0.0 port <*> ssh2",
            "Dec 10 <*> LabSZ <*> input_userauth_request: invalid user <*> [preauth]",
        ];
        let expected_values = vec![
            vec![],
            vec!["07:08:28", "sshd[24208]:", "webmaster"],
            vec![],
            vec!["09:12:35", "sshd[24492]:", "pi", "49289"],
            vec!["09:12:44", "sshd[24501]:", "ftpuser", "60836"],
            vec!["07:28:03", "sshd[24245]:", "pgadmin"],
        ];
        let mut parser = LogParser::new(NonZeroUsize::new(1000).unwrap());
        for ((line, expected), expected_values) in lines
            .iter()
            .zip(expected.iter())
            .zip(expected_values.iter())
        {
            let mut values = Vec::new();
            let (group, _) = parser.add_log_line(line, &mut values);
            let actual = format!("{}", group);
            assert_eq!(expected.to_string(), actual);
            assert_eq!(
                expected_values
                    .iter()
                    .map(|v| v.to_string())
                    .collect::<Vec<String>>(),
                values
            );
        }
    }
}