|
import scipy.io |
|
import numpy as np |
|
import pandas as pd |
|
import torchvision.datasets as dset |
|
import torch |
|
import torch.optim as opt |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import json |
|
import os |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from torch.utils.data import Dataset, DataLoader, Sampler, WeightedRandomSampler |
|
|
|
|
|
|
|
|
|
class Data_Loader:
    """Loaders for anomaly-detection benchmarks.

    Supported datasets: CIFAR-10 (one-class), KDDCUP99 (plain, reversed and
    contaminated variants), Thyroid, Arrhythmia and NAB time series.  Each
    loader returns (train, validation-normal, validation-anomalous) splits,
    normalized with statistics computed on the training split only.
    """

    def __init__(self, n_trains=None):
        # n_trains is kept for interface compatibility; the loaders below
        # derive their own train sizes.
        self.n_train = n_trains
        # urls[0] = KDD99 10% data archive, urls[1] = column names/types file.
        self.urls = [
            "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz",
            "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names"
        ]

    def norm_kdd_data(self, train_real, val_real, val_fake, cont_indices):
        """Z-score the continuous columns using train-split statistics.

        Symbolic (one-hot) columns pass through unchanged.  Note the output
        column order is [symbolic..., continuous...], not the input order.
        """
        symb_indices = np.delete(np.arange(train_real.shape[1]), cont_indices)
        mus = train_real[:, cont_indices].mean(0)
        sds = train_real[:, cont_indices].std(0)
        sds[sds == 0] = 1  # constant columns: avoid division by zero

        def get_norm(xs, mu, sd):
            bin_cols = xs[:, symb_indices]
            # Vectorized z-score (replaces a per-row Python loop).
            cont_cols = (xs[:, cont_indices] - mu) / sd
            return np.concatenate([bin_cols, cont_cols], 1)

        train_real = get_norm(train_real, mus, sds)
        val_real = get_norm(val_real, mus, sds)
        val_fake = get_norm(val_fake, mus, sds)
        return train_real, val_real, val_fake

    def norm_data(self, train_real, val_real, val_fake):
        """Z-score every column using train-split statistics."""
        mus = train_real.mean(0)
        sds = train_real.std(0)
        sds[sds == 0] = 1  # constant columns: avoid division by zero

        def get_norm(xs, mu, sd):
            # Vectorized z-score (replaces a per-row Python loop).
            return (xs - mu) / sd

        train_real = get_norm(train_real, mus, sds)
        val_real = get_norm(val_real, mus, sds)
        val_fake = get_norm(val_fake, mus, sds)
        return train_real, val_real, val_fake

    def norm(self, data, mu=1):
        """Scale values from [0, 255] to [-mu, 2 - mu] (default [-1, 1])."""
        return 2 * (data / 255.) - mu

    def get_dataset(self, dataset_name, c_percent=None, true_label=1, input_length=None):
        """Dispatch to the loader for `dataset_name`.

        Unrecognized names fall through to the NAB time-series loader, which
        treats the name as a CSV file name.
        """
        if dataset_name == 'cifar10':
            return self.load_data_CIFAR10(true_label)
        if dataset_name == 'kdd':
            return self.KDD99_train_valid_data()
        if dataset_name == 'kddrev':
            return self.KDD99Rev_train_valid_data()
        if dataset_name == 'thyroid':
            return self.Thyroid_train_valid_data()
        if dataset_name == 'arrhythmia':
            return self.Arrhythmia_train_valid_data()
        if dataset_name == 'ckdd':
            return self.contaminatedKDD99_train_valid_data(c_percent)
        else:
            return self.NAB_data(dataset_name, input_length)

    def load_data_CIFAR10(self, true_label):
        """Download CIFAR-10 and build a one-class split.

        Training data keeps only images of class `true_label`; the full test
        set (with its labels) is returned for evaluation.  Pixels are scaled
        to [-1, 1].
        """
        root = './data'
        if not os.path.exists(root):
            os.mkdir(root)

        trainset = dset.CIFAR10(root, train=True, download=True)
        train_data = np.array(trainset.data)
        train_labels = np.array(trainset.targets)

        testset = dset.CIFAR10(root, train=False, download=True)
        test_data = np.array(testset.data)
        test_labels = np.array(testset.targets)

        train_data = train_data[np.where(train_labels == true_label)]
        x_train = self.norm(np.asarray(train_data, dtype='float32'))
        x_test = self.norm(np.asarray(test_data, dtype='float32'))
        return x_train, x_test, test_labels

    def NAB_data(self, file_name, input_length):
        """Load a NAB series as sliding windows.

        Normal windows are split 50/50 into train/validation; anomalous
        windows are validation-only.

        NOTE(review): `input_length` is accepted but the window size is
        hard-coded to 20 — presumably input_length was meant to be used
        here; confirm before changing behavior.
        """
        ds = MySeriesDataset("NAB", data_file=file_name, size=20)
        samples = ds.chunks
        labels = ds.labels

        norm_samples = samples[labels == 0]
        anom_samples = samples[labels == 1]

        n_train = len(norm_samples) // 2
        x_train = norm_samples[:n_train]

        val_real = norm_samples[n_train:]
        val_fake = anom_samples

        return x_train.numpy(), val_real.numpy(), val_fake.numpy()

    def Thyroid_train_valid_data(self):
        """Thyroid: train on half the normal samples; validate on the other
        half (real) plus all anomalies (fake)."""
        data = scipy.io.loadmat("data/thyroid.mat")
        samples = data['X']  # 3772 samples
        labels = ((data['y']).astype(np.int32)).reshape(-1)

        norm_samples = samples[labels == 0]  # 3679 normal
        anom_samples = samples[labels == 1]  # 93 anomalous

        n_train = len(norm_samples) // 2
        x_train = norm_samples[:n_train]  # 1839 train

        val_real = norm_samples[n_train:]
        val_fake = anom_samples
        return self.norm_data(x_train, val_real, val_fake)

    def Arrhythmia_train_valid_data(self):
        """Arrhythmia: same normal/anomaly split scheme as Thyroid."""
        data = scipy.io.loadmat("data/arrhythmia.mat")
        samples = data['X']  # 518 samples
        labels = ((data['y']).astype(np.int32)).reshape(-1)

        norm_samples = samples[labels == 0]  # 452 normal
        anom_samples = samples[labels == 1]  # 66 anomalous

        n_train = len(norm_samples) // 2
        x_train = norm_samples[:n_train]  # 226 train

        val_real = norm_samples[n_train:]
        val_fake = anom_samples
        return self.norm_data(x_train, val_real, val_fake)

    def KDD99_preprocessing(self):
        """Download KDD99 (10%) and one-hot encode symbolic features.

        Returns (samples, labels, cont_indices): label 1 marks 'normal.'
        records (treated as the anomaly class downstream); cont_indices are
        the column positions of continuous features after one-hot encoding.
        """
        df_colnames = pd.read_csv(self.urls[1], skiprows=1, sep=':', names=['f_names', 'f_types'])
        df_colnames.loc[df_colnames.shape[0]] = ['status', ' symbolic.']
        df = pd.read_csv(self.urls[0], header=None, names=df_colnames['f_names'].values)
        df_symbolic = df_colnames[df_colnames['f_types'].str.contains('symbolic.')]
        df_continuous = df_colnames[df_colnames['f_types'].str.contains('continuous.')]
        samples = pd.get_dummies(df.iloc[:, :-1], columns=df_symbolic['f_names'][:-1])

        smp_keys = samples.keys()
        cont_indices = []
        for cont in df_continuous['f_names']:
            cont_indices.append(smp_keys.get_loc(cont))

        labels = np.where(df['status'] == 'normal.', 1, 0)
        return np.array(samples), np.array(labels), cont_indices

    def KDD99_train_valid_data(self):
        """KDD99: 'normal.' records are the anomaly class; attacks are the
        (majority) normal class, split 50/50 into train/validation."""
        samples, labels, cont_indices = self.KDD99_preprocessing()
        anom_samples = samples[labels == 1]  # 'normal.': 97278

        norm_samples = samples[labels == 0]  # attacks: 396743

        n_norm = norm_samples.shape[0]
        ranidx = np.random.permutation(n_norm)
        n_train = n_norm // 2

        x_train = norm_samples[ranidx[:n_train]]
        norm_test = norm_samples[ranidx[n_train:]]

        val_real = norm_test
        val_fake = anom_samples
        return self.norm_kdd_data(x_train, val_real, val_fake, cont_indices)

    def KDD99Rev_train_valid_data(self):
        """Reversed KDD99: 'normal.' records are the normal class; a random
        subset of attacks is the anomaly class (normal:attack = 4:1)."""
        samples, labels, cont_indices = self.KDD99_preprocessing()

        norm_samples = samples[labels == 1]  # 'normal.': 97278

        # Randomly draw attack samples so normal:attack is 4:1 (24319 rows).
        anom_samples = samples[labels == 0]  # attacks: 396743

        rp = np.random.permutation(len(anom_samples))
        rp_cut = rp[:24319]
        anom_samples = anom_samples[rp_cut]  # attacks: 24319

        n_norm = norm_samples.shape[0]
        ranidx = np.random.permutation(n_norm)
        n_train = n_norm // 2

        x_train = norm_samples[ranidx[:n_train]]
        norm_test = norm_samples[ranidx[n_train:]]

        val_real = norm_test
        val_fake = anom_samples
        return self.norm_kdd_data(x_train, val_real, val_fake, cont_indices)

    def contaminatedKDD99_train_valid_data(self, c_percent):
        """KDD99 with a contaminated training set: `c_percent` percent of the
        training-half anomalies are mixed into the (attack-only) train data."""
        samples, labels, cont_indices = self.KDD99_preprocessing()

        ranidx = np.random.permutation(len(samples))
        n_test = len(samples) // 2
        x_test = samples[ranidx[:n_test]]
        y_test = labels[ranidx[:n_test]]

        x_train = samples[ranidx[n_test:]]
        y_train = labels[ranidx[n_test:]]

        norm_samples = x_train[y_train == 0]  # attacks: majority class
        anom_samples = x_train[y_train == 1]  # 'normal.' records
        n_contaminated = int((c_percent / 100) * len(anom_samples))

        # BUGFIX: draw the contaminating anomalies at random from the whole
        # pool.  The old code used np.random.permutation(n_contaminated),
        # which only shuffled the FIRST n_contaminated anomaly rows, so the
        # contamination always came from the same leading samples.
        rpc = np.random.permutation(len(anom_samples))[:n_contaminated]
        x_train = np.concatenate([norm_samples, anom_samples[rpc]])

        val_real = x_test[y_test == 0]
        val_fake = x_test[y_test == 1]
        return self.norm_kdd_data(x_train, val_real, val_fake, cont_indices)
|
|
|
|
|
|
|
class MySeriesDataset:
    """Sliding-window dataset over a NAB time-series CSV.

    Each item is a window of `size` consecutive values (stride `step`),
    labeled 1 if any row index in the window belongs to a timestamp listed
    in the combined-labels JSON file, else 0.
    """

    def __init__(self, dname, data_file, size, step=1):
        label_file = "combined_labels"
        with open(os.path.join("data", dname, "labels", label_file + '.json'), 'r') as f:
            self.label_json = json.load(f)

        data_file_path = self.getRelativePath(data_file)
        # BUGFIX: compare with != rather than `is not` — identity tests on
        # int literals are implementation-dependent and raise SyntaxWarning.
        assert data_file_path != -1
        data_df = pd.read_csv(os.path.join("data", dname, "data", data_file_path))
        data_df['timestamp'] = pd.to_datetime(data_df['timestamp'])
        data_df['stand_value'] = standardization(data_df['value'])
        anomalies_ts = [a_ts.replace(".000000", "") for a_ts in self.label_json[data_file_path]]
        anomalies = data_df[data_df['timestamp'].isin(anomalies_ts)]

        # unfold turns the 1-D value series into overlapping windows of
        # length `size` with stride `step` -> shape (n_windows, size).
        self.chunks = torch.FloatTensor(data_df['value']).unfold(0, size, step)
        self.df = data_df
        self.anomalies = anomalies

        # NOTE (translated from Chinese): DataFrame has no unfold, but
        # rolling() gives a sliding window.  rolling.apply needs a numeric
        # column; the values themselves are not used — the Series index of
        # each window is checked against the anomaly row indices to decide
        # whether the window is anomalous.
        self.labels = torch.tensor(data_df['value'].rolling(size).apply(
            lambda x: self.ifAnomalySeries(x, anomalies.index.tolist())).dropna().tolist(), dtype=torch.int64)

    def ifAnomalySeries(self, windowed_data, anomalies_idx):
        """Return 1 if any index of `windowed_data` is in `anomalies_idx`,
        else 0."""
        if True in [x in anomalies_idx for x in windowed_data.index.tolist()]:
            return 1
        else:
            return 0

    def getRelativePath(self, data_file):
        """Given a target CSV file name, return its full relative path as
        recorded in the labels JSON, or -1 if not found."""
        for file in list(self.label_json.keys()):
            # BUGFIX: != instead of `is not` for the int comparison.
            if file.rfind(data_file) != -1:
                return file
        return -1

    def __len__(self):
        return self.chunks.size(0)

    def __getitem__(self, i):
        """Return (window, anomaly_label) for window `i`.

        BUGFIX: `chunks` is 2-D (n_windows, size); the original indexed it
        with [i, :, :], which raises IndexError on a 2-D tensor.
        """
        x = self.chunks[i]
        anomaly_label = self.labels[i]
        return x, anomaly_label
|
|
|
|
|
def normalization(data):
    """Min-max scale `data` into the [0, 1] range."""
    lo = data.min()
    span = data.max() - lo
    return (data - lo) / span
|
|
|
|
|
def standardization(data):
    """Z-score `data`: subtract its mean, divide by its standard deviation."""
    centered = data - data.mean()
    return centered / data.std()