Skip to content

Instantly share code, notes, and snippets.

@blu3r4y
Created May 18, 2021 09:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blu3r4y/dc2b2d305ff0cdd995ddcac70330af7e to your computer and use it in GitHub Desktop.
Save blu3r4y/dc2b2d305ff0cdd995ddcac70330af7e to your computer and use it in GitHub Desktop.
Features used by Dynatrace - SAL - LIT.AI.JKU in the NAD 2021 challenge
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import feature_ip_port as ipp
import feature_ratios as rat
import feature_segments as seg
# read the raw data and drop the ground-truth label column
df = pd.read_csv("example.csv", parse_dates=["time"]).drop("label", axis="columns")

# compute every feature group and stitch them together column-wise
feature_frames = [
    ipp.get_ip_area(df, columns=["src", "dst"], areas=["global", "link_local", "unspecified"]),
    ipp.get_ip_flow_type(df, src="src", dst="dst"),
    ipp.get_ip_binary_match(df, src="src", dst="dst"),
    ipp.get_ip_one_bits(df, columns=["src", "dst"]),
    ipp.get_ip_parts(df, columns=["src", "dst"]),
    ipp.get_port_area(df, columns=["spt", "dpt"]),
    ipp.get_port_match(df, spt="spt", dpt="dpt"),
    rat.get_cnt_ratios(df, unsafe=False, fill=0),
    rat.get_in_out_ratios(df, fill=0),
    rat.get_cnt_distances(df),
    seg.get_segment_features(
        df,
        ipp.get_ip_parts(df, columns=["src", "dst"]),
        groups=["src", "dst_3", "dst_2", "dst_1"],
        lag=1,
        parallel=False,
    ),
]
features = pd.concat(feature_frames, axis="columns")

# take a glimpse at the data
print(features.head().T)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
time src dst spt dpt duration out (bytes) in (bytes) proto app cnt_dst cnt_src cnt_serv_src cnt_serv_dst cnt_dst_slow cnt_src_slow cnt_serv_src_slow cnt_serv_dst_slow cnt_dst_conn cnt_src_conn cnt_serv_src_conn cnt_serv_dst_conn label
2020-12-03 04:42:51 2887270751 2887273992 37103 53 0 516870 43627 17 5 1 3 2 0 2 22 229 0 1 5 2 0 Probing-Nmap
2020-12-03 07:16:06 2887274048 167837957 53582 161 64 6559 0 17 35 5 1 131 1 21 2 3001 1 4 1 98 1 Probing-Port sweep
2020-12-03 07:29:07 2887270690 134744072 4324 53 427 107636 0 17 5 1 2 7 0 32 8 1269 0 2 3 21 0 Normal
2020-12-03 07:41:14 2887274048 167838199 53306 161 0 0 0 17 35 5 1 112 0 21 1 3037 0 4 1 70 0 Normal
2020-12-03 07:44:00 2887270786 134744072 0 0 0 178651 0 1 11 1 2 5 5 5 8 746 746 1 3 11 11 Normal
2020-12-03 07:53:57 2887270776 2887273992 60846 53 820 292791 0 17 5 1 13 0 0 52 61 167 0 1 8 1 1 Normal
2020-12-03 09:12:28 2887270733 134744072 60235 53 0 433203 0 17 5 1 4 0 0 23 14 739 0 1 3 0 0 Normal
2020-12-03 10:11:42 2887274003 2887273982 49980 161 0 0 47106 17 35 1 1 0 0 12 1 400 0 1 1 0 0 Probing-Port sweep
2020-12-03 10:17:48 2887274003 167837961 35740 161 230 652054 0 17 35 4 1 3 0 12 2 400 0 5 1 4 0 Probing-IP sweep
2020-12-03 11:42:37 2887270719 2887273992 51223 53 590 0 62349 17 5 6 10 36 1 89 52 87 1 3 7 28 1 Normal
2020-12-03 12:00:48 2887270690 879905829 55718 443 975 535170 0 6 10 3 1 0 0 43 1 179 0 2 1 2 2 Normal
2020-12-03 12:01:58 2887270803 2887274010 52860 15002 251 0 9322 6 24 1 9 0 0 23 42 47 0 1 5 0 0 Normal
2020-12-03 12:04:24 2887271429 879896140 50007 3478 611 0 11193 17 24 2 1 0 0 39 1 3 0 1 1 0 0 Probing-Nmap
2020-12-03 12:09:27 2887270791 2887271430 44421 3007 401 0 59202 6 24 4 1 2 219 34 1 6 249 4 1 2 11 Normal
2020-12-03 12:11:00 2887270776 2887273992 53246 53 0 193804 0 17 5 2 8 13 0 48 60 100 0 2 4 0 0 Normal
2020-12-03 12:20:31 2887270775 2887273543 0 0 0 0 9192 1 11 256 1 3256 8 256 1 3256 8 27 1 95 3 Normal
2020-12-03 12:25:38 2887270721 2726122003 12793 443 130 0 0 6 10 3 1 6 0 284 2 2346 0 2 1 3 0 Normal
2020-12-03 12:31:08 2887270786 134744072 0 0 690 201966 0 1 11 1 3 8 8 2 13 749 749 1 3 2 2 Normal
2020-12-03 12:33:38 2887270973 2007043601 3203 443 244 0 50333 6 10 1 1 2 2 256 2 1507 2 1 1 2 2 Normal
2020-12-03 12:59:31 2887270791 2887271305 43422 555 76 0 49478 6 24 7 1 26 340 258 1 29 825 7 1 2 10 Normal
2020-12-03 13:01:33 2887270791 2887271304 43423 5061 0 0 0 6 24 7 1 19 360 258 1 20 996 7 1 0 7 Normal
2020-12-03 13:16:58 2887270718 2887191178 49971 443 141 0 15505 6 10 1 1 1 1 77 2 669 1 1 1 1 1 Normal
2020-12-03 13:19:31 2887271434 2887273992 32876 53 756 0 0 17 5 1 9 0 0 5 55 98 0 1 8 0 0 Normal
2020-12-03 13:20:05 2887270791 2887274085 33938 9929 263 489289 0 6 24 23 1 3 185 23 1 7 398 18 1 1 6 Probing-Port sweep
2020-12-03 13:22:41 2887270766 2887273992 52751 389 53 347788 0 6 24 2 10 0 0 86 44 2 0 2 8 1 1 Normal
2020-12-03 13:24:21 2887274048 167838202 43313 161 227 0 100373 17 35 6 1 139 1 21 1 3014 1 3 1 65 1 Probing-Port sweep
2020-12-03 13:27:20 2887270791 2887273518 38447 443 424 0 8667 6 10 18 1 20 1 160 1 743 1 4 1 3 0 Normal
2020-12-03 13:34:01 2887270773 599449625 60020 443 40 85926 10580 6 10 2 1 11 2 32 2 94 2 2 1 11 2 Normal
2020-12-03 13:36:29 2887270695 2887273992 56041 53 238 579606 0 17 5 3 9 9 1 31 49 297 1 1 5 9 1 Normal
2020-12-03 13:45:54 2887270707 2887273992 56286 53 711 0 0 17 5 2 4 43 1 88 27 89 1 2 3 43 1 Normal
2020-12-03 13:52:01 2887270800 2887273992 60715 53 0 0 0 17 5 2 11 0 0 42 49 135 0 1 8 0 0 Normal
2020-12-03 13:53:22 2887270702 2887273992 55013 53 573 0 7914 17 5 2 6 1 1 16 50 82 1 2 3 1 1 Normal
2020-12-03 13:56:48 2887270721 2887273992 51926 53 0 0 25147 17 5 2 6 9 1 290 44 1353 3 2 6 9 1 Normal
2020-12-03 14:15:24 2887270690 134744072 60007 53 560 94236 0 17 5 1 3 5 2 42 12 1345 2 1 3 5 2 DDOS-smurf
2020-12-03 14:21:56 2887270695 3232243202 54711 53 505 0 64957 17 5 1 1 2 2 31 1 276 2 1 1 2 2 Normal
2020-12-03 14:38:05 2887270719 2887273992 57507 53 1155 0 0 17 5 1 5 1 0 196 42 375 0 1 5 1 0 Normal
2020-12-03 15:08:45 2887271433 1755049120 54908 9980 0 348644 0 6 24 2 2 2 2 4 2 80 2 2 1 2 2 Probing-Nmap
2020-12-03 15:17:08 2887274048 167837955 57681 161 255 424427 17477 17 35 7 1 167 1 21 2 3086 1 4 1 61 1 DDOS-smurf
2020-12-03 15:25:58 2887270733 134744072 56760 53 0 0 1214 17 5 2 3 3 0 23 12 797 0 1 3 3 0 DDOS-smurf
2020-12-03 15:41:42 2887270733 134744072 59899 53 179 33008 0 17 5 3 4 22 1 33 12 1011 1 1 3 13 1 Normal
2020-12-03 15:45:21 2887274048 167838202 53466 161 738 0 0 17 35 7 1 195 1 21 1 2975 1 5 1 45 0 Normal
2020-12-03 16:03:09 2887270733 134744072 58023 53 0 44753 24036 17 5 1 3 23 1 71 10 887 1 1 3 20 0 Probing-Nmap
2020-12-03 16:03:50 2887270749 2472317572 49663 443 0 504157 0 6 10 1 1 1 1 26 6 131 1 1 1 1 1 Probing-Nmap
2020-12-03 16:07:51 2887270713 2887273992 59468 53 444 514854 79828 17 5 1 7 99 0 87 46 258 0 1 5 50 0 Normal
2020-12-03 16:09:02 2887271198 2887274010 4831 15002 0 211426 72762 6 24 1 2 0 0 2 39 39 0 1 2 0 0 Probing-Nmap
2020-12-03 16:13:42 2887271433 792239461 38558 22000 0 0 0 6 24 2 1 2 2 11 2 8 2 2 1 2 2 Normal
2020-12-03 16:16:29 2887270949 400236807 5012 443 64 223690 0 6 10 2 1 13 1 75 1 509 1 2 1 14 2 Normal
2020-12-03 16:27:30 2887274002 134744072 52748 53 0 0 19953 17 5 1 3 18 0 1 11 309 0 1 2 14 0 Probing-IP sweep
2020-12-03 16:30:17 2887270880 2887273992 55273 53 0 365406 52995 17 5 2 6 3 1 52 43 273 1 1 4 1 1 Normal
2020-12-03 16:51:05 2887270721 2887273992 57952 389 429 0 24905 17 24 2 6 4 0 44 44 66 0 2 5 4 0 Normal
2020-12-03 16:56:42 2887270766 679349478 54465 443 193 1184868 43560 6 10 1 1 0 0 12 2 21 0 1 1 0 0 Probing-Nmap
2020-12-03 17:05:44 2887270702 2887274039 54152 8080 250 1238058 0 6 24 2 2 20 1 50 19 89 1 1 2 20 1 Normal
2020-12-03 17:13:24 2887270690 3639551595 62882 53 234 1036442 29645 17 5 6 1 26 0 48 1 1286 0 5 1 23 0 Normal
2020-12-03 17:21:38 2887270775 2887273525 54014 9091 0 0 0 6 24 9 1 7 3 11 1 7 3 7 1 2 1 Normal
2020-12-03 17:30:09 2887270791 2887274002 33938 23502 36 756026 31962 6 24 27 2 0 358 27 33 0 1115 18 1 1 6 Normal
2020-12-03 17:30:16 2887274048 167838202 44714 161 441 412807 19311 17 35 5 1 101 1 21 1 3099 1 3 1 63 1 Normal
2020-12-03 17:51:12 2887270690 3627731182 58941 443 0 105819 28366 17 24 3 1 2 2 36 15 164 2 3 1 2 2 Normal
2020-12-03 17:52:15 2887274048 167838201 50287 161 117 0 9899 17 35 6 1 103 0 21 1 3068 0 5 1 57 0 Probing-IP sweep
2020-12-03 17:59:43 3105873370 1746059117 63891 3221 331 0 56057 6 24 5 1 0 185 64 1 5 298 4 1 0 11 Normal
2020-12-03 18:03:19 2887271299 887498523 53588 443 64 0 34686 6 10 2 1 10 0 23 3 150 0 2 1 5 0 Normal
2020-12-03 18:31:28 2887271429 677035416 6755 443 2011 0 20473 6 10 8 1 19 1 107 5 590 1 3 1 7 1 Normal
2020-12-03 18:33:13 2887271434 2887273992 56338 53 447 17475 10263 17 5 1 7 4 0 4 44 98 0 1 7 4 0 Normal
2020-12-03 18:37:14 2887270791 2887274035 33938 8222 0 225679 74265 6 24 128 1 9 374 128 3 34 1159 21 1 0 3 Normal
2020-12-03 18:37:59 167838205 2887274082 1027 6343 338 0 0 17 24 1 1 5 5 1 4 1199 1199 1 1 9 9 Normal
2020-12-03 18:50:47 2887270775 2887273527 26046 1886 0 0 75629 6 24 8 1 1722 223 388 2 27353 499 8 1 98 11 Probing-Nmap
2020-12-03 18:51:27 3116346117 2887273528 34420 1864 184 1017498 0 6 24 7 1 7 0 32 3 8 0 7 1 2 0 Normal
2020-12-03 18:54:06 2887270775 2887273475 56296 9594 0 1113505 0 6 24 1 1 26 2 3 2 75 2 1 1 10 2 Probing-Nmap
2020-12-03 18:57:39 167837960 2887273992 6434 123 399 0 26336 17 24 1 9 1 1 1 44 39 1 1 9 1 1 Normal
2020-12-03 19:01:35 2887270702 2887274010 56507 15002 75 8341 17981 6 24 1 10 1 1 31 37 41 1 1 7 1 1 DDOS-smurf
2020-12-03 19:13:30 2887273729 2887274002 58822 53 126 414050 0 17 5 1 1 0 0 5 27 55 0 1 1 0 0 Normal
2020-12-03 19:24:07 2887270765 311247748 0 0 827 0 764 1 11 249 1 340 1 256 1 347 1 77 1 77 0 DDOS-smurf
2020-12-03 19:27:19 2887274048 167838205 59247 161 369 956821 0 17 35 7 2 122 1 21 2 3018 1 3 2 62 1 Normal
2020-12-03 19:31:37 2887271434 599639059 60500 10001 41 74904 92254 6 24 6 1 160 0 9 1 160 0 6 1 93 0 Normal
2020-12-03 19:40:45 2887270775 2887273547 26046 1886 0 360049 0 6 24 27 1 3496 152 163 2 7818 280 23 1 100 4 Probing-Nmap
2020-12-03 20:03:33 2887270739 391226437 51482 443 273 0 0 6 10 6 1 15 0 91 1 538 0 6 1 11 1 Normal
2020-12-03 20:22:51 2887189907 2887274189 0 0 230 0 114759 1 11 1 1 4 4 1 1 485 485 1 1 4 4 Normal
2020-12-03 20:24:30 2887270721 879573039 7808 443 57 0 0 6 10 1 1 5 2 124 14 959 2 1 1 5 2 Normal
2020-12-03 21:24:20 2887270765 387297544 62778 32588 0 110564 0 6 24 36 1 5059 189 41 1 10434 319 27 1 99 4 Normal
2020-12-03 21:31:10 1123633412 1037569569 44031 1328 374 0 20714 6 24 22 1 15 32 22 1 15 32 17 1 15 3 Probing-IP sweep
2020-12-03 21:49:32 2887270775 2887273475 35076 1104 0 109715 99420 6 24 1 1 24 0 3 2 80 0 1 1 8 0 Normal
2020-12-03 22:06:54 2887270719 2887273992 11627 53 695 0 57472 17 5 3 8 22 1 128 53 293 1 1 5 15 1 DDOS-smurf
2020-12-03 22:16:21 2887271426 2887274039 12046 8080 0 762925 29390 6 24 1 1 0 0 31 10 4 0 1 1 0 0 Normal
2020-12-03 22:35:33 2887270775 2887273475 55182 9091 348 0 0 6 24 1 1 31 7 3 2 47 7 1 1 30 7 Normal
2020-12-03 22:37:51 2887270814 2887273992 52280 53 731 248100 0 17 5 2 9 2 0 29 45 80 0 2 8 2 0 Normal
2020-12-03 22:38:49 2887270695 3758096636 57430 5355 646 0 0 17 24 3 1 0 0 26 2 28 0 3 1 0 0 Normal
2020-12-03 22:48:28 3105873214 2887271232 54264 5998 55 337558 0 6 24 2 1 1 208 2 1 1 208 2 1 1 53 Normal
2020-12-03 22:56:20 1123633412 1037569542 44031 139 154 903713 11111 6 20 29 1 6 97 29 1 15 197 19 1 6 4 Normal
2020-12-03 23:03:11 2887274048 167838198 57258 161 419 366 0 17 35 6 1 185 0 21 1 3073 0 4 1 81 0 Normal
2020-12-03 23:12:59 2887270971 134744072 63161 53 364 0 25562 17 5 5 4 188 2 97 13 830 2 2 3 58 2 Normal
2020-12-03 23:15:27 59465216 220415529 55046 443 332 0 22776 6 10 52 1 60 0 128 1 62 0 18 1 21 0 Normal
2020-12-03 23:21:19 2887274048 167838200 49910 161 413 0 51248 17 35 6 1 181 1 21 1 3004 1 5 1 69 1 Normal
2020-12-03 23:23:14 2887270711 2887273473 51280 5080 574 0 0 6 24 6 1 0 0 14 2 3 0 6 1 0 0 Normal
2020-12-03 23:37:04 2887270773 2887191179 64916 443 2672 1360504 105474 6 10 1 1 1 1 25 8 165 1 1 1 1 1 Probing-IP sweep
2020-12-04 00:02:07 2887270803 3423402026 56458 443 639 1799779 109907 6 10 4 1 9 0 57 1 473 0 4 1 9 0 Normal
2020-12-04 00:37:16 2887270825 2887273992 49620 53 2 1423168 12959 17 5 6 4 4 1 31 30 46 1 1 1 1 1 Normal
2020-12-04 01:03:03 2887270711 2887273482 46430 443 437 311589 0 6 10 58 1 140 2 86 1 597 2 23 1 49 2 Normal
2020-12-04 01:10:57 2887274135 2887274010 64286 15002 632 816977 47380 6 24 1 6 1 1 2 17 41 1 1 7 1 1 Normal
2020-12-04 01:49:47 2887270775 2887273726 26046 1886 233 0 560 6 24 253 1 6232 256 388 1 12496 484 24 1 100 5 Probing-Port sweep
2020-12-04 01:52:03 2887270721 2887060099 1310 18001 0 0 0 6 24 3 1 1 1 63 1 29 1 2 1 1 1 Normal
2020-12-04 02:50:45 2887270775 2887273475 47416 2968 210 366217 0 6 24 1 1 10 3 11 2 63 3 1 1 8 3 Normal
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from ipaddress import IPv4Address
import numpy as np
import pandas as pd
def get_ip_area(df: pd.DataFrame, columns: List[str], areas: List[str]) -> pd.DataFrame:
    """
    One-hot encode to which ip address block a certain address belongs to

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    :param areas: the areas to process, one or more of
        `multicast, private, global, unspecified, reserved, loopback, link_local`
    """
    result = pd.DataFrame(index=df.index)

    for column in columns:
        # parse the integer representation once per column
        addresses = df[column].map(IPv4Address)
        for area in areas:
            # each area maps to an `is_<area>` property on IPv4Address
            flag_name = f"is_{area}"
            flags = addresses.map(lambda ip: getattr(ip, flag_name))
            result[f"{column}_{area}"] = flags.astype(bool)

    return result
# noinspection PyUnresolvedReferences
def get_ip_flow_type(df: pd.DataFrame, src: str = "src", dst: str = "dst") -> pd.DataFrame:
    """
    One-hot encoding of source and destination traffic type

    :param df: original cleaned data frame
    :param src: the column name of the source address
    :param dst: the column name of the destination address
    """
    result = pd.DataFrame(index=df.index)

    src_is_global = df[src].map(lambda addr: IPv4Address(addr).is_global)
    dst_is_global = df[dst].map(lambda addr: IPv4Address(addr).is_global)

    # the four combinations of (src global?, dst global?)
    result["is_inter"] = (src_is_global & dst_is_global).astype(bool)
    result["is_ingress"] = (src_is_global & ~dst_is_global).astype(bool)
    result["is_egress"] = (~src_is_global & dst_is_global).astype(bool)
    result["is_intra"] = (~src_is_global & ~dst_is_global).astype(bool)

    # ensure one-hot encoding: exactly one flow type per row
    assert np.all(result.sum(axis=1) == 1)

    return result
def get_ip_binary_match(df: pd.DataFrame, src: str = "src", dst: str = "dst") -> pd.DataFrame:
    """
    Compute the number of bits that are equal in src and dst,
    from left to right, stopping at the first mismatch,
    which helps identifying how similar two addresses are

    :param df: original cleaned data frame
    :param src: the column name of the source address
    :param dst: the column name of the destination address
    """
    pair = df[[src, dst]].to_numpy()

    # the XOR is non-zero from the first differing bit onwards, so the
    # number of bits needed to represent it is the length of the differing
    # suffix, and 32 minus that length is the shared prefix length
    diff = np.bitwise_xor(pair[:, 0], pair[:, 1])
    shared_prefix = 32 - np.ceil(np.log2(diff + 1)).astype("uint8")

    result = pd.DataFrame(index=df.index)
    result["ip_match"] = shared_prefix
    return result
def get_ip_one_bits(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Transform every number (preferably, ip address) to its number of set bits

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    """
    # copy the selected columns and overwrite the values with their popcounts
    counts = df.loc[:, columns]
    counts[:] = _hamming_weight(counts.to_numpy())
    return counts.add_suffix("_one_bits").astype("uint8")
def get_ip_parts(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Get the four individual parts of an ip address, indexed from lowest to highest,
    e.g. `192.168.0.1` becomes `{"_0": 1, "_1": 0, "_2": 168, "_3": 192}`

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    """
    result = pd.DataFrame(index=df.index)

    for column in columns:
        values = df[column].to_numpy()
        # extract each octet by shifting it into the low byte and masking
        for part in range(4):
            result[f"{column}_{part}"] = (values >> (8 * part)) & 0xFF

    return result.astype("uint8")
def get_port_area(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    One-hot encode port numbers to their respective area

    ```
    common:     0     to 1023
    registered: 1024  to 49151
    ephemeral:  49152 to 65535
    ```

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["spt", "dpt"]`)
    """
    result = pd.DataFrame(index=df.index)

    for column in columns:
        ports = df[column]
        result[f"{column}_common"] = ports.le(1023).astype(bool)
        result[f"{column}_registered"] = (ports.ge(1024) & ports.le(49151)).astype(bool)
        result[f"{column}_ephemeral"] = ports.ge(49152).astype(bool)

    return result
# noinspection PyUnresolvedReferences
def get_port_match(df: pd.DataFrame, spt: str = "spt", dpt: str = "dpt") -> pd.DataFrame:
    """
    A column that is one if the src and dst port match

    :param df: original cleaned data frame
    :param spt: the column name of the source port
    :param dpt: the column name of the destination port
    """
    result = pd.DataFrame(index=df.index)
    result["port_match"] = df[spt].eq(df[dpt]).astype(bool)
    return result
# bit masks for the 64-bit SWAR popcount in `_hamming_weight` below
HAM_M1 = np.uint64(0x5555555555555555)  # alternating single bits: 0101...
HAM_M2 = np.uint64(0x3333333333333333)  # alternating 2-bit groups: 0011...
HAM_M4 = np.uint64(0x0f0f0f0f0f0f0f0f)  # alternating 4-bit groups: 00001111...
HAM_H01 = np.uint64(0x0101010101010101)  # one set bit per byte, used to sum all bytes
def _hamming_weight(x):
    """
    efficient implementation of the hamming weight to
    find the number of one bits of a number
    (c) https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
    """
    # NOTE(review): the masks are uint64 while `x` typically arrives as an
    # int64 numpy array from pandas — relies on numpy promoting the scalar
    # masks to the array dtype; confirm on the numpy version in use
    x = x - ((x >> 1) & HAM_M1)  # count bits within each 2-bit chunk
    x = (x & HAM_M2) + ((x >> 2) & HAM_M2)  # sum adjacent 2-bit counts into 4-bit chunks
    x = (x + (x >> 4)) & HAM_M4  # sum adjacent 4-bit counts into bytes
    x = (x * HAM_H01) >> 56  # the multiply adds all byte counts into the top byte
    return x
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
def get_cnt_ratios(df: pd.DataFrame, unsafe: bool = False, fill: float = 0) -> pd.DataFrame:
    """
    Get various ratios between cnt values.

    This function expects the following column names to be present:
        cnt_src, cnt_src_slow, cnt_src_conn,
        cnt_dst, cnt_dst_slow, cnt_dst_conn,
        cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
        cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    :param df: the original cleaned data frame
    :param unsafe: also compute ratios for columns where the divisor can be zero
    :param fill: fill invalid divisions with this number
    """
    res = pd.DataFrame(index=df.index)

    # ratios relative to slow and conn
    ##################################

    assert np.all(df["cnt_src_slow"] > 0)
    assert np.all(df["cnt_src_conn"] > 0)
    res["relative_cnt_src_to_slow"] = df["cnt_src"] / df["cnt_src_slow"]
    res["relative_cnt_src_to_conn"] = df["cnt_src"] / df["cnt_src_conn"]

    assert np.all(df["cnt_dst_slow"] > 0)
    assert np.all(df["cnt_dst_conn"] > 0)
    res["relative_cnt_dst_to_slow"] = df["cnt_dst"] / df["cnt_dst_slow"]
    res["relative_cnt_dst_to_conn"] = df["cnt_dst"] / df["cnt_dst_conn"]

    # the serv counterparts can be zero, so they are only
    # computed on demand, guarded against division by zero
    # assert np.all(df["cnt_serv_src_slow"] > 0) # FALSE
    # assert np.all(df["cnt_serv_src_conn"] > 0) # FALSE
    if unsafe:
        res["relative_cnt_serv_src_to_slow"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_slow"], fill)
        res["relative_cnt_serv_src_to_conn"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_conn"], fill)

    # assert np.all(df["cnt_serv_dst_slow"] > 0) # FALSE
    # assert np.all(df["cnt_serv_dst_conn"] > 0) # FALSE
    if unsafe:
        res["relative_cnt_serv_dst_to_slow"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_slow"], fill)
        res["relative_cnt_serv_dst_to_conn"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_conn"], fill)

    # src / dst ratios
    ##################

    assert np.all(df["cnt_dst"] > 0)
    assert np.all(df["cnt_dst_slow"] > 0)
    # bugfix: this previously re-checked cnt_dst_slow a second time instead of
    # cnt_dst_conn, even though cnt_dst_conn is the divisor two lines below
    assert np.all(df["cnt_dst_conn"] > 0)
    res["ratio_cnt_src_dst"] = df["cnt_src"] / df["cnt_dst"]
    res["ratio_cnt_src_dst_slow"] = df["cnt_src_slow"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_dst_conn"] = df["cnt_src_conn"] / df["cnt_dst_conn"]

    # assert np.all(df["cnt_serv_dst"] > 0) # FALSE
    # assert np.all(df["cnt_serv_dst_slow"] > 0) # FALSE
    # assert np.all(df["cnt_serv_dst_conn"] > 0) # FALSE
    if unsafe:
        res["ratio_cnt_serv_src_dst"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_dst"], fill)
        res["ratio_cnt_serv_src_dst_slow"] = _finite_divide(df["cnt_serv_src_slow"], df["cnt_serv_dst_slow"], fill)
        res["ratio_cnt_serv_src_dst_conn"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_dst_conn"], fill)

    # conn to slow ratios
    #####################

    assert np.all(df["cnt_dst_slow"] > 0)
    assert np.all(df["cnt_src_slow"] > 0)
    res["ratio_cnt_dst_conn_slow"] = df["cnt_dst_conn"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_conn_slow"] = df["cnt_src_conn"] / df["cnt_src_slow"]

    # assert np.all(df["cnt_serv_src_slow"] > 0) # FALSE
    # assert np.all(df["cnt_serv_dst_slow"] > 0) # FALSE
    if unsafe:
        res["ratio_cnt_serv_src_conn_slow"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_src_slow"], fill)
        res["ratio_cnt_serv_dst_conn_slow"] = _finite_divide(df["cnt_serv_dst_conn"], df["cnt_serv_dst_slow"], fill)

    res = res.astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    assert np.all(np.isfinite(res)) and np.all(res < 1e+12)

    return res
def get_in_out_ratios(df: pd.DataFrame, fill: float = 0) -> pd.DataFrame:
    """
    Get ratios between in and out traffic counts.

    This function expects the following column names to be present:
        in (bytes), out (bytes), duration

    :param df: the original cleaned data frame
    :param fill: fill invalid divisions with this number
    """
    res = pd.DataFrame(index=df.index)

    # map each output column to its (numerator, denominator) pair
    ratios = {
        "in_bytes_per_duration": ("in (bytes)", "duration"),
        "out_bytes_per_duration": ("out (bytes)", "duration"),
        "ratio_in_out_bytes": ("in (bytes)", "out (bytes)"),
    }
    for name, (numerator, denominator) in ratios.items():
        res[name] = _finite_divide(df[numerator], df[denominator], fill)

    res = res.astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    assert np.all(np.isfinite(res)) and np.all(res < 1e+12)

    return res
def get_cnt_distances(df: pd.DataFrame) -> pd.DataFrame:
    """
    Get various distances between cnt values.

    This function expects the following column names to be present:
        cnt_src, cnt_src_slow, cnt_src_conn,
        cnt_dst, cnt_dst_slow, cnt_dst_conn,
        cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
        cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    :param df: the original cleaned data frame
    """
    # use a signed type so that differences may become negative
    df = df.astype("int64")
    res = pd.DataFrame(index=df.index)

    # (output column, minuend, subtrahend) triples:
    # first the src/dst differences, then the conn/slow differences
    differences = [
        ("diff_cnt_src_dst", "cnt_src", "cnt_dst"),
        ("diff_cnt_src_dst_slow", "cnt_src_slow", "cnt_dst_slow"),
        ("diff_cnt_src_dst_conn", "cnt_src_conn", "cnt_dst_conn"),
        ("diff_cnt_serv_src_dst", "cnt_serv_src", "cnt_serv_dst"),
        ("diff_cnt_serv_src_dst_slow", "cnt_serv_src_slow", "cnt_serv_dst_slow"),
        ("diff_cnt_serv_src_dst_conn", "cnt_serv_src_conn", "cnt_serv_dst_conn"),
        ("diff_dst_conn_slow", "cnt_dst_conn", "cnt_dst_slow"),
        ("diff_src_conn_slow", "cnt_src_conn", "cnt_src_slow"),
        ("diff_serv_src_conn_slow", "cnt_serv_src_conn", "cnt_serv_src_slow"),
        ("diff_serv_dst_conn_slow", "cnt_serv_dst_conn", "cnt_serv_dst_slow"),
    ]
    for name, minuend, subtrahend in differences:
        res[name] = df[minuend] - df[subtrahend]

    return res
def _finite_divide(a: np.ndarray, b: np.ndarray, fill: float = 0) -> np.ndarray:
"""Divides `a / b` but will fix `0 / 0` and `1 / 0` to `fill` (default: 0)"""
with np.errstate(divide="ignore", invalid="ignore"):
c = np.true_divide(a, b)
c[c == np.inf] = fill
c = np.nan_to_num(c, nan=fill)
return c
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from functools import partial
from multiprocessing import Pool, cpu_count
import numpy as np
import pandas as pd
def get_segment_features(df_clean: pd.DataFrame, df_ip: pd.DataFrame, groups: List[str], lag: int,
                         parallel: bool = True) -> pd.DataFrame:
    """
    Group the data frame according to the specified `groups`, then, within each group,
    cluster rows together that are consecutive to one another (according to the `lag` param),
    and assign the segment length, and the number of unique spt, dpt, app per segment.

    Example: For `times = [1, 3, 4, 5, 7, 8, 15]` we would create the segments
    `[[1], [3, 4, 5], [7, 8], [15]]` with segment lengths `[1, 3, 2, 1]` with `lag = 1`
    and the number of unique spt, dpt, app per segment.
    Larger values for `lag` will also cluster points farther apart into one segment, e.g.
    with `lag = 2` we would create the segments `[[1, 3, 4, 5, 7, 8], [15]]`

    :param df_clean: the original cleaned data frame
    :param df_ip: the "ip_parts" features data frame
    :param groups: a list of columns to group by
    :param lag: the minimum difference between observations to form a segment
    :param parallel: whether to process groups in parallel or not
    """
    merged = pd.concat([df_clean, df_ip], axis="columns")

    # represent time as unix seconds and order chronologically
    merged["time"] = merged["time"].astype(np.int64) // 10 ** 9
    merged = merged.sort_values("time")

    partitions = [partition for _, partition in merged.groupby(groups, as_index=False)]
    worker = partial(_assign_segment_metrics, lag=lag)  # fix the lag parameter

    if parallel:
        with Pool(cpu_count()) as pool:
            segments = pool.map(worker, partitions)
    else:
        segments = map(worker, partitions)

    # restore the original row order of the input frames
    return pd.concat(segments).sort_index()
def _assign_segment_metrics(group: pd.DataFrame, lag: int = 1):
assert lag >= 1
assert len(group) > 0
assert group["time"].is_monotonic_increasing
# append new columns with some default values
columns = ["segment_length", "nunique_dst", "nunique_spt", "nunique_dpt", "nunique_app"]
result = pd.DataFrame(1, dtype=int, index=group.index, columns=columns)
# get the time segment splits
time = group["time"].to_numpy()
splits = np.flatnonzero(np.diff(time) > lag) + 1
splits = np.insert(splits, 0, 0)
dim, nsegments = len(time), len(splits)
# pre-cache column locations to make the subsequent loop faster
loc_segment_length = result.columns.get_loc("segment_length")
loc_nunique_dst = result.columns.get_loc("nunique_dst")
loc_nunique_spt = result.columns.get_loc("nunique_spt")
loc_nunique_dpt = result.columns.get_loc("nunique_dpt")
loc_nunique_app = result.columns.get_loc("nunique_app")
loc_dst = group.columns.get_loc("dst")
loc_spt = group.columns.get_loc("spt")
loc_dpt = group.columns.get_loc("dpt")
loc_app = group.columns.get_loc("app")
for i in range(nsegments):
lo = splits[i]
hi = splits[i + 1] if i + 1 < nsegments else dim
slc = slice(lo, hi - 1) # segment slice
# assign the segment length for the entire group
result.iloc[slc, loc_segment_length] = hi - lo
# compute number of unique spt, dpt, app fields for this segment
result.iloc[slc, loc_nunique_dst] = group.iloc[slc, loc_dst].nunique()
result.iloc[slc, loc_nunique_spt] = group.iloc[slc, loc_spt].nunique()
result.iloc[slc, loc_nunique_dpt] = group.iloc[slc, loc_dpt].nunique()
result.iloc[slc, loc_nunique_app] = group.iloc[slc, loc_app].nunique()
return result
numpy>=1.19
pandas>=1.2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment