Skip to content

Instantly share code, notes, and snippets.

@blu3r4y
Created May 18, 2021 09:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save blu3r4y/dc2b2d305ff0cdd995ddcac70330af7e to your computer and use it in GitHub Desktop.
Save blu3r4y/dc2b2d305ff0cdd995ddcac70330af7e to your computer and use it in GitHub Desktop.
Features used by Dynatrace - SAL - LIT.AI.JKU in the NAD 2021 challenge
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pandas as pd
import feature_ip_port as ipp
import feature_ratios as rat
import feature_segments as seg
# load the example flow records, parse the timestamp column, and drop the
# ground-truth label so only raw observations enter the feature extraction
df = pd.read_csv("example.csv", parse_dates=["time"]).drop("label", axis="columns")
# build the full feature matrix by column-wise concatenation of all
# feature families: ip/port encodings, count ratios/distances, segments
features = pd.concat([
ipp.get_ip_area(df, columns=["src", "dst"], areas=["global", "link_local", "unspecified"]),
ipp.get_ip_flow_type(df, src="src", dst="dst"),
ipp.get_ip_binary_match(df, src="src", dst="dst"),
ipp.get_ip_one_bits(df, columns=["src", "dst"]),
ipp.get_ip_parts(df, columns=["src", "dst"]),
ipp.get_port_area(df, columns=["spt", "dpt"]),
ipp.get_port_match(df, spt="spt", dpt="dpt"),
rat.get_cnt_ratios(df, unsafe=False, fill=0),
rat.get_in_out_ratios(df, fill=0),
rat.get_cnt_distances(df),
seg.get_segment_features(df, ipp.get_ip_parts(df, columns=["src", "dst"]),
groups=["src", "dst_3", "dst_2", "dst_1"], lag=1, parallel=False)
], axis="columns")
# take a glimpse at the data
print(features.head().T)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "opponent-exhaust",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import feature_ip_port as ipp\n",
"import feature_ratios as rat\n",
"import feature_segments as seg"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "italic-opposition",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"example.csv\", parse_dates=[\"time\"]).drop(\"label\", axis=\"columns\")\n",
"\n",
"features = pd.concat([\n",
" ipp.get_ip_area(df, columns=[\"src\", \"dst\"], areas=[\"global\", \"link_local\", \"unspecified\"]),\n",
" ipp.get_ip_flow_type(df, src=\"src\", dst=\"dst\"),\n",
" ipp.get_ip_binary_match(df, src=\"src\", dst=\"dst\"),\n",
" ipp.get_ip_one_bits(df, columns=[\"src\", \"dst\"]),\n",
" ipp.get_ip_parts(df, columns=[\"src\", \"dst\"]),\n",
" ipp.get_port_area(df, columns=[\"spt\", \"dpt\"]),\n",
" ipp.get_port_match(df, spt=\"spt\", dpt=\"dpt\"),\n",
" rat.get_cnt_ratios(df, unsafe=False, fill=0),\n",
" rat.get_in_out_ratios(df, fill=0),\n",
" rat.get_cnt_distances(df),\n",
" seg.get_segment_features(df, ipp.get_ip_parts(df, columns=[\"src\", \"dst\"]),\n",
" groups=[\"src\", \"dst_3\", \"dst_2\", \"dst_1\"], lag=1, parallel=False)\n",
"], axis=\"columns\")\n",
"\n",
"# take a glimpse at the data\n",
"features.T"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
time src dst spt dpt duration out (bytes) in (bytes) proto app cnt_dst cnt_src cnt_serv_src cnt_serv_dst cnt_dst_slow cnt_src_slow cnt_serv_src_slow cnt_serv_dst_slow cnt_dst_conn cnt_src_conn cnt_serv_src_conn cnt_serv_dst_conn label
2020-12-03 04:42:51 2887270751 2887273992 37103 53 0 516870 43627 17 5 1 3 2 0 2 22 229 0 1 5 2 0 Probing-Nmap
2020-12-03 07:16:06 2887274048 167837957 53582 161 64 6559 0 17 35 5 1 131 1 21 2 3001 1 4 1 98 1 Probing-Port sweep
2020-12-03 07:29:07 2887270690 134744072 4324 53 427 107636 0 17 5 1 2 7 0 32 8 1269 0 2 3 21 0 Normal
2020-12-03 07:41:14 2887274048 167838199 53306 161 0 0 0 17 35 5 1 112 0 21 1 3037 0 4 1 70 0 Normal
2020-12-03 07:44:00 2887270786 134744072 0 0 0 178651 0 1 11 1 2 5 5 5 8 746 746 1 3 11 11 Normal
2020-12-03 07:53:57 2887270776 2887273992 60846 53 820 292791 0 17 5 1 13 0 0 52 61 167 0 1 8 1 1 Normal
2020-12-03 09:12:28 2887270733 134744072 60235 53 0 433203 0 17 5 1 4 0 0 23 14 739 0 1 3 0 0 Normal
2020-12-03 10:11:42 2887274003 2887273982 49980 161 0 0 47106 17 35 1 1 0 0 12 1 400 0 1 1 0 0 Probing-Port sweep
2020-12-03 10:17:48 2887274003 167837961 35740 161 230 652054 0 17 35 4 1 3 0 12 2 400 0 5 1 4 0 Probing-IP sweep
2020-12-03 11:42:37 2887270719 2887273992 51223 53 590 0 62349 17 5 6 10 36 1 89 52 87 1 3 7 28 1 Normal
2020-12-03 12:00:48 2887270690 879905829 55718 443 975 535170 0 6 10 3 1 0 0 43 1 179 0 2 1 2 2 Normal
2020-12-03 12:01:58 2887270803 2887274010 52860 15002 251 0 9322 6 24 1 9 0 0 23 42 47 0 1 5 0 0 Normal
2020-12-03 12:04:24 2887271429 879896140 50007 3478 611 0 11193 17 24 2 1 0 0 39 1 3 0 1 1 0 0 Probing-Nmap
2020-12-03 12:09:27 2887270791 2887271430 44421 3007 401 0 59202 6 24 4 1 2 219 34 1 6 249 4 1 2 11 Normal
2020-12-03 12:11:00 2887270776 2887273992 53246 53 0 193804 0 17 5 2 8 13 0 48 60 100 0 2 4 0 0 Normal
2020-12-03 12:20:31 2887270775 2887273543 0 0 0 0 9192 1 11 256 1 3256 8 256 1 3256 8 27 1 95 3 Normal
2020-12-03 12:25:38 2887270721 2726122003 12793 443 130 0 0 6 10 3 1 6 0 284 2 2346 0 2 1 3 0 Normal
2020-12-03 12:31:08 2887270786 134744072 0 0 690 201966 0 1 11 1 3 8 8 2 13 749 749 1 3 2 2 Normal
2020-12-03 12:33:38 2887270973 2007043601 3203 443 244 0 50333 6 10 1 1 2 2 256 2 1507 2 1 1 2 2 Normal
2020-12-03 12:59:31 2887270791 2887271305 43422 555 76 0 49478 6 24 7 1 26 340 258 1 29 825 7 1 2 10 Normal
2020-12-03 13:01:33 2887270791 2887271304 43423 5061 0 0 0 6 24 7 1 19 360 258 1 20 996 7 1 0 7 Normal
2020-12-03 13:16:58 2887270718 2887191178 49971 443 141 0 15505 6 10 1 1 1 1 77 2 669 1 1 1 1 1 Normal
2020-12-03 13:19:31 2887271434 2887273992 32876 53 756 0 0 17 5 1 9 0 0 5 55 98 0 1 8 0 0 Normal
2020-12-03 13:20:05 2887270791 2887274085 33938 9929 263 489289 0 6 24 23 1 3 185 23 1 7 398 18 1 1 6 Probing-Port sweep
2020-12-03 13:22:41 2887270766 2887273992 52751 389 53 347788 0 6 24 2 10 0 0 86 44 2 0 2 8 1 1 Normal
2020-12-03 13:24:21 2887274048 167838202 43313 161 227 0 100373 17 35 6 1 139 1 21 1 3014 1 3 1 65 1 Probing-Port sweep
2020-12-03 13:27:20 2887270791 2887273518 38447 443 424 0 8667 6 10 18 1 20 1 160 1 743 1 4 1 3 0 Normal
2020-12-03 13:34:01 2887270773 599449625 60020 443 40 85926 10580 6 10 2 1 11 2 32 2 94 2 2 1 11 2 Normal
2020-12-03 13:36:29 2887270695 2887273992 56041 53 238 579606 0 17 5 3 9 9 1 31 49 297 1 1 5 9 1 Normal
2020-12-03 13:45:54 2887270707 2887273992 56286 53 711 0 0 17 5 2 4 43 1 88 27 89 1 2 3 43 1 Normal
2020-12-03 13:52:01 2887270800 2887273992 60715 53 0 0 0 17 5 2 11 0 0 42 49 135 0 1 8 0 0 Normal
2020-12-03 13:53:22 2887270702 2887273992 55013 53 573 0 7914 17 5 2 6 1 1 16 50 82 1 2 3 1 1 Normal
2020-12-03 13:56:48 2887270721 2887273992 51926 53 0 0 25147 17 5 2 6 9 1 290 44 1353 3 2 6 9 1 Normal
2020-12-03 14:15:24 2887270690 134744072 60007 53 560 94236 0 17 5 1 3 5 2 42 12 1345 2 1 3 5 2 DDOS-smurf
2020-12-03 14:21:56 2887270695 3232243202 54711 53 505 0 64957 17 5 1 1 2 2 31 1 276 2 1 1 2 2 Normal
2020-12-03 14:38:05 2887270719 2887273992 57507 53 1155 0 0 17 5 1 5 1 0 196 42 375 0 1 5 1 0 Normal
2020-12-03 15:08:45 2887271433 1755049120 54908 9980 0 348644 0 6 24 2 2 2 2 4 2 80 2 2 1 2 2 Probing-Nmap
2020-12-03 15:17:08 2887274048 167837955 57681 161 255 424427 17477 17 35 7 1 167 1 21 2 3086 1 4 1 61 1 DDOS-smurf
2020-12-03 15:25:58 2887270733 134744072 56760 53 0 0 1214 17 5 2 3 3 0 23 12 797 0 1 3 3 0 DDOS-smurf
2020-12-03 15:41:42 2887270733 134744072 59899 53 179 33008 0 17 5 3 4 22 1 33 12 1011 1 1 3 13 1 Normal
2020-12-03 15:45:21 2887274048 167838202 53466 161 738 0 0 17 35 7 1 195 1 21 1 2975 1 5 1 45 0 Normal
2020-12-03 16:03:09 2887270733 134744072 58023 53 0 44753 24036 17 5 1 3 23 1 71 10 887 1 1 3 20 0 Probing-Nmap
2020-12-03 16:03:50 2887270749 2472317572 49663 443 0 504157 0 6 10 1 1 1 1 26 6 131 1 1 1 1 1 Probing-Nmap
2020-12-03 16:07:51 2887270713 2887273992 59468 53 444 514854 79828 17 5 1 7 99 0 87 46 258 0 1 5 50 0 Normal
2020-12-03 16:09:02 2887271198 2887274010 4831 15002 0 211426 72762 6 24 1 2 0 0 2 39 39 0 1 2 0 0 Probing-Nmap
2020-12-03 16:13:42 2887271433 792239461 38558 22000 0 0 0 6 24 2 1 2 2 11 2 8 2 2 1 2 2 Normal
2020-12-03 16:16:29 2887270949 400236807 5012 443 64 223690 0 6 10 2 1 13 1 75 1 509 1 2 1 14 2 Normal
2020-12-03 16:27:30 2887274002 134744072 52748 53 0 0 19953 17 5 1 3 18 0 1 11 309 0 1 2 14 0 Probing-IP sweep
2020-12-03 16:30:17 2887270880 2887273992 55273 53 0 365406 52995 17 5 2 6 3 1 52 43 273 1 1 4 1 1 Normal
2020-12-03 16:51:05 2887270721 2887273992 57952 389 429 0 24905 17 24 2 6 4 0 44 44 66 0 2 5 4 0 Normal
2020-12-03 16:56:42 2887270766 679349478 54465 443 193 1184868 43560 6 10 1 1 0 0 12 2 21 0 1 1 0 0 Probing-Nmap
2020-12-03 17:05:44 2887270702 2887274039 54152 8080 250 1238058 0 6 24 2 2 20 1 50 19 89 1 1 2 20 1 Normal
2020-12-03 17:13:24 2887270690 3639551595 62882 53 234 1036442 29645 17 5 6 1 26 0 48 1 1286 0 5 1 23 0 Normal
2020-12-03 17:21:38 2887270775 2887273525 54014 9091 0 0 0 6 24 9 1 7 3 11 1 7 3 7 1 2 1 Normal
2020-12-03 17:30:09 2887270791 2887274002 33938 23502 36 756026 31962 6 24 27 2 0 358 27 33 0 1115 18 1 1 6 Normal
2020-12-03 17:30:16 2887274048 167838202 44714 161 441 412807 19311 17 35 5 1 101 1 21 1 3099 1 3 1 63 1 Normal
2020-12-03 17:51:12 2887270690 3627731182 58941 443 0 105819 28366 17 24 3 1 2 2 36 15 164 2 3 1 2 2 Normal
2020-12-03 17:52:15 2887274048 167838201 50287 161 117 0 9899 17 35 6 1 103 0 21 1 3068 0 5 1 57 0 Probing-IP sweep
2020-12-03 17:59:43 3105873370 1746059117 63891 3221 331 0 56057 6 24 5 1 0 185 64 1 5 298 4 1 0 11 Normal
2020-12-03 18:03:19 2887271299 887498523 53588 443 64 0 34686 6 10 2 1 10 0 23 3 150 0 2 1 5 0 Normal
2020-12-03 18:31:28 2887271429 677035416 6755 443 2011 0 20473 6 10 8 1 19 1 107 5 590 1 3 1 7 1 Normal
2020-12-03 18:33:13 2887271434 2887273992 56338 53 447 17475 10263 17 5 1 7 4 0 4 44 98 0 1 7 4 0 Normal
2020-12-03 18:37:14 2887270791 2887274035 33938 8222 0 225679 74265 6 24 128 1 9 374 128 3 34 1159 21 1 0 3 Normal
2020-12-03 18:37:59 167838205 2887274082 1027 6343 338 0 0 17 24 1 1 5 5 1 4 1199 1199 1 1 9 9 Normal
2020-12-03 18:50:47 2887270775 2887273527 26046 1886 0 0 75629 6 24 8 1 1722 223 388 2 27353 499 8 1 98 11 Probing-Nmap
2020-12-03 18:51:27 3116346117 2887273528 34420 1864 184 1017498 0 6 24 7 1 7 0 32 3 8 0 7 1 2 0 Normal
2020-12-03 18:54:06 2887270775 2887273475 56296 9594 0 1113505 0 6 24 1 1 26 2 3 2 75 2 1 1 10 2 Probing-Nmap
2020-12-03 18:57:39 167837960 2887273992 6434 123 399 0 26336 17 24 1 9 1 1 1 44 39 1 1 9 1 1 Normal
2020-12-03 19:01:35 2887270702 2887274010 56507 15002 75 8341 17981 6 24 1 10 1 1 31 37 41 1 1 7 1 1 DDOS-smurf
2020-12-03 19:13:30 2887273729 2887274002 58822 53 126 414050 0 17 5 1 1 0 0 5 27 55 0 1 1 0 0 Normal
2020-12-03 19:24:07 2887270765 311247748 0 0 827 0 764 1 11 249 1 340 1 256 1 347 1 77 1 77 0 DDOS-smurf
2020-12-03 19:27:19 2887274048 167838205 59247 161 369 956821 0 17 35 7 2 122 1 21 2 3018 1 3 2 62 1 Normal
2020-12-03 19:31:37 2887271434 599639059 60500 10001 41 74904 92254 6 24 6 1 160 0 9 1 160 0 6 1 93 0 Normal
2020-12-03 19:40:45 2887270775 2887273547 26046 1886 0 360049 0 6 24 27 1 3496 152 163 2 7818 280 23 1 100 4 Probing-Nmap
2020-12-03 20:03:33 2887270739 391226437 51482 443 273 0 0 6 10 6 1 15 0 91 1 538 0 6 1 11 1 Normal
2020-12-03 20:22:51 2887189907 2887274189 0 0 230 0 114759 1 11 1 1 4 4 1 1 485 485 1 1 4 4 Normal
2020-12-03 20:24:30 2887270721 879573039 7808 443 57 0 0 6 10 1 1 5 2 124 14 959 2 1 1 5 2 Normal
2020-12-03 21:24:20 2887270765 387297544 62778 32588 0 110564 0 6 24 36 1 5059 189 41 1 10434 319 27 1 99 4 Normal
2020-12-03 21:31:10 1123633412 1037569569 44031 1328 374 0 20714 6 24 22 1 15 32 22 1 15 32 17 1 15 3 Probing-IP sweep
2020-12-03 21:49:32 2887270775 2887273475 35076 1104 0 109715 99420 6 24 1 1 24 0 3 2 80 0 1 1 8 0 Normal
2020-12-03 22:06:54 2887270719 2887273992 11627 53 695 0 57472 17 5 3 8 22 1 128 53 293 1 1 5 15 1 DDOS-smurf
2020-12-03 22:16:21 2887271426 2887274039 12046 8080 0 762925 29390 6 24 1 1 0 0 31 10 4 0 1 1 0 0 Normal
2020-12-03 22:35:33 2887270775 2887273475 55182 9091 348 0 0 6 24 1 1 31 7 3 2 47 7 1 1 30 7 Normal
2020-12-03 22:37:51 2887270814 2887273992 52280 53 731 248100 0 17 5 2 9 2 0 29 45 80 0 2 8 2 0 Normal
2020-12-03 22:38:49 2887270695 3758096636 57430 5355 646 0 0 17 24 3 1 0 0 26 2 28 0 3 1 0 0 Normal
2020-12-03 22:48:28 3105873214 2887271232 54264 5998 55 337558 0 6 24 2 1 1 208 2 1 1 208 2 1 1 53 Normal
2020-12-03 22:56:20 1123633412 1037569542 44031 139 154 903713 11111 6 20 29 1 6 97 29 1 15 197 19 1 6 4 Normal
2020-12-03 23:03:11 2887274048 167838198 57258 161 419 366 0 17 35 6 1 185 0 21 1 3073 0 4 1 81 0 Normal
2020-12-03 23:12:59 2887270971 134744072 63161 53 364 0 25562 17 5 5 4 188 2 97 13 830 2 2 3 58 2 Normal
2020-12-03 23:15:27 59465216 220415529 55046 443 332 0 22776 6 10 52 1 60 0 128 1 62 0 18 1 21 0 Normal
2020-12-03 23:21:19 2887274048 167838200 49910 161 413 0 51248 17 35 6 1 181 1 21 1 3004 1 5 1 69 1 Normal
2020-12-03 23:23:14 2887270711 2887273473 51280 5080 574 0 0 6 24 6 1 0 0 14 2 3 0 6 1 0 0 Normal
2020-12-03 23:37:04 2887270773 2887191179 64916 443 2672 1360504 105474 6 10 1 1 1 1 25 8 165 1 1 1 1 1 Probing-IP sweep
2020-12-04 00:02:07 2887270803 3423402026 56458 443 639 1799779 109907 6 10 4 1 9 0 57 1 473 0 4 1 9 0 Normal
2020-12-04 00:37:16 2887270825 2887273992 49620 53 2 1423168 12959 17 5 6 4 4 1 31 30 46 1 1 1 1 1 Normal
2020-12-04 01:03:03 2887270711 2887273482 46430 443 437 311589 0 6 10 58 1 140 2 86 1 597 2 23 1 49 2 Normal
2020-12-04 01:10:57 2887274135 2887274010 64286 15002 632 816977 47380 6 24 1 6 1 1 2 17 41 1 1 7 1 1 Normal
2020-12-04 01:49:47 2887270775 2887273726 26046 1886 233 0 560 6 24 253 1 6232 256 388 1 12496 484 24 1 100 5 Probing-Port sweep
2020-12-04 01:52:03 2887270721 2887060099 1310 18001 0 0 0 6 24 3 1 1 1 63 1 29 1 2 1 1 1 Normal
2020-12-04 02:50:45 2887270775 2887273475 47416 2968 210 366217 0 6 24 1 1 10 3 11 2 63 3 1 1 8 3 Normal
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from ipaddress import IPv4Address
import numpy as np
import pandas as pd
def get_ip_area(df: pd.DataFrame, columns: List[str], areas: List[str]) -> pd.DataFrame:
    """
    One-hot encode which ip address block each address belongs to.

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    :param areas: the areas to process, one or more of
        `multicast, private, global, unspecified, reserved, loopback, link_local`
    """
    encoded = pd.DataFrame(index=df.index)
    for col in columns:
        # parse the integer representation once per column
        addresses = df[col].map(IPv4Address)
        for area in areas:
            # bind the attribute name as a default to avoid late-binding issues
            flags = addresses.apply(lambda ip, attr=f"is_{area}": getattr(ip, attr))
            encoded[f"{col}_{area}"] = flags.astype(bool)
    return encoded
# noinspection PyUnresolvedReferences
def get_ip_flow_type(df: pd.DataFrame, src: str = "src", dst: str = "dst") -> pd.DataFrame:
    """
    One-hot encoding of source and destination traffic type.

    :param df: original cleaned data frame
    :param src: the column name of the source address
    :param dst: the column name of the destination address
    """
    src_is_global = df[src].apply(lambda value: IPv4Address(value).is_global)
    dst_is_global = df[dst].apply(lambda value: IPv4Address(value).is_global)

    flows = pd.DataFrame(index=df.index)
    flows["is_inter"] = (src_is_global & dst_is_global).astype(bool)
    flows["is_ingress"] = (src_is_global & ~dst_is_global).astype(bool)
    flows["is_egress"] = (~src_is_global & dst_is_global).astype(bool)
    flows["is_intra"] = (~src_is_global & ~dst_is_global).astype(bool)

    # the four categories are mutually exclusive, so exactly one must be set
    assert np.all(np.sum(flows, axis=1) == 1)
    return flows
def get_ip_binary_match(df: pd.DataFrame, src: str = "src", dst: str = "dst") -> pd.DataFrame:
    """
    Compute the number of bits that are equal in src and dst,
    from left to right, stopping at the first mismatch,
    which helps identifying how similar two addresses are.

    :param df: original cleaned data frame
    :param src: the column name of the source address
    :param dst: the column name of the destination address
    """
    pairs = df[[src, dst]].to_numpy()

    # xor reveals the position of the highest mismatching bit,
    # from which we derive the length of the shared leading prefix
    mismatch = np.bitwise_xor(pairs[:, 0], pairs[:, 1])
    shared = 32 - np.ceil(np.log2(mismatch + 1)).astype("uint8")

    return pd.DataFrame({"ip_match": shared}, index=df.index)
def get_ip_one_bits(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Transform every number (preferably, ip address) to its number of set bits.

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    """
    selected = df.loc[:, columns]
    # replace the raw values in-place with their population counts
    selected[:] = _hamming_weight(selected.to_numpy())
    return selected.add_suffix("_one_bits").astype("uint8")
def get_ip_parts(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Get the four individual parts of an ip address, indexed from lowest to highest,
    e.g. `192.168.0.1` becomes `{"_0": 1, "_1": 0, "_2": 168, "_3": 192}`.

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["src", "dst"]`)
    """
    parts = pd.DataFrame(index=df.index)
    for col in columns:
        values = df[col].to_numpy()
        # extract each octet by shifting it down and masking the low byte
        for octet in range(4):
            parts[f"{col}_{octet}"] = (values >> (8 * octet)) & 0xFF
    return parts.astype("uint8")
def get_port_area(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    One-hot encode port numbers to their respective area.

    ```
    common: 0 to 1023
    registered: 1024 to 49151
    ephemeral: 49152 to 65535
    ```

    :param df: original cleaned data frame
    :param columns: the columns to process (most likely `["spt", "dpt"]`)
    """
    encoded = pd.DataFrame(index=df.index)
    for col in columns:
        ports = df[col]
        encoded[f"{col}_common"] = (ports < 1024).astype(bool)
        encoded[f"{col}_registered"] = ports.between(1024, 49151).astype(bool)
        encoded[f"{col}_ephemeral"] = (ports > 49151).astype(bool)
    return encoded
# noinspection PyUnresolvedReferences
def get_port_match(df: pd.DataFrame, spt: str = "spt", dpt: str = "dpt") -> pd.DataFrame:
    """
    A column that is one if the src and dst port match.

    :param df: original cleaned data frame
    :param spt: the column name of the source port
    :param dpt: the column name of the destination port
    """
    matches = df[spt].eq(df[dpt]).astype(bool)
    return pd.DataFrame({"port_match": matches}, index=df.index)
# bit masks for the 64-bit SWAR popcount in `_hamming_weight`
HAM_M1 = np.uint64(0x5555555555555555)  # alternating 01 pattern
HAM_M2 = np.uint64(0x3333333333333333)  # alternating 0011 pattern
HAM_M4 = np.uint64(0x0f0f0f0f0f0f0f0f)  # alternating 00001111 pattern
HAM_H01 = np.uint64(0x0101010101010101)  # one in every byte


def _hamming_weight(x):
    """
    Efficient implementation of the hamming weight to
    find the number of one bits of a number.

    :param x: array-like of (up to 64-bit) integers
    :return: uint64 array with the number of set bits per element

    (c) https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
    """
    # cast to uint64 first: the callers pass signed int64 arrays, and mixing
    # those with the uint64 masks relied on NumPy's value-based casting,
    # which NumPy 2 (NEP 50) removed - an explicit cast is always well-defined
    x = np.asarray(x, dtype=np.uint64)
    x = x - ((x >> 1) & HAM_M1)  # count bits in pairs
    x = (x & HAM_M2) + ((x >> 2) & HAM_M2)  # aggregate to nibbles
    x = (x + (x >> 4)) & HAM_M4  # aggregate to bytes
    x = (x * HAM_H01) >> 56  # sum all bytes into the top byte
    return x
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pandas as pd
def get_cnt_ratios(df: pd.DataFrame, unsafe: bool = False, fill: float = 0) -> pd.DataFrame:
    """
    Get various ratios between cnt values.

    This function expects the following column names to be present:
        cnt_src, cnt_src_slow, cnt_src_conn,
        cnt_dst, cnt_dst_slow, cnt_dst_conn,
        cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
        cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    :param df: the original cleaned data frame
    :param unsafe: also compute ratios for columns where the divisor can be zero
    :param fill: fill invalid divisions with this number
    """
    res = pd.DataFrame(index=df.index)

    # ratios relative to slow and conn
    ##################################

    assert np.all(df["cnt_src_slow"] > 0)
    assert np.all(df["cnt_src_conn"] > 0)
    res["relative_cnt_src_to_slow"] = df["cnt_src"] / df["cnt_src_slow"]
    res["relative_cnt_src_to_conn"] = df["cnt_src"] / df["cnt_src_conn"]

    assert np.all(df["cnt_dst_slow"] > 0)
    assert np.all(df["cnt_dst_conn"] > 0)
    res["relative_cnt_dst_to_slow"] = df["cnt_dst"] / df["cnt_dst_slow"]
    res["relative_cnt_dst_to_conn"] = df["cnt_dst"] / df["cnt_dst_conn"]

    # the serv divisors can be zero, so these are only computed on demand
    # assert np.all(df["cnt_serv_src_slow"] > 0)  # FALSE
    # assert np.all(df["cnt_serv_src_conn"] > 0)  # FALSE
    if unsafe:
        res["relative_cnt_serv_src_to_slow"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_slow"], fill)
        res["relative_cnt_serv_src_to_conn"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_src_conn"], fill)

    # assert np.all(df["cnt_serv_dst_slow"] > 0)  # FALSE
    # assert np.all(df["cnt_serv_dst_conn"] > 0)  # FALSE
    if unsafe:
        res["relative_cnt_serv_dst_to_slow"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_slow"], fill)
        res["relative_cnt_serv_dst_to_conn"] = _finite_divide(df["cnt_serv_dst"], df["cnt_serv_dst_conn"], fill)

    # src / dst ratios
    ##################

    assert np.all(df["cnt_dst"] > 0)
    assert np.all(df["cnt_dst_slow"] > 0)
    # bug fix: this previously re-checked cnt_dst_slow a second time, although
    # the third ratio below divides by cnt_dst_conn
    assert np.all(df["cnt_dst_conn"] > 0)
    res["ratio_cnt_src_dst"] = df["cnt_src"] / df["cnt_dst"]
    res["ratio_cnt_src_dst_slow"] = df["cnt_src_slow"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_dst_conn"] = df["cnt_src_conn"] / df["cnt_dst_conn"]

    # assert np.all(df["cnt_serv_dst"] > 0)  # FALSE
    # assert np.all(df["cnt_serv_dst_slow"] > 0)  # FALSE
    # assert np.all(df["cnt_serv_dst_conn"] > 0)  # FALSE
    if unsafe:
        res["ratio_cnt_serv_src_dst"] = _finite_divide(df["cnt_serv_src"], df["cnt_serv_dst"], fill)
        res["ratio_cnt_serv_src_dst_slow"] = _finite_divide(df["cnt_serv_src_slow"], df["cnt_serv_dst_slow"], fill)
        res["ratio_cnt_serv_src_dst_conn"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_dst_conn"], fill)

    # conn to slow ratios
    #####################

    assert np.all(df["cnt_dst_slow"] > 0)
    assert np.all(df["cnt_src_slow"] > 0)
    res["ratio_cnt_dst_conn_slow"] = df["cnt_dst_conn"] / df["cnt_dst_slow"]
    res["ratio_cnt_src_conn_slow"] = df["cnt_src_conn"] / df["cnt_src_slow"]

    # assert np.all(df["cnt_serv_src_slow"] > 0)  # FALSE
    # assert np.all(df["cnt_serv_dst_slow"] > 0)  # FALSE
    if unsafe:
        res["ratio_cnt_serv_src_conn_slow"] = _finite_divide(df["cnt_serv_src_conn"], df["cnt_serv_src_slow"], fill)
        res["ratio_cnt_serv_dst_conn_slow"] = _finite_divide(df["cnt_serv_dst_conn"], df["cnt_serv_dst_slow"], fill)

    res = res.astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    assert np.all(np.isfinite(res)) and np.all(res < 1e+12)
    return res
def get_in_out_ratios(df: pd.DataFrame, fill: float = 0) -> pd.DataFrame:
    """
    Get ratios between in and out traffic counts.

    This function expects the following column names to be present:
        in (bytes), out (bytes), duration

    :param df: the original cleaned data frame
    :param fill: fill invalid divisions with this number
    """
    ratios = {
        "in_bytes_per_duration": _finite_divide(df["in (bytes)"], df["duration"], fill),
        "out_bytes_per_duration": _finite_divide(df["out (bytes)"], df["duration"], fill),
        "ratio_in_out_bytes": _finite_divide(df["in (bytes)"], df["out (bytes)"], fill),
    }

    res = pd.DataFrame(ratios, index=df.index).astype("float32")  # save some gpu memory

    # make sure that our values are finite and not too big
    assert np.all(np.isfinite(res)) and np.all(res < 1e+12)
    return res
def get_cnt_distances(df: pd.DataFrame) -> pd.DataFrame:
    """
    Get various distances between cnt values.

    This function expects the following column names to be present:
        cnt_src, cnt_src_slow, cnt_src_conn,
        cnt_dst, cnt_dst_slow, cnt_dst_conn,
        cnt_serv_src, cnt_serv_src_slow, cnt_serv_src_conn,
        cnt_serv_dst, cnt_serv_dst_slow, cnt_serv_dst_conn

    :param df: the original cleaned data frame
    """
    cnt_columns = [
        "cnt_src", "cnt_src_slow", "cnt_src_conn",
        "cnt_dst", "cnt_dst_slow", "cnt_dst_conn",
        "cnt_serv_src", "cnt_serv_src_slow", "cnt_serv_src_conn",
        "cnt_serv_dst", "cnt_serv_dst_slow", "cnt_serv_dst_conn",
    ]
    # cast only the columns we actually use to a signed type so the
    # subtractions below cannot wrap around; previously the whole frame
    # was cast, which raises on any non-numeric column in the input
    df = df[cnt_columns].astype("int64")
    res = pd.DataFrame(index=df.index)

    # src dst differences
    #####################

    res["diff_cnt_src_dst"] = df["cnt_src"] - df["cnt_dst"]
    res["diff_cnt_src_dst_slow"] = df["cnt_src_slow"] - df["cnt_dst_slow"]
    res["diff_cnt_src_dst_conn"] = df["cnt_src_conn"] - df["cnt_dst_conn"]
    res["diff_cnt_serv_src_dst"] = df["cnt_serv_src"] - df["cnt_serv_dst"]
    res["diff_cnt_serv_src_dst_slow"] = df["cnt_serv_src_slow"] - df["cnt_serv_dst_slow"]
    res["diff_cnt_serv_src_dst_conn"] = df["cnt_serv_src_conn"] - df["cnt_serv_dst_conn"]

    # conn slow differences
    #######################

    res["diff_dst_conn_slow"] = df["cnt_dst_conn"] - df["cnt_dst_slow"]
    res["diff_src_conn_slow"] = df["cnt_src_conn"] - df["cnt_src_slow"]
    res["diff_serv_src_conn_slow"] = df["cnt_serv_src_conn"] - df["cnt_serv_src_slow"]
    res["diff_serv_dst_conn_slow"] = df["cnt_serv_dst_conn"] - df["cnt_serv_dst_slow"]

    return res
def _finite_divide(a: np.ndarray, b: np.ndarray, fill: float = 0) -> np.ndarray:
    """
    Divides `a / b` but will fix invalid results, i.e. `0 / 0` (nan)
    and `x / 0` (positive or negative infinity), to `fill` (default: 0).

    :param a: the dividend
    :param b: the divisor, which may contain zeros
    :param fill: the value that replaces non-finite quotients
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        c = np.true_divide(a, b)
    # bug fix: replace every non-finite value at once - previously only +inf
    # and nan were handled, so -inf (negative / 0) leaked through nan_to_num
    # as a huge negative number instead of `fill`
    return np.where(np.isfinite(c), c, fill)
# Copyright 2021
# Dynatrace Research
# SAL Silicon Austria Labs
# LIT Artificial Intelligence Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from functools import partial
from multiprocessing import Pool, cpu_count
import numpy as np
import pandas as pd
def get_segment_features(df_clean: pd.DataFrame, df_ip: pd.DataFrame, groups: List[str], lag: int,
                         parallel: bool = True) -> pd.DataFrame:
    """
    Group the data frame according to the specified `groups`, then, within each group,
    cluster rows together that are consecutive to one another (according to the `lag` param),
    and assign the segment length, and the number of unique spt, dpt, app per segment.

    Example: For `times = [1, 3, 4, 5, 7, 8, 15]` we would create the segments
    `[[1], [3, 4, 5], [7, 8], [15]]` with segment lengths `[1, 3, 2, 1]` with `lag = 1`
    and the number of unique spt, dpt, app per segment.

    Larger values for `lag` will also cluster points farther apart into one segment, e.g.
    with `lag = 2` we would create the segments `[[1, 3, 4, 5, 7, 8], [15]]`

    :param df_clean: the original cleaned data frame
    :param df_ip: the "ip_parts" features data frame
    :param groups: a list of columns to group by
    :param lag: the minimum difference between observations to form a segment
    :param parallel: whether to process groups in parallel or not
    """
    merged = pd.concat([df_clean, df_ip], axis="columns")

    # represent time as epoch seconds and bring the rows into temporal order
    merged["time"] = merged["time"].astype(np.int64) // 10 ** 9
    merged = merged.sort_values("time")

    frames = [frame for _, frame in merged.groupby(groups, as_index=False)]
    worker = partial(_assign_segment_metrics, lag=lag)  # fix the lag parameter

    if parallel:
        with Pool(cpu_count()) as pool:
            segments = pool.map(worker, frames)
    else:
        segments = map(worker, frames)

    return pd.concat(segments).sort_index()
def _assign_segment_metrics(group: pd.DataFrame, lag: int = 1):
    """
    Assign segment-based metrics to every row of a single group.

    Consecutive rows whose time difference is at most `lag` form one segment;
    every row then receives its segment's length and the number of unique
    dst, spt, dpt, app values within that segment.

    :param group: the rows of one group, sorted ascending by the `time` column
    :param lag: the minimum time difference that starts a new segment
    """
    assert lag >= 1
    assert len(group) > 0
    assert group["time"].is_monotonic_increasing

    # append new columns with some default values
    columns = ["segment_length", "nunique_dst", "nunique_spt", "nunique_dpt", "nunique_app"]
    result = pd.DataFrame(1, dtype=int, index=group.index, columns=columns)

    # get the time segment splits
    time = group["time"].to_numpy()
    splits = np.flatnonzero(np.diff(time) > lag) + 1
    splits = np.insert(splits, 0, 0)
    dim, nsegments = len(time), len(splits)

    # pre-cache column locations to make the subsequent loop faster
    loc_segment_length = result.columns.get_loc("segment_length")
    loc_nunique_dst = result.columns.get_loc("nunique_dst")
    loc_nunique_spt = result.columns.get_loc("nunique_spt")
    loc_nunique_dpt = result.columns.get_loc("nunique_dpt")
    loc_nunique_app = result.columns.get_loc("nunique_app")
    loc_dst = group.columns.get_loc("dst")
    loc_spt = group.columns.get_loc("spt")
    loc_dpt = group.columns.get_loc("dpt")
    loc_app = group.columns.get_loc("app")

    for i in range(nsegments):
        lo = splits[i]
        hi = splits[i + 1] if i + 1 < nsegments else dim

        # bug fix: the segment covers rows lo..hi-1, so the slice end must be
        # `hi` - the previous `slice(lo, hi - 1)` excluded the last row of
        # every segment, leaving it at the default value of 1
        slc = slice(lo, hi)  # segment slice

        # assign the segment length for the entire segment
        result.iloc[slc, loc_segment_length] = hi - lo

        # compute number of unique dst, spt, dpt, app fields for this segment
        result.iloc[slc, loc_nunique_dst] = group.iloc[slc, loc_dst].nunique()
        result.iloc[slc, loc_nunique_spt] = group.iloc[slc, loc_spt].nunique()
        result.iloc[slc, loc_nunique_dpt] = group.iloc[slc, loc_dpt].nunique()
        result.iloc[slc, loc_nunique_app] = group.iloc[slc, loc_app].nunique()

    return result
numpy>=1.19
pandas>=1.2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment