Skip to content

Instantly share code, notes, and snippets.

@kohnakagawa
Created August 11, 2019 02:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kohnakagawa/ecc0dfdf3e81c0a69531fb1602101bdf to your computer and use it in GitHub Desktop.
Save kohnakagawa/ecc0dfdf3e81c0a69531fb1602101bdf to your computer and use it in GitHub Desktop.
Malware Data Science chapter 8の内容をEmberで使われている特徴量で実施した場合の結果
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import glob\n",
"import sklearn\n",
"import ember\n",
"import numpy as np\n",
"import yara\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Gather every sample path; benignware is labelled 0 and malware is labelled 1.\n",
"benign_paths = glob.glob(os.path.join(\"data\", \"benignware\", \"*\"))\n",
"malware_paths = glob.glob(os.path.join(\"data\", \"malware\", \"*\"))\n",
"data_paths = benign_paths + malware_paths\n",
"# labels[i] belongs to data_paths[i]: benign entries first, then malware.\n",
"labels = [0] * len(benign_paths) + [1] * len(malware_paths)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# YARA rule: \"MZ\" at offset 0, and the 32-bit value found at the offset stored at 0x3C equals 0x4550.\n",
"pe_rule_source = 'rule IsPeFile {strings:$mz = \"MZ\"condition:$mz at 0 and uint32(uint32(0x3C)) == 0x4550}'\n",
"rule = yara.compile(source=pe_rule_source)\n",
"# Keep only the (path, label) pairs whose file matches the rule.\n",
"path_labels = [pair for pair in zip(data_paths, labels) if rule.match(pair[0])]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def read_bytes(path):\n",
"    \"\"\"Return the full contents of the file at path as bytes.\"\"\"\n",
"    # The with-block closes the handle right away instead of leaving one open per sample.\n",
"    with open(path, \"rb\") as fb:\n",
"        return fb.read()\n",
"\n",
"\n",
"fextractor = ember.PEFeatureExtractor()\n",
"# One feature vector per file that passed the PE filter, in path_labels order.\n",
"fvector = np.array([fextractor.feature_vector(bytez=read_bytes(p)) for p, _ in path_labels])\n",
"# Rebuild labels from path_labels so they stay aligned with the rows of fvector.\n",
"labels = np.array([l for _, l in path_labels])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Persist the extracted feature matrix to fvector.pickle in the working directory.\n",
"with open(\"fvector.pickle\", \"wb\") as pickle_file:\n",
"    pickle.dump(fvector, pickle_file)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.\n",
"X, y = fvector, labels\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import KFold\n",
"\n",
"# n_estimators=10 is the default this notebook originally ran with; passing it\n",
"# explicitly avoids the FutureWarning about the default changing to 100.\n",
"# random_state makes the forest reproducible, like the seeded train_test_split.\n",
"clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"kf = KFold(n_splits=4)\n",
"results = []\n",
"# 4-fold cross-validation inside the training split only; X_test stays untouched.\n",
"for train_idx, test_idx in kf.split(X_train, y_train):\n",
"    clf.fit(X_train[train_idx], y_train[train_idx])\n",
"    y_pred = clf.predict(X_train[test_idx])\n",
"    results.append(accuracy_score(y_train[test_idx], y_pred))\n",
"\n",
"# Refit on the whole training split: after the loop clf has only seen the last\n",
"# fold's training subset, and later cells evaluate clf on X_test.\n",
"clf.fit(X_train, y_train)\n",
"\n",
"# Report the per-fold scores; previously they were collected but never shown.\n",
"print(\"CV accuracy per fold:\", results)\n",
"print(\"CV mean accuracy:\", np.mean(results))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.978494623655914\n"
]
}
],
"source": [
"# Accuracy of the fitted classifier on the held-out test split.\n",
"y_pred = clf.predict(X_test)\n",
"test_accuracy = accuracy_score(y_test, y_pred)\n",
"print(test_accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import auc, roc_curve\n",
"\n",
"# Column 1 of predict_proba is the score for label 1 (the malware label used in this notebook).\n",
"y_pred_prob = clf.predict_proba(X_test)[:, 1]\n",
"# ROC points from the test-set scores, then the area under that curve.\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)\n",
"roc_auc = auc(fpr, tpr)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Draw the ROC curve on an explicit Axes object.\n",
"fig, ax = plt.subplots()\n",
"ax.set_title('Receiver Operating Characteristic')\n",
"ax.plot(fpr, tpr, 'b', label='AUC = %0.5f' % roc_auc)\n",
"ax.legend(loc='lower right')\n",
"# The x axis is limited to false positive rates between 0 and 0.03.\n",
"ax.set_xlim([0, 0.03])\n",
"ax.set_ylim([0, 1])\n",
"ax.set_ylabel('True Positive Rate')\n",
"ax.set_xlabel('False Positive Rate')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f7908ba6668>"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sns\n",
"\n",
"fti = clf.feature_importances_\n",
"\n",
"# Walk the extractor's feature groups once, summing the importances of the\n",
"# f.dim consecutive columns assigned to each group.\n",
"# NOTE(review): assumes the feature vector concatenates the groups in\n",
"# fextractor.features order -- confirm against ember.\n",
"names = []\n",
"importances = []\n",
"feature_dim_names = []\n",
"idx = 0\n",
"for f in fextractor.features:\n",
"    idx_end = idx + f.dim\n",
"    feature_dim_names.append((f.name, idx, idx_end))\n",
"    names.append(f.name)\n",
"    importances.append(np.sum(fti[idx:idx_end]))\n",
"    idx = idx_end\n",
"\n",
"# Horizontal bars: one per feature group, length = summed importance.\n",
"sns.barplot(x=importances, y=names)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@kohnakagawa
Copy link
Author

AUCを比較すると、Emberのほうが優れていた。

Malware Data Scienceのモデルだと AUC が0.9951
Emberだと AUC が0.9972

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment