Skip to content

Instantly share code, notes, and snippets.

@garanews
Created May 21, 2019 16:17
Show Gist options
  • Save garanews/f3c969a1644dd3365e23141c9cb70ffc to your computer and use it in GitHub Desktop.
test_load3.py
import os
import hashlib
import glob
import magic
import peutils
import pefile
import sys
import tqdm
import dask
import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client
from dask import delayed
import asyncio
import asyncpg
try:
import pyimpfuzzy
except ImportError:
pass
from apiscout.ApiScout import ApiScout
from apiscout.ApiVector import ApiVector
import numpy as np
# Paths to the ApiScout WinApi1024 vector definition, the sample glob, and the
# PEiD-style packer signature database.  NOTE(review): host-specific absolute
# paths — confirm they exist on the target analysis machine.
winapi_path = "/home/analyst/malwareclustering/data/winapi1024v1.txt"
folder_path = '/nsm/VirusShare_00221/*'
signatures_path = '/home/analyst/malwareclustering/data/UserDB.TXT'

# Module-level ApiScout/ApiVector instances shared by the worker functions below.
apivector = ApiVector(winapi_path)
scout = ApiScout()
# Base address 0: samples are scanned as unmapped files (see the
# is_unmapped=True call in get_digest).
scout.setBaseAddress(0)
scout.loadWinApi1024(winapi_path)
def check_file(f):
    """Return the path ``f`` if it looks like a plain (non-packed) PE32 file.

    Returns the literal string ``"error"`` for anything that should be
    skipped: non-PE files, self-extracting archives, files matching a packer
    signature at the entry point, or files that cannot be identified/parsed.
    """
    magictest = magic.Magic(uncompress=True)
    with open(signatures_path, 'rt') as fs:
        signatures = peutils.SignatureDatabase(data=fs.read())
    try:
        # Bug fix: call libmagic once instead of twice per file.
        filetype = magictest.from_file(f)
        if 'PE32' not in filetype or 'self-extracting' in filetype:
            return "error"
    except magic.MagicException:
        return "error"
    try:
        pe = pefile.PE(f)
        # Any packer-signature match at the entry point disqualifies the file.
        if signatures.match_all(pe, ep_only=True):
            return "error"
    except Exception:
        # Narrowed from a bare except; pefile.PEFormatError and OSError are
        # the expected failures, but this stays a best-effort skip boundary.
        return "error"
    return f
def get_digest(item):
    """Build a ``pd.Series`` of hashes and API-vector features for one sample.

    ``item`` is either a file path or the literal ``"error"`` sentinel
    produced by ``check_file``.  On any failure a Series with ``error=True``
    and empty fields is returned so downstream filtering can drop the row.
    """
    def _error_row():
        # Uniform "skip this sample" row; filename keeps the original
        # behaviour of taking the last path component of *item*.
        return pd.Series({'error': True, 'id': None,
                          'filename': item.split("/")[-1],
                          'impfuzzy': '', 'apiscout': '',
                          'md5': '', 'sha1': '', 'sha256': '',
                          'family': '', 'packed': ''})

    # Bug fix: the original fell through with scout_result undefined
    # (NameError) when *item* was not a regular file.
    if item == 'error' or not os.path.isfile(item):
        return _error_row()

    try:
        impfuzzy = pyimpfuzzy.get_impfuzzy(item)
    except Exception:  # impfuzzy is best-effort/optional (see guarded import)
        impfuzzy = ""

    with open(item, "rb") as f_binary:
        binary = f_binary.read()

    try:
        scout_ev = scout.evaluateImportTable(binary, is_unmapped=True)
        scout_result = (scout.getWinApi1024Vectors(scout_ev)
                        .get('import_table', {}).get('vector', None))
    except Exception:  # unparsable import table => skip sample
        scout_result = None
    # 'A171' is presumably the empty/degenerate vector — kept from the
    # original skip condition; TODO confirm against apiscout docs.
    if scout_result in ("", 'A171', None):
        return _error_row()

    # Hash the bytes already in memory instead of re-reading the file in
    # 2047-byte chunks as the original did.
    md5 = hashlib.md5(binary)
    sha1 = hashlib.sha1(binary)
    sha256 = hashlib.sha256(binary)

    # Pack the boolean 1024-vector into bytes for compact DB storage.
    pkb = np.packbits(np.array(apivector.decompress(scout_result),
                               dtype='bool')).tobytes()
    return pd.Series({'error': False, 'id': None,
                      'filename': item.split("/")[-1],
                      'impfuzzy': impfuzzy, 'apiscout': scout_result,
                      'md5': md5.hexdigest(), 'sha1': sha1.hexdigest(),
                      'sha256': sha256.hexdigest(),
                      'family': 'Unknown', 'packed': pkb})
def insert_db(df):
    """Insert one partition DataFrame into Postgres in batches of 1000 rows.

    Called from dask ``map_partitions``; delegates the actual INSERT to the
    async ``main()`` coroutine.  Returns None (as the original did implicitly).
    """
    values = [tuple(row) for row in df.values]
    batch_size = 1000
    # Bug fixes vs. the original:
    #  * range-with-step avoids the off-by-one that issued an extra empty
    #    batch whenever len(values) was an exact multiple of 1000;
    #  * asyncio.run creates and closes a fresh event loop per batch — the
    #    original closed the shared loop, breaking every subsequent call in
    #    the same worker process (and get_event_loop is deprecated).
    for start in range(0, len(values), batch_size):
        asyncio.run(main(values[start:start + batch_size]))
async def main(scout_end):
    """Bulk-insert one batch of rows into ``malware_clustering``.

    ``scout_end`` is a list of tuples whose order must match the
    ``malware_clustering`` composite type used in the ``unnest`` cast.
    Returns ``"OK"`` on success.
    """
    # NOTE(review): credentials are hard-coded in the DSN — move them to
    # environment/config before sharing or deploying this script.
    conn = await asyncpg.connect('postgresql://analyst:LA2205wg@localhost/malwares')
    try:
        # RETURNING id is fetched but the result is intentionally unused;
        # kept for parity with the original query.
        await conn.fetch('''
INSERT INTO malware_clustering(name, impfuzzy, apivector, md5, sha1, sha256, family, data, family_suggested)(
SELECT
r.name, r.impfuzzy, r.apivector, r.md5, r.sha1, r.sha256, r.family, r.data, r.family_suggested
FROM
unnest($1::malware_clustering[]) as r
)
RETURNING id
''', scout_end)
    finally:
        # Bug fix: the original leaked the connection when fetch() raised.
        await conn.close()
    return "OK"
if __name__ == '__main__':
    # Dask distributed client; scheduler address is hard-coded to the lab
    # cluster — NOTE(review): make this configurable.
    client = Client('172.23.108.15:8786')
    files = glob.iglob(folder_path, recursive=True)
    df = pd.DataFrame(files)
    # Only the first 1000 samples are processed (presumably a test-run limit
    # — confirm before a full run), split across 16 partitions.
    ddf = dd.from_pandas(df[:1000], npartitions=16)
    ddf = client.persist(ddf)
    # Phase 1: filter — check_file returns the path, or "error" for
    # non-PE / packed / unparsable samples.
    fase_1 = ddf.map_partitions(
        lambda df: df.apply(
            lambda x: check_file(x[0]), axis=1
        ),
        meta=('filename', 'str')
    )
    # Phase 2: hash + feature-extract each path; get_digest marks failures
    # (including the "error" sentinel) with error=True.
    fase_2 = fase_1.apply(
        lambda x: get_digest(x),
        meta=[('error', 'bool'), ('id', 'int'), ('filename', 'str'), ('impfuzzy', 'str'), ('apiscout', 'str'), ('md5', 'str'), ('sha1', 'str'), ('sha256', 'str'), ('family', 'str'), ('packed', 'bytes')]
    )
    # Drop error rows, then add the extra columns the DB insert expects.
    fase_2 = fase_2[fase_2['error'] == False]
    fase_2 = fase_2.drop('error', axis=1)
    fase_2['id'] = 0
    # NOTE(review): 'ss' presumably maps to family_suggested in the INSERT's
    # composite-type unnest — verify the tuple/column order matches.
    fase_2['ss'] = ''
    # Phase 3: bulk-insert each partition into Postgres, then materialise.
    fase_3 = fase_2.map_partitions(
        lambda df: insert_db(df),
        meta=('res','str')
    ).compute()
    print(fase_3.head())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment