Created
February 1, 2020 22:43
-
-
Save datlife/e933a902a741126a8604c42e7b50bbd6 to your computer and use it in GitHub Desktop.
Extract all DGA data to a file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
bqkrtxgkmriwsiwcngtivpx.info
jdtmfupdyueqeldvhsjzdvzob.net
guhmpoxzivhba.com
nqqxqhuacaqhzurde.org
lgqsqgpqzijwid.info
ykolyecdcyk.biz
ztvflnxqzpxvpfobv.biz
zqrmkpivrbxccawozqwqpfzh.org
iqyqwhntrxfeq.org
ftadkbomxlnsib.info
jdhnidjdshlhejjjjnvtt.org
ekooucerate.org
tnpajfhzdedhjsx.org
bvmcmljldpewugc.org
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DGA Dataset Overview
Source: github.com/baderj/domain_generation_algorithms
-------------------------------------
ALGORITHMS             | # OF DOMAINS
-----------------------|-------------
qakbot                 | 5000
suppobox               | 85
suppobox               | 85
suppobox               | 85
symmi                  | 64
reconyc                | 100
tempedreve             | 1746
chinad                 | 256
unknown_malware        | 50
unknown_malware        | 50
ranbyus                | 40
ranbyus                | 40
nymaim                 | 128
unnamed_javascript_dga | 30
ramnit                 | 100
pizd                   | 85
vawtrak                | 100
gozi                   | 12
shiotob                | 2001
tinba                  | 401
banjori                | 1000
locky                  | 8
pitou                  | 7
fobber                 | 300
fobber                 | 300
padcrypt               | 24
nymaim2                | 704
qadars                 | 200
simda                  | 1000
necurs                 | 2048
dircrypt               | 30
proslikefan            | 100
newgoz                 | 1000
sisron                 | 40
monerodownloader       | 2500
corebot                | 40
dnschanger             | 5
kraken                 | 2000
kraken                 | 2000
pushdo                 | 1350
murofet                | 1000
murofet                | 1020
murofet                | 100000
pykspa                 | 5000
pykspa                 | 10
unnamed_downloader     | 120
mydoom                 | 99
qsnatch                | 150
qsnatch                | 2592
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Concatenate the example-domain .txt files of the DGA repository [1] into a
# single file, and write a small metadata table with per-file domain counts.
#
# Usage: <this script> [ROOT]
#   ROOT  directory scanned recursively for *.txt files. Defaults to the
#         current directory, i.e. run the script from the root of a checkout
#         of [1].
#
# Outputs (created in the current directory, overwritten on every run):
#   dga.dat   every line of the scanned files that matches VALID_FQDN
#   dga.meta  table: first path component below ROOT | number of such lines
#
# Requires a grep that supports -P (PCRE), e.g. GNU grep.
#
# Author: Dat Nguyen
# Since: 2020
#
# [1] github.com/baderj/domain_generation_algorithms

# PCRE matched against each whole line: 4-253 characters overall, one or more
# dot-terminated labels of 1-63 alphanumeric/hyphen characters that neither
# start nor end with a hyphen, followed by an alphabetic TLD of 2+ letters.
VALID_FQDN='(?=^.{4,253}$)(^(?:[a-zA-Z0-9](?:(?:[a-zA-Z0-9\-]){0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,}$)'
META=dga.meta
OUTPUT=dga.dat

#######################################
# Build the combined domain file and its metadata table.
# Globals:
#   VALID_FQDN (read), OUTPUT (read, default for $2), META (read, default for $3)
# Arguments:
#   $1 - directory to scan (default: .)
#   $2 - path of the combined domain file (default: $OUTPUT)
#   $3 - path of the metadata table (default: $META)
# Outputs:
#   Writes "<output> contains <N> domains." to stdout; diagnostics to stderr.
# Returns:
#   0 on success, 1 if an output file cannot be written or grep fails.
#######################################
build_dga_dataset() {
  local root=${1:-.}
  local output=${2:-$OUTPUT}
  local meta=${3:-$META}
  local file rel dga clean status count
  local total=0

  # Drop one trailing slash so the prefix strip below matches find's output.
  if [[ "$root" != "/" ]]; then
    root=${root%/}
  fi

  # Clear the results of any previous run and write the table header.
  : > "$output" || return 1
  {
    printf 'DGA Dataset Overview\n\n'
    printf 'Source: github.com/baderj/domain_generation_algorithms\n\n'
    printf '%s\n' '-------------------------------------'
    printf '%-22s | %-10s\n' "ALGORITHMS" "# OF DOMAINS"
    printf '%s\n' '-----------------------|-------------'
  } > "$meta" || return 1

  # NUL-delimited traversal keeps paths containing whitespace intact; the
  # byte-wise sort makes the order of both output files reproducible.
  while IFS= read -r -d '' file; do
    rel=${file#"$root"/}

    # Skip README and seed (word list) files, matched anywhere in the path
    # below ROOT.
    case "$rel" in
      *README* | *words*) continue ;;
    esac

    # The DGA name is the first path component below ROOT.
    dga=${rel%%/*}

    clean=$(grep -P -- "$VALID_FQDN" "$file")
    status=$?
    # grep exits 1 for "no match" (a legitimate count of 0); >1 is an error.
    if (( status > 1 )); then
      printf 'error: grep failed on %s\n' "$file" >&2
      return 1
    fi

    count=0
    if [[ -n "$clean" ]]; then
      printf '%s\n' "$clean" >> "$output"
      # Count lines of the captured text; an empty capture is never counted
      # as one domain and never adds a blank line to the output.
      count=$(grep -c '' <<< "$clean")
    fi

    printf '%-22s | %-10s\n' "$dga" "$count" >> "$meta"
    total=$(( total + count ))
  done < <(find "$root" -type f -name '*.txt' -print0 | LC_ALL=C sort -z)

  printf '%s contains %s domains.\n' "$output" "$total"
}

build_dga_dataset "${1:-.}" "$OUTPUT" "$META"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment