dga_score.rb (forked by @xanda from jedisct1/dga_score.rb, July 25, 2014)

#! /usr/bin/env ruby
require 'awesome_print'
require 'msgpack'
require 'public_suffix'
require 'singleton'
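
# DGAScore estimates how likely a domain name is to have been generated by a
# domain-generation algorithm (DGA). It builds character n-gram (1..4)
# frequencies from the domains listed in 'data/alexa-1m.txt' and exposes three
# signals: a weighted n-gram score, a bigram perplexity, and the Shannon
# entropy of the second-level domain label.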
class DGAScore
  include Singleton

  # N-gram sizes used by the model.
  NS = (1..4)
  NGRAMS_FILE = 'tmp/ngrams'

  def initialize
    PublicSuffix::List.private_domains = true
    @ngrams_chain = []
    if File.exist?(NGRAMS_FILE)
      # Reuse a previously built model.
      @ngrams_chain = MessagePack.unpack(File.binread(NGRAMS_FILE))
    else
      # Build the model from the training set and cache it for later runs.
      build_ngrams
      File.binwrite(NGRAMS_FILE, MessagePack.pack(@ngrams_chain))
    end
  end

  # Train the n-gram model on a list of popular domains (one per line)
  # and turn the raw counts into frequencies.
  def build_ngrams
    File.foreach('data/alexa-1m.txt') do |line|
      line.chomp!
      line += '.' unless line.end_with?('.')
      add_training_string(line)
    end
    normalize_ngrams_chain
  end

  # Count every n-gram of length +n+ found in +str+.
  def update_ngram_chain(n, str)
    @ngrams_chain[n] ||= {}
    ngram_chain = @ngrams_chain[n]
    (0..str.length - n).each do |i|
      ngram1 = str[i, n]
      ngram_chain[ngram1] ||= 0
      ngram_chain[ngram1] += 1
    end
  end

  # Convert the raw counts for n-grams of length +n+ into frequencies.
  def normalize_ngram_chain(n)
    ngram_chain = @ngrams_chain[n]
    total = ngram_chain.values.inject(0, :+).to_f
    ngram_chain.each_pair do |ngram1, count|
      ngram_chain[ngram1] = count / total
    end
  end

  def normalize_ngrams_chain
    NS.each { |n| normalize_ngram_chain(n) }
  end

  def add_training_string(str)
    str = str.dup
    str.downcase!
    NS.each { |n| update_ngram_chain(n, str) }
  end

  def dump
    ap @ngrams_chain
  end

  # Average frequency of the n-grams of length +n+ in +str+;
  # n-grams never seen during training are penalized with -1.0.
  def score_for_ngram(n, str)
    return 0.0 if str.length < n
    ngram_chain = @ngrams_chain[n]
    score = (0..str.length - n).inject(0.0) do |acc, i|
      ngram1 = str[i, n]
      p = ngram_chain[ngram1] || -1.0
      acc + p
    end
    score / (str.length - n + 1)
  end

  # Bigram perplexity of +str+, rescaled to 0..100.
  def perplexity_for_string(str)
    str = str.dup
    # Drop common hostname prefixes (www, mail, ...) and two-part suffixes
    # such as '.co.uk', which would otherwise dominate the measure.
    str.gsub!(%r{^(www|imap|mail|mx|smtp|ns)-?\d*\.}i, '')
    str.gsub!(%r{(\.[a-z]{2})\.[a-z]+\.?$}i, '')
    ns_first, ns_last = 1, 2
    return 0.0 if str.length < ns_last
    sum = 0.0
    (0..str.length - ns_last).each do |i|
      # P(c2 | c1) = P(c1 c2) / P(c1), with a small floor for unseen n-grams.
      ngram1 = str[i, ns_last]
      p_w2_w1 = @ngrams_chain[ns_last][ngram1] || 1e-6
      (ns_last - 1).downto(ns_first) do |n|
        ngram1 = str[i, n]
        p_w2_w1 /= @ngrams_chain[n][ngram1] || 1e-6
      end
      sum += Math.log(p_w2_w1)
    end
    perplexity = Math.exp(-sum / (str.length - ns_last + 1))
    # Rescale to 0..100 using empirically chosen constants.
    perplexity = (perplexity - 3.22) * 100.0 / 2006.8
    [100.0, [perplexity, 0.0].max].min
  end

  # Weighted average of the n-gram scores (longer n-grams weigh more),
  # rescaled to 0..100; higher means less similar to the training set.
  def score_for_string(str)
    str += '.' unless str.end_with?('.')
    total_weight = 0
    score = NS.inject(0.0) do |acc, n|
      weight = n
      total_weight += weight
      acc + score_for_ngram(n, str) * weight
    end
    score /= total_weight
    score = score * -100.0 / 0.303
    [100.0, [0.0, score].max].min
  end

  # Shannon entropy (in bits) of the byte distribution of the second-level
  # domain label; falls back to the raw string if parsing fails.
  def entropy_for_string(str)
    begin
      str = PublicSuffix.parse(str).sld
    rescue StandardError
      # keep the original string
    end
    b, len = str.bytes, str.length.to_f
    b.uniq.inject(0.0) do |acc, c|
      x = b.count(c) / len
      acc + (x > 0.0 ? -x * Math.log2(x) : 0.0)
    end
  end
end
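
# Example usage (a sketch, not part of the original gist): run the script with
# one or more domain names as arguments. Assumes 'data/alexa-1m.txt' and the
# 'tmp/' directory exist so the n-gram model can be built or loaded.
if __FILE__ == $PROGRAM_NAME
  scorer = DGAScore.instance
  ARGV.each do |domain|
    printf("%-40s ngram=%6.2f perplexity=%6.2f entropy=%5.2f\n",
           domain,
           scorer.score_for_string(domain),
           scorer.perplexity_for_string(domain),
           scorer.entropy_for_string(domain))
  end
end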