Skip to content

Instantly share code, notes, and snippets.

Created January 11, 2013 07:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/4508627 to your computer and use it in GitHub Desktop.
Save anonymous/4508627 to your computer and use it in GitHub Desktop.
#! /usr/bin/env ruby
require 'singleton'
require 'awesome_print'
require 'msgpack'
# Scores strings against character n-gram frequency tables.
#
# The tables are built from the lines of a training file (lower-cased, one
# string per line) and cached on disk as MessagePack so later runs can skip
# the training pass. Two measures are offered:
#
# * score_for_string      - weighted share of unseen n-grams, scaled to
#                           the range 0..1000 (higher = more unseen n-grams)
# * perplexity_for_string - bigram perplexity of the name with common host
#                           prefixes and the trailing label removed
#
# NOTE(review): the class name suggests this is meant for spotting
# algorithmically generated domain names - confirm with the author.
class DGAScore
  include Singleton

  # n-gram orders that are trained and scored.
  NS = (1..4)
  # Location of the MessagePack cache of the n-gram tables.
  NGRAMS_FILE = '/tmp/ngrams'
  # Leading service label (www., mail2., ns-1., ...) removed before the
  # perplexity computation.
  HOST_PREFIX = %r{^(www|imap|mail|mx|smtp|ns)-?\d*\.}i
  # Final alphabetic label (with optional trailing dot) removed before the
  # perplexity computation.
  TRAILING_LABEL = %r{\.[a-z]+\.?$}i

  # Loads the n-gram tables from NGRAMS_FILE when it exists; otherwise trains
  # them from scratch and writes the cache.
  def initialize
    @ngrams_chain = []
    if File.exist?(NGRAMS_FILE)
      # binread/binwrite open, transfer and close the file in one call, so no
      # handle is leaked and the cache is fully flushed.
      @ngrams_chain = MessagePack.unpack(File.binread(NGRAMS_FILE))
    else
      build_ngrams
      File.binwrite(NGRAMS_FILE, MessagePack.pack(@ngrams_chain))
    end
  end

  # Trains the tables from 'alexa-1m.txt' (resolved against the current
  # working directory), one training string per line, then normalizes them.
  def build_ngrams
    File.foreach('alexa-1m.txt') do |line|
      entry = line.chomp
      # Every training string is terminated with a dot.
      entry += '.' unless entry.end_with?('.')
      add_training_string(entry)
    end
    normalize_ngrams_chain
  end

  # Adds the occurrence counts of every n-character window of +str+ to the
  # table for order +n+, creating the table on first use.
  #
  # @param n [Integer] n-gram order
  # @param str [String] training string
  def update_ngram_chain(n, str)
    counts = (@ngrams_chain[n] ||= {})
    (0..str.length - n).each do |offset|
      gram = str[offset, n]
      counts[gram] = counts.fetch(gram, 0) + 1
    end
  end

  # Converts the raw counts of the order-+n+ table into relative frequencies
  # (count / sum of all counts). Does nothing when no table exists for +n+.
  #
  # @param n [Integer] n-gram order
  def normalize_ngram_chain(n)
    table = @ngrams_chain[n]
    # No training string was long enough to produce an n-gram of this order.
    return unless table

    total = table.values.inject(0, :+).to_f
    table.each_pair do |gram, count|
      table[gram] = count / total
    end
  end

  # Normalizes the tables of every order in NS.
  def normalize_ngrams_chain
    NS.each { |n| normalize_ngram_chain(n) }
  end

  # Feeds one training string into the tables of every order in NS.
  # The string is lower-cased on a copy; the caller's object is not modified.
  #
  # @param str [String] training string
  def add_training_string(str)
    lowered = str.downcase
    NS.each { |n| update_ngram_chain(n, lowered) }
  end

  # Pretty-prints the n-gram tables (uses awesome_print's +ap+).
  def dump
    ap @ngrams_chain
  end

  # Average table value over all n-character windows of +str+. A window that
  # is missing from the table contributes -1.0, so the result drops below
  # zero as unseen n-grams dominate.
  #
  # The lookup is done on a lower-cased copy because the tables are trained
  # on lower-cased strings.
  #
  # @param n [Integer] n-gram order
  # @param str [String] string to score (not modified)
  # @return [Float] 0.0 when +str+ is shorter than +n+
  def score_for_ngram(n, str)
    lowered = str.downcase
    return 0.0 if lowered.length < n

    table = ngram_table(n)
    windows = lowered.length - n + 1
    total = (0...windows).inject(0.0) do |acc, offset|
      acc + (table[lowered[offset, n]] || -1.0)
    end
    total / windows
  end

  # Bigram perplexity of +str+: exp of the negated mean log of
  # P(bigram) / P(first character) over all two-character windows.
  #
  # Side effect kept for existing callers: HOST_PREFIX and TRAILING_LABEL are
  # removed from +str+ IN PLACE, so the caller sees the stripped name
  # afterwards. Table lookups use a lower-cased copy.
  #
  # NOTE(review): an n-gram missing from a table is given the value 1.0, so
  # unseen bigrams do not raise the perplexity - confirm this is intended.
  #
  # @param str [String] host name; mutated (prefix and trailing label removed)
  # @return [Float] 0.0 when fewer than two characters remain
  def perplexity_for_string(str)
    str.gsub!(HOST_PREFIX, '')
    str.gsub!(TRAILING_LABEL, '')
    ns_first = 1
    ns_last = 2
    lowered = str.downcase
    return 0.0 if lowered.length < ns_last

    windows = lowered.length - ns_last + 1
    log_sum = 0.0
    windows.times do |offset|
      prob = ngram_table(ns_last)[lowered[offset, ns_last]] || 1.0
      # Divide by the frequency of each shorter prefix of the window.
      (ns_last - 1).downto(ns_first) do |n|
        prob /= ngram_table(n)[lowered[offset, n]] || 1.0
      end
      log_sum += Math.log(prob)
    end
    Math.exp(-log_sum / windows)
  end

  # Combines the per-order scores of +str+ into one value: each order n is
  # weighted by n, the weighted mean is multiplied by -1000 and negative
  # results are clamped to 0.
  #
  # @param str [String] string to score (not modified)
  # @return [Numeric] Integer 0 when clamped, otherwise a positive Float
  def score_for_string(str)
    weighted = NS.inject(0.0) { |acc, n| acc + score_for_ngram(n, str) * n }
    total_weight = NS.inject(0, :+)
    scaled = weighted / total_weight * -1000.0
    [0, scaled].max
  end

  private

  # Table for order +n+, or an empty hash when that order was never trained.
  def ngram_table(n)
    @ngrams_chain[n] || {}
  end
end
# Reads one name per line from standard input and prints
# "<score> <perplexity> <name>" for each of them.
#
# DGAScore includes Singleton, which makes `new` private; the shared
# instance has to be obtained through `instance`.
dgascore = DGAScore.instance
$stdin.each_line do |line|
  name = line.chomp
  # Names are scored in dot-terminated form.
  name += '.' unless name.end_with?('.')
  score = dgascore.score_for_string(name)
  # perplexity_for_string strips the host prefix and trailing label from
  # `name` in place, so the name printed below is the stripped form.
  perplexity = dgascore.perplexity_for_string(name)
  puts "#{score.round(2)} #{perplexity.round(2)} #{name}"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment