Created
April 10, 2017 15:57
-
-
Save trevorsibanda/2ce1115fcc235dd80500156a7fe52121 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Simple algorithm to calculate the Shannon entropy of a file in Scala.
package zw.trevor.disorder | |
/** Byte-level Shannon entropy statistics for a sequence of bytes.
  *
  * Every distinct byte value (0-255) is treated as one token. All statistics
  * are lazy and computed on first access.
  *
  * @param bytes the data to analyse; may be empty
  */
class Entropy(bytes: Seq[Byte]) {

  /** Number of distinct values a single byte can take. */
  val max_bytes: Int = 256

  /** Base-2 logarithm of `n`. */
  def log2(n: Double): Double = math.log(n) / math.log(2)

  /** Occurrence count of each byte value, indexed by its unsigned value (0-255). */
  lazy val token_freqs: Array[Int] = {
    val freqs = new Array[Int](max_bytes)
    bytes.foreach { b =>
      // JVM bytes are signed; mask with 0xff to obtain an index in 0-255.
      freqs(b & 0xff) += 1
    }
    freqs
  }

  /** Number of distinct byte values that occur at least once.
    *
    * Counts the non-zero slots directly; collapsing the frequencies into a set
    * would merge byte values that happen to share the same count.
    */
  lazy val num_tokens: Int = token_freqs.count(_ != 0)

  /** Relative frequency of each byte value; all zeros when `bytes` is empty. */
  lazy val token_probs: Array[Double] =
    if (bytes.isEmpty) new Array[Double](max_bytes) // avoid 0/0 = NaN entries
    else {
      val len = bytes.length.toDouble
      token_freqs.map(_ / len)
    }

  /** Shannon entropy in bits per byte: the sum of `-p * log2(p)` over occurring byte values.
    *
    * Byte values that never occur are skipped, so `log2(0)` is never evaluated.
    * Evaluates to 0.0 for empty input.
    */
  lazy val entropy: Double =
    token_probs.iterator.filter(_ > 0.0).map(p => -p * log2(p)).sum

  /** Largest entropy reachable with `num_tokens` distinct byte values; 0.0 for empty input. */
  lazy val max_entropy: Double =
    if (num_tokens == 0) 0.0 else log2(num_tokens)

  /** Ratio of `entropy` to `max_entropy`.
    *
    * Defined as 0.0 when `max_entropy` is 0 (zero or one distinct byte value),
    * where the plain division would produce NaN.
    */
  lazy val m_ratio: Double =
    if (max_entropy > 0.0) entropy / max_entropy else 0.0
}
/** Command-line entry point: prints the entropy of every readable, non-directory
  * entry directly inside a directory.
  *
  * One line is printed per file: the path (spaces replaced by `_space_`), the
  * entropy and the file length, separated by single spaces.
  */
object Entropy {
  import java.nio.file._
  import java.io.{File, IOException}

  /** Scans a directory and prints one entropy line per readable file.
    *
    * @param args optional; the first argument is the directory to scan,
    *             defaulting to `/usr/bin`
    */
  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse("/usr/bin")

    // listFiles returns null when the path is not a directory or cannot be
    // listed; treat that the same as an empty directory.
    val entries = Option(new File(path).listFiles).getOrElse(Array.empty[File])

    entries.filter(!_.isDirectory).foreach {
      case file if file.canRead =>
        try {
          val bytes = Files.readAllBytes(Paths.get(file.getPath))
          val dis = new Entropy(bytes)
          println(s"""${file.getPath.replaceAll(" ", "_space_")} ${dis.entropy} ${file.length}""")
        } catch {
          // A file can still fail to read after the canRead check; report it
          // and keep scanning instead of aborting the whole run.
          case e: IOException =>
            Console.err.println(s"skipping ${file.getPath}: ${e.getMessage}")
        }
      case _ => () // unreadable entries are skipped
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment