Skip to content

Instantly share code, notes, and snippets.

@mjs3339
Last active March 24, 2020 09:13
Show Gist options
  • Save mjs3339/f243f37bf38809bcd21d79054e353857 to your computer and use it in GitHub Desktop.
Save mjs3339/f243f37bf38809bcd21d79054e353857 to your computer and use it in GitHub Desktop.
Chi Squared Byte Array Test
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Drawing;
using System.Globalization;
using System.IO;
using System.Linq;
public static class ChiSquared
{
/// <summary>
/// Expected relative frequency of each English letter, stored as a fraction of all
/// letters (the 26 entries add up to roughly 1) rather than as a percentage.
/// Index 0 holds the value for 'a' and index 25 the value for 'z'.
/// Calculated from an English word dictionary containing over 466,000 words.
/// </summary>
private static readonly float[] _expectedPercentages = {.0846f, .0189f, .0420f, .0353f, .1098f, .0125f, .0243f, .0274f, .0864f, .0018f, .0089f, .0574f, .0292f, .0715f, .0709f, .0310f, .0019f, .0704f, .0705f, .0647f, .0363f, .0099f, .0085f, .0028f, .0192f, .0041f};
/// <summary>
/// Heuristic check for compressed (random-looking) file content: the whole file is
/// read into memory and handed to <see cref="ChiSquaredTest"/>; the verdict is that
/// test's isRandom flag. Not accurate 100% of the time.
/// </summary>
/// <param name="path">Path of the file to read and test.</param>
/// <returns>True when the file's bytes pass the chi-squared randomness test.</returns>
public static bool IsFileCompressed(this string path) =>
    File.ReadAllBytes(path).ChiSquaredTest().isRandom;
/// <summary>
/// Tests a buffer for randomness. Returns chi squared values.
/// isRandom - true when the observed chi squared value lies between LowLimit and UpperLimit.
/// Quality - chiSqValue / ExpectedChiSq; less than 1 or greater than 1 is off target.
/// Entropy - 8 bit entropy of the buffer as a percentage of perfect disorder (100%).
/// ExpectedChiSq - The expected chi squared value R (number of buckets minus one).
/// LowLimit - (R - (2*sqrt(R)))
/// chiSqValue - The observed chi squared value.
/// UpperLimit - (R + (2*sqrt(R)))
/// </summary>
/// <remarks>
/// A null buffer yields the default tuple. A buffer whose entropy is below 95% is
/// reported as not random without running the chi squared stage; in that case only
/// the Entropy element is populated.
/// </remarks>
/// <param name="bArr">The byte Array</param>
public static (bool isRandom, float Quality, float Entropy, int ExpectedChiSq, float LowLimit, float chiSqValue, float UpperLimit) ChiSquaredTest(this byte[] bArr)
{
    if (bArr == null)
        return default;

    // Cheap pre-filter first, so the int copy of the buffer is only made when needed.
    var ent = Entropy(bArr);
    if (ent < 95)
        return (false, 0, ent, 0, 0, 0, 0);

    var words = Ia(bArr);
    var n = words.Length;
    // Roughly ten samples per bucket. Passing the 95% entropy gate requires close to
    // 200 distinct byte values, so n is at least 49 here and bucketCount is never zero.
    var bucketCount = n / 10;
    var r = bucketCount - 1; // degrees of freedom = expected value of the statistic

    // Sequential counting into a plain array: the previous parallel
    // ContainsKey/TryAdd/+= sequence was not atomic and could lose increments.
    var counts = new int[bucketCount];
    foreach (var word in words)
    {
        // Widen before Abs: Math.Abs(int.MinValue) throws OverflowException.
        counts[(int) (Math.Abs((long) word) % bucketCount)]++;
    }

    long sumOfSquares = 0;
    foreach (var count in counts)
        sumOfSquares += (long) count * count;

    // Pearson statistic for equally likely buckets: (k / n) * sum(f^2) - n, where k is
    // the number of buckets. Scaling by k - 1 instead biases the value low by about
    // n / k and makes small random buffers fail the test.
    var cS = (float) Math.Abs((double) bucketCount * sumOfSquares / n - n);

    var spread = 2.0f * (float) Math.Sqrt(r);
    var fL = r - spread;
    var fH = r + spread;
    var iR = fL <= cS && cS <= fH;
    return (iR, cS / r, ent, r, fL, cS, fH);
}
/// <summary>
/// Reinterprets the raw bytes as 32 bit integers (platform byte order). When the
/// length is not a multiple of four the last integer is padded with zero bytes.
/// </summary>
/// <param name="ba">Source bytes; must not be null.</param>
private static int[] Ia(byte[] ba)
{
    var byteCount = ba.Length;
    var words = new int[(int) (((long) byteCount + 3) / 4)];
    Buffer.BlockCopy(ba, 0, words, 0, byteCount);
    return words;
}
/// <summary>
/// Shannon entropy of the byte values, scaled so that 8 bits per byte equals 100.
/// An empty buffer yields 0.
/// </summary>
/// <param name="s">Buffer to measure; must not be null.</param>
private static float Entropy(byte[] s)
{
    if (s.Length == 0)
        return 0f;

    // Iterate the array itself: a float copy of the length is inexact above 2^24 and,
    // used as a loop bound, can run past the end of the array.
    var histogram = new int[256];
    foreach (var value in s)
        histogram[value]++;

    double total = s.Length;
    var bits = 0.0;
    foreach (var count in histogram)
    {
        if (count == 0)
            continue;
        var p = count / total;
        bits -= p * Math.Log(p, 2);
    }
    return (float) (bits / 8.0 * 100.0);
}
/// <summary>
/// Counts how many times the byte value <paramref name="b"/> occurs in <paramref name="s"/>.
/// </summary>
/// <param name="s">Buffer to scan.</param>
/// <param name="b">Byte value to count.</param>
/// <returns>Number of occurrences; 0 for an empty buffer.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static int ChiSquaredCount(this byte[] s, byte b)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s));

    // Count directly while iterating the array. The earlier loop was bounded by a
    // float copy of the length, which can round up past the array end above 2^24.
    var occurrences = 0;
    foreach (var value in s)
    {
        if (value == b)
            occurrences++;
    }
    return occurrences;
}
/// <summary>
/// Counts how many times the character <paramref name="b"/> occurs in <paramref name="s"/>.
/// </summary>
/// <param name="s">Text to scan.</param>
/// <param name="b">Character to count; any UTF-16 code unit is accepted.</param>
/// <returns>Number of occurrences; 0 for an empty string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static int ChiSquaredCount(this string s, char b)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s));

    // Compare characters directly: indexing a 256-entry table by char value threw
    // IndexOutOfRangeException for any character at or above U+0100.
    var occurrences = 0;
    foreach (var c in s)
    {
        if (c == b)
            occurrences++;
    }
    return occurrences;
}
/// <summary>
/// Returns the share of characters in <paramref name="s"/> that equal <paramref name="b"/>.
/// Despite the name, the result is a fraction in the range 0..1, not a value out of 100.
/// </summary>
/// <param name="s">Text to scan.</param>
/// <param name="b">Character to look for; any UTF-16 code unit is accepted.</param>
/// <returns>Occurrences of <paramref name="b"/> divided by the length of <paramref name="s"/>; 0 for an empty string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static float ChiSquaredAsPercent(this string s, char b)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s));
    if (s.Length == 0)
        return 0f; // avoid 0 / 0 = NaN

    // Compare characters directly: indexing a 256-entry table by char value threw
    // IndexOutOfRangeException for any character at or above U+0100.
    var occurrences = 0;
    foreach (var c in s)
    {
        if (c == b)
            occurrences++;
    }
    return occurrences / (float) s.Length;
}
/// <summary>
/// Compute the letter frequencies within the English language.
/// Use a large English language text block for accurate testing.
/// </summary>
/// <remarks>
/// The text is lower-cased with the invariant culture, so the result does not depend
/// on the current culture (a Turkish culture would map 'I' to a dotless 'ı').
/// Each value is the count of that letter divided by the number of characters for
/// which the IsLetter() extension returns true. That extension is declared outside
/// this method and is presumably a wrapper around char.IsLetter.
/// </remarks>
/// <param name="s">String that contains the large English text</param>
/// <returns>
/// One entry per letter 'a'..'z' holding its fraction (0..1), ordered by ascending
/// fraction. All fractions are 0 when the text contains no letters.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static KeyValuePair<char, float>[] ChiSquaredTextAsPercent(this string s)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s));

    const string alphabet = "abcdefghijklmnopqrstuvwxyz";
    var counts = new int[alphabet.Length];
    var letterTotal = 0;
    foreach (var c in s.ToLowerInvariant())
    {
        if (!c.IsLetter())
            continue;
        // Letters outside a-z still count towards the total, but are not tallied
        // individually; indexing a 256-entry table with them used to throw.
        letterTotal++;
        if (c >= 'a' && c <= 'z')
            counts[c - 'a']++;
    }

    var frequencies = new KeyValuePair<char, float>[alphabet.Length];
    for (var i = 0; i < alphabet.Length; i++)
    {
        var share = letterTotal == 0 ? 0f : counts[i] / (float) letterTotal;
        frequencies[i] = new KeyValuePair<char, float>(alphabet[i], share);
    }

    // OrderBy is stable, so letters with equal fractions stay in alphabetical order.
    return frequencies.OrderBy(e => e.Value).ToArray();
}
/// <summary>
/// Measures how far the letter distribution of <paramref name="s"/> is from the
/// expected English letter frequencies.
/// </summary>
/// <remarks>
/// The text is lower-cased with the invariant culture so that the result does not
/// depend on the current culture (a Turkish culture would map 'I' to a dotless 'ı'
/// and drop it from the 'i' count). Every character accepted by the IsLetter()
/// extension counts towards the letter total. That extension is declared outside this
/// method and is presumably a wrapper around char.IsLetter. Only the letters 'a'..'z'
/// that actually occur contribute a (observed - expected)^2 / expected term; letters
/// that never occur are skipped.
/// </remarks>
/// <param name="s">Text to examine.</param>
/// <returns>
/// The summed chi squared terms divided by the letter total and multiplied by 100.
/// NaN when the text contains no letters at all.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
public static float ChiSquaredTextTest(this string s)
{
    if (s == null)
        throw new ArgumentNullException(nameof(s));

    var observed = new int[_expectedPercentages.Length];
    var realLen = 0;
    foreach (var c in s.ToLowerInvariant())
    {
        if (!c.IsLetter())
            continue;
        realLen++;
        if (c >= 'a' && c <= 'z')
            observed[c - 'a']++;
    }

    // Accumulate in double, as Enumerable.Sum over floats does internally.
    var total = 0.0;
    for (var i = 0; i < observed.Length; i++)
    {
        if (observed[i] == 0)
            continue; // absent letters are deliberately left out of the sum
        var expected = _expectedPercentages[i] * realLen;
        var diff = observed[i] - expected;
        total += diff * diff / expected;
    }

    // With no letters this is 0 / 0 = NaN, which compares false against any threshold.
    return (float) total / realLen * 100f;
}
/// <summary>
/// Decides whether a file looks like English text by reading it as text and comparing
/// its combined chi-squared letter distance against a cut-off of 10.
/// The cut-off is subjective: it was settled on after about 40 test runs, in which
/// most non-text files produced readings in the 100's.
/// </summary>
/// <param name="path">Path to the file to test</param>
/// <returns>True when the file's chi-squared text distance is below 10.</returns>
public static bool IsTextFile(this string path)
{
    var contents = File.ReadAllText(path);
    var distance = contents.ChiSquaredTextTest();
    return distance < 10;
}
/// <summary>
/// Chi squared distance between the gray-level histograms of two images.
/// </summary>
/// <remarks>
/// The histograms hold raw pixel counts and are not normalised, so images of
/// different sizes produce a large distance even when their content is similar.
/// Gray levels that do not occur in the second image are skipped.
/// </remarks>
/// <param name="p1">Path of the first image.</param>
/// <param name="p2">Path of the second image, whose histogram is used as the expected distribution.</param>
/// <returns>Sum over all gray levels of (h1 - h2)^2 / h2; 0 means identical histograms.</returns>
public static float ImageChiSqrdDp(string p1, string p2)
{
    var hist1 = GrayLevelHistogram(p1);
    var hist2 = GrayLevelHistogram(p2);

    var chiSqrd = 0.0f;
    for (var i = 0; i < hist2.Length; i++)
    {
        if (hist2[i] != 0)
            chiSqrd += (float) Math.Pow(hist1[i] - hist2[i], 2) / hist2[i];
    }
    return chiSqrd;
}
/// <summary>
/// Loads an image and counts its pixels per gray level, where the gray level of a
/// pixel is the integer average of its red, green and blue components.
/// </summary>
/// <param name="path">Path of the image file to load.</param>
/// <returns>A 256-entry array; entry g is the number of pixels with gray level g.</returns>
private static int[] GrayLevelHistogram(string path)
{
    var histogram = new int[256];
    using (var image = new Bitmap(path))
    {
        for (var y = 0; y < image.Height; y++)
        {
            for (var x = 0; x < image.Width; x++)
            {
                var pixel = image.GetPixel(x, y);
                histogram[(pixel.R + pixel.G + pixel.B) / 3]++;
            }
        }
    }
    return histogram;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment