Skip to content

Instantly share code, notes, and snippets.

@brenapp
Created March 23, 2019 21:23
Show Gist options
  • Save brenapp/dd5f19186a30ec571d69a0af5365ec4d to your computer and use it in GitHub Desktop.
Save brenapp/dd5f19186a30ec571d69a0af5365ec4d to your computer and use it in GitHub Desktop.
# Finds the character distribution of a file
"""Returns a list of the frequencies of each character by ASCII code"""
def frequency(string):
    """Count how often each character code occurs in ``string``.

    Args:
        string: The text to analyse (any iterable of single characters).

    Returns:
        A list of 127 integers where element ``i`` is the number of
        characters in ``string`` whose code point (``ord``) equals ``i``.

    Characters whose code point does not fit in the table (127 and above,
    i.e. DEL and every non-ASCII character) are ignored rather than
    raising ``IndexError``, so a single stray character cannot abort a scan.
    """
    # The table length stays at 127 because callers derive their expected
    # per-slot count from the same number.
    table_size = 127
    dist = [0] * table_size
    for char in string:
        code = ord(char)
        if code < table_size:
            dist[code] += 1
    return dist
"""Find the Chi Square of a Distribution"""
def chi_square(expected, distribution):
    """Compute the chi-square statistic of observed counts.

    Args:
        expected: The count every slot is compared against (the same value
            is used for all slots).
        distribution: Iterable of observed counts.

    Returns:
        The sum over all observed counts of
        ``(observed - expected) ** 2 / expected``; ``0`` for an empty
        iterable.
    """
    statistic = 0
    for observed in distribution:
        deviation = observed - expected
        statistic += (deviation ** 2) / expected
    return statistic
"""Compares two strings of text and returns the one with the lower Chi Square. Returns a tuple and with the index the Chi Square"""
def compare(a, b, expected=(18 * 1000) / 127):
    """Compare the character distributions of two strings by chi-square.

    Args:
        a: First string.
        b: Second string.
        expected: Expected count per character slot passed to
            ``chi_square``; defaults to ``(18 * 1000) / 127``.

    Returns:
        A tuple ``(index, chi)``. ``chi`` is the larger of the two
        chi-square statistics. ``index`` is ``0`` when the statistic of
        ``a`` is strictly greater than that of ``b`` and ``1`` otherwise
        (including ties).

    NOTE(review): the description preceding this function talks about the
    *lower* chi-square, but the code selects the higher one. Confirm which
    is intended before relying on it.
    """
    scores = [chi_square(expected, frequency(text)) for text in (a, b)]
    if scores[0] > scores[1]:
        index = 0
    else:
        index = 1
    return (index, max(scores))
# Glue file for data analysis
# Will first scan text_files/ and find the Chi-squared value of each file's character distribution
import os
import numpy as np
import sys
import logging
import statistic
import distribution
import regions
import time
# Write log records of level DEBUG and above to output.log
logging.basicConfig(filename='output.log',level=logging.DEBUG)

# ls files
files = os.listdir("text_files")

# Expected value for Chi Square
# NOTE(review): presumably 18,000 characters per file spread evenly over
# 127 character codes -- confirm against the data set.
EXPECTED = (18 * 1000) / 127

# Stores Chi Square values for each file and file contents
# (both lists are indexed in the same order as `files`)
dataset = [0] * len(files)
contents = [""] * len(files)

print("Searching through files...")

# Iterate through each file in the directory
for index, filename in enumerate(files):
    print(" Scanning {}...".format(filename))

    # Use a context manager so the file handle is closed as soon as the
    # contents have been read, instead of leaking one handle per file.
    with open("text_files/" + filename, "r") as handle:
        data = handle.read()

    # Save file contents for later
    contents[index] = data

    # Get character distribution & calculate Chi Square
    freq = distribution.frequency(data)
    dataset[index] = distribution.chi_square(EXPECTED, freq)

print("Done\n\n")
# Five number summary of the per-file Chi Square values: the quartiles come
# from numpy, the extremes from the builtins.
quarts = np.percentile(dataset, [25, 50, 75])
lower_quartile, median, upper_quartile = quarts[0], quarts[1], quarts[2]

# Compute each extreme once; it is needed both as the printed value and to
# look up the file it belongs to.
smallest = min(dataset)
largest = max(dataset)
smallest_file = files[dataset.index(smallest)]
largest_file = files[dataset.index(largest)]

print("Chi Square Distribution of Characters -- Five Number Summary")
print(" MIN = {} ({})".format(smallest, smallest_file))
print(" Q1 = {}".format(lower_quartile))
print(" Q2 = {}".format(median))
print(" Q3 = {}".format(upper_quartile))
print(" MAX = {} ({})".format(largest, largest_file))
print("\n")
# Find outliers
# NOTE(review): 10 and 90 look like lower/upper percentile bounds -- confirm
# against statistic.outliers. Only the first element of its result is used
# here, as a collection of indices into `files` / `dataset` / `contents`.
print("Outliers")
outliers = statistic.outliers(dataset, 10, 90)

# Nothing stands out: report it and stop with a success exit code.
if len(outliers[0]) < 1:
    print(" NONE")
    print("\nNo unusual files found.")
    sys.exit(0)

# List every outlier file together with its Chi Square value
for i in outliers[0]:
    print(" {} (χ² = {})".format(files[i], dataset[i]))

print("\nFinding unusual regions of outlier files...")

# For each outlier, locate the unusual region of its contents, print it and
# record it in the log.
for i in outliers[0]:
    strange = regions.unusal(contents[i])
    print("\n")
    print("{}".format(files[i]))
    print("{}".format(strange))
    logging.debug("{}, {}, {}".format(files[i], dataset[i], strange))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment