Skip to content

Instantly share code, notes, and snippets.

@atucom
Last active October 29, 2018 23:18
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atucom/3c085308ce73e9903cbe922d129799cd to your computer and use it in GitHub Desktop.
Save atucom/3c085308ce73e9903cbe922d129799cd to your computer and use it in GitHub Desktop.
This returns the files in the target directory (the current directory by default) that are statistical outliers in terms of file size.
#!/usr/bin/env python3
"""
@atucom
This returns the files in the target directory that are
statistical outliers in terms of file size
This is useful in the quest for finding target data.
"""
from __future__ import division
import argparse
import sys
try:
import os
import numpy
except ImportError as e:
print(e)
quit(1)
# Absolute-deviation helper, adapted from a Stack Overflow answer.
# NOTE: although originally labelled "median absolute deviation", the
# computation below is the *mean* absolute deviation around the mean.
def mad(data, axis=None):
    """Return the mean absolute deviation of *data* around its mean.

    Args:
        data: Array-like of numbers.
        axis: Axis to reduce over. ``None`` (the default) reduces over every
            element and yields a scalar.

    Returns:
        The mean of ``abs(data - mean(data))`` along ``axis``.
    """
    values = numpy.asarray(data)
    # keepdims=True keeps the reduced axis as length 1 so the centre
    # broadcasts against ``values`` for any axis, not only the leading one.
    centre = numpy.mean(values, axis=axis, keepdims=True)
    return numpy.mean(numpy.abs(values - centre), axis=axis)
def _mad(x):
    """Score each value by its distance from the median, scaled by ``mad(x)``.

    Args:
        x: Array-like of numbers (the caller passes file sizes).

    Returns:
        A float array shaped like ``x`` holding
        ``abs(x - median(x)) / mad(x)``. Empty input yields an empty array,
        and input with zero spread yields all zeros rather than ``nan``.
    """
    values = numpy.asarray(x, dtype=float)
    if values.size == 0:
        # The median/mean of nothing would only produce nan plus a
        # RuntimeWarning, so hand back the empty array untouched.
        return values
    distances = numpy.abs(values - numpy.median(values))
    spread = mad(values)
    if spread == 0:
        # Every value is identical, so nothing deviates; avoid dividing 0 by 0.
        return numpy.zeros_like(distances)
    return distances / spread
def getFiles(directory):
    """Map every entry name in *directory* to its size in bytes.

    Args:
        directory: Path of the directory to scan.

    Returns:
        A dict of entry name -> ``st_size`` as reported by ``os.stat``.
        Every name returned by ``os.listdir`` is considered, and entries that
        cannot be stat'ed are skipped with a warning on stderr.

    Raises:
        SystemExit: With status 1, after printing a message, when
            *directory* does not exist or is not a directory.
    """
    try:
        filenames = os.listdir(directory)
    except (FileNotFoundError, NotADirectoryError):
        print("Directory not found, try again")
        sys.exit(1)

    files = {}
    for filename in filenames:
        try:
            files[filename] = os.stat(os.path.join(directory, filename)).st_size
        except OSError as err:
            # os.stat follows symlinks, so a dangling link (or an entry that
            # vanished after listdir) would otherwise abort the whole scan.
            print("Skipping %s: %s" % (filename, err), file=sys.stderr)
    return files
def outlierFileNames(files):
    """Print the name of every file whose size is an outlier.

    Args:
        files: dict mapping file name -> size in bytes (the shape built by
            ``getFiles``).

    Each size is scored with ``_mad``; names whose score exceeds the
    threshold are printed one per line. Nothing is returned.
    """
    if not files:
        # Nothing to score; also avoids numpy warnings on empty input.
        return

    # NOTE(review): 1.4826 is better known as the constant that rescales a
    # median absolute deviation into a standard-deviation estimate. The
    # original author was unsure it is the right cutoff here -- confirm.
    threshold = 1.4826

    scores = _mad(list(files.values()))
    # A dict iterates its keys in the same order as files.values(), so each
    # name lines up with its score without rebuilding an item list per step.
    for filename, score in zip(files, scores):
        if score > threshold:
            print(filename)
def main():
    """Parse the command line and print the outlier files.

    Scans the directory given with ``-d``/``--dir``, or the current directory
    when the option is omitted or empty.
    """
    parser = argparse.ArgumentParser(
        description='Returns files with outlier filesizes',
        # The example has to show -d: the directory is an option, not a
        # positional argument, so the bare "<dir to scan>" form is rejected.
        epilog="Example: \n\t %s -d <dir to scan>" % sys.argv[0],
        # Keep the epilog's newline/tab instead of letting argparse reflow it.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '-d', '--dir',
        metavar='Directory',
        dest='dir',
        help='Directory to scan'
    )
    args = parser.parse_args()
    # Fall back to the current directory when -d is absent or empty.
    outlierFileNames(getFiles(args.dir or '.'))
# Script entry point: run the CLI and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment