Last active
October 29, 2018 23:18
-
-
Save atucom/3c085308ce73e9903cbe922d129799cd to your computer and use it in GitHub Desktop.
This returns the files in the current directory that are statistical outliers in terms of file size
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
@atucom | |
This returns the files in the target directory that are | |
statistical outliers in terms of file size | |
This is useful in the quest for finding target data. | |
""" | |
from __future__ import division | |
import argparse | |
import sys | |
try: | |
import os | |
import numpy | |
except ImportError as e: | |
print(e) | |
quit(1) | |
# Median absolute deviation | |
# stole the complicated stuff from stackoverflow, thanks! | |
def mad(data, axis=None):
    """Return the median absolute deviation (MAD) of *data*.

    The MAD is the median of the absolute distances between each value and
    the median of the values.  Unlike a mean-based spread it is barely moved
    by a handful of extreme values, which is what makes it usable as a
    yardstick for spotting those extreme values.

    Args:
        data: Array-like of numbers (a plain list is accepted).
        axis: Axis along which to compute the MAD.  ``None`` (the default)
            treats *data* as one flat collection.

    Returns:
        A numpy scalar when ``axis`` is ``None``, otherwise an array with
        ``axis`` reduced away.
    """
    values = numpy.asarray(data, dtype=float)
    # keepdims=True keeps the reduced axis as length 1 so the centre
    # broadcasts against `values` for any choice of axis.
    center = numpy.median(values, axis=axis, keepdims=True)
    return numpy.median(numpy.abs(values - center), axis=axis)
def _mad(x):
    """Score every value in *x* by its distance from the median.

    Each score is ``abs(value - median(x))`` divided by ``mad(x)``, so a
    larger score means the value sits further from the centre relative to
    the overall spread.

    Args:
        x: Array-like of numbers (a plain list is accepted).

    Returns:
        A float numpy array with one score per input value, in input order.
        An empty input yields an empty array.  When the spread is zero,
        values equal to the median score ``0.0`` and all others score
        ``inf``.
    """
    values = numpy.asarray(x, dtype=float)
    if values.size == 0:
        # Nothing to score; also avoids numpy's empty-slice warnings.
        return values

    deviations = numpy.abs(values - numpy.median(values))
    scale = mad(values)
    if scale == 0:
        # Dividing would emit runtime warnings and produce NaN for 0/0.
        # Spell the result out instead: no distance -> 0, any distance -> inf.
        return numpy.where(deviations > 0, numpy.inf, 0.0)
    return deviations / scale
def getFiles(directory):
    """Map every entry name in *directory* to its size in bytes.

    Every name returned by ``os.listdir`` is stat'ed, so sub-directories are
    included alongside regular files.  Entries that cannot be stat'ed (for
    example a dangling symlink, or a file removed while scanning) are
    skipped instead of aborting the whole scan.

    Args:
        directory: Path of the directory to scan.

    Returns:
        dict mapping entry name (not the full path) to ``st_size``.

    Raises:
        SystemExit: with status 1, after printing a message, when
            *directory* does not exist or is not a directory.
    """
    try:
        filenames = os.listdir(directory)
    except (FileNotFoundError, NotADirectoryError):
        print("Directory not found, try again")
        sys.exit(1)

    files = {}
    for filename in filenames:
        try:
            files[filename] = os.stat(os.path.join(directory, filename)).st_size
        except OSError:
            # The entry vanished or is a broken link; it has no usable size.
            continue
    return files
def outlierFileNames(files, threshold=1.4826):
    """Print, and return, the names whose size is a statistical outlier.

    Every size is scored with ``_mad``; a name is reported when its score is
    strictly greater than *threshold*.

    Args:
        files: dict mapping file name to size in bytes.
        threshold: Score above which a file counts as an outlier.
            NOTE(review): 1.4826 is the constant conventionally used to
            scale a median absolute deviation into a standard-deviation
            estimate; using it directly as the cut-off is the original
            author's choice and may deserve tuning.

    Returns:
        list of the outlier names, in the iteration order of *files*.  Each
        name is also printed on its own line.
    """
    if not files:
        # No sizes means no statistics and therefore no outliers.
        return []

    # Keys and values of the same dict iterate in matching order, so the
    # scores line up with the names without rebuilding a list per lookup.
    scores = _mad(list(files.values()))
    outliers = [name for name, score in zip(files, scores) if score > threshold]
    for name in outliers:
        print(name)
    return outliers
def main():
    """Parse the command line and report outlier files for one directory.

    The directory comes from ``-d/--dir``; when the option is absent or
    empty the current directory is scanned.  The outlier names are printed
    by ``outlierFileNames``.
    """
    parser = argparse.ArgumentParser(
        description='Returns files with outlier filesizes',
        # The directory is passed with -d, so the example must show it.
        # argparse expands %(prog)s to the program name.
        epilog="Example: \n\t %(prog)s -d <dir to scan>",
        # Keep the line break and tab in the epilog instead of re-wrapping.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '-d', '--dir',
        metavar='Directory',
        dest='dir',
        default='.',
        help='Directory to scan (default: current directory)',
    )
    args = parser.parse_args()
    # `or '.'` keeps the old behaviour for an explicitly empty -d value.
    outlierFileNames(getFiles(args.dir or '.'))
# Run the command-line interface only when this file is executed as a script,
# not when it is imported.  The value returned by main() is handed to
# sys.exit() as the process exit status.
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment