-
-
Save arrogantrabbit/2d853ce959872c7b351d59247dec21e6 to your computer and use it in GitHub Desktop.
Create ASCII histogram of the file sizes in the folder hierarchy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh
#
# Create ASCII histogram of the file sizes in the folder hierarchy.
#
# Usage: <this script> /path/to/directory
#
# This first part validates the argument and collects the size in bytes of
# every regular file below the directory into $sizes (one number per line).

target="$1"
max_width_columns=120   # longest histogram bar, in characters

if [ -z "$target" ]; then
  echo "Usage: $0 /path/to/directory" >&2
  exit 1
fi

# Check if the directory exists
if [ ! -d "$target" ]; then
  echo "Error: Directory '$target' not found" >&2
  exit 1
fi

# Print the histogram header
echo "File size distribution histogram for '$target'"

# GNU stat prints the byte size with -c '%s'; BSD/macOS stat rejects -c and
# spells the same thing -f '%z'. Probe once and use whichever is available.
if stat -c '%s' "$target" >/dev/null 2>&1; then
  sizes=$(find "$target" -type f -exec stat -c '%s' {} +)
else
  sizes=$(find "$target" -type f -exec stat -f '%z' {} +)
fi
# Alternative to get actual disk size -- find "$target" -type f -exec du {} +

# Without this guard the empty string would later be counted as a single
# zero-byte file and a bogus one-row histogram would be printed.
if [ -z "$sizes" ]; then
  echo "No files found in '$target'"
  exit 0
fi
# Print an ASCII histogram of the file sizes held in $sizes.
#
# Globals read:
#   sizes              newline-separated file sizes in bytes
#   bin_size_bytes     width of one histogram bin in bytes (values below 1 are treated as 1)
#   max_width_columns  longest bar allowed, in stars (120 when unset)
# Output (stdout):
#   One row per non-empty bin, ascending by size:
#     "<label> <unit> | <file count> | <stars>"
#   The label is the upper bound of the bin, or the exact size when
#   bin_size_bytes is 1. Labels above 16*1024 are shown in kiB, then MiB.
#   When the fullest bin has more files than max_width_columns, a notice is
#   printed first, bars are scaled down to fit, and bins holding fewer files
#   than one star stands for are omitted.
dump_the_histogram()
{
  local counts

  # Pass 1: bucket the sizes into "label count" lines, ordered by label.
  counts=$(printf '%s\n' "$sizes" | awk -v bin_size_bytes="$bin_size_bytes" '
    BEGIN {
      if (bin_size_bytes < 1)
        bin_size_bytes = 1
    }
    # Ignore blank or non-numeric lines so that an empty list yields no bins.
    $1 ~ /^[0-9]+$/ {
      count[int($1 / bin_size_bytes)]++
    }
    END {
      upper = (bin_size_bytes > 1) ? 1 : 0
      # %.0f keeps large labels in plain decimal form on every awk.
      for (bin in count)
        printf("%.0f %d\n", (bin + upper) * bin_size_bytes, count[bin])
    }' |
    sort -n)

  # Pass 2: find the fullest bin, derive the scale and draw the rows.
  # Everything numeric stays inside awk, so neither bc nor locale-dependent
  # shell float parsing is needed.
  printf '%s\n' "$counts" | awk -v width="${max_width_columns:-120}" '
    NF >= 2 {
      rows++
      label[rows] = $1 + 0
      files[rows] = $2 + 0
      if (files[rows] > peak)
        peak = files[rows]
    }
    END {
      scale = 1
      if (width > 0 && peak > width) {
        scale = peak / width
        printf("One star represents about %.0f files. Omitting results with counts smaller than that\n", scale)
      }
      for (r = 1; r <= rows; r++) {
        if (files[r] < scale)
          continue
        size = label[r]
        units = "B"
        if (size > 16 * 1024) {
          units = "kiB"
          size = size / 1024
          if (size > 16 * 1024) {
            units = "MiB"
            size = size / 1024
          }
        }
        # %-3s pads the unit so the bars of B, kiB and MiB rows line up.
        printf("%10d %-3s | %10d | ", size, units, files[r])
        for (i = 0; i < int(files[r] / scale); i++)
          printf("*")
        printf("\n")
      }
    }'
}
# https://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width
#
# The Rice and Sturges rules yield a NUMBER of bins, not a bin width:
#   Rice:    bins = ceil(2 * n^(1/3))
#   Sturges: bins = ceil(log2(n)) + 1
# The width handed to dump_the_histogram is therefore derived from the
# observed size range: width = ceil((largest - smallest) / bins).

# Print the bin width in bytes that a bin-count rule implies for $sizes.
#
# Arguments: $1 - rule name: "rice", anything else selects Sturges
# Globals:   sizes (read) - newline-separated file sizes in bytes
# Outputs:   the width as a whole number of bytes, never less than 1
#            (1 is also printed when $sizes holds no numbers)
bin_width_by_rule()
{
  printf '%s\n' "$sizes" | awk -v rule="$1" '
    # Round up, tolerating floating-point noise just above a whole number.
    function ceil(x,   whole) {
      whole = int(x)
      return (x - whole > 1e-9) ? whole + 1 : whole
    }
    $1 ~ /^[0-9]+$/ {
      value = $1 + 0
      n++
      if (n == 1 || value < low)
        low = value
      if (n == 1 || value > high)
        high = value
    }
    END {
      if (n < 1) {
        print 1
        exit
      }
      if (rule == "rice")
        bins = ceil(2 * n ^ (1 / 3))
      else
        bins = 1 + ceil(log(n) / log(2))
      width = ceil((high - low) / bins)
      printf("%.0f\n", (width < 1) ? 1 : width)
    }'
}

bin_size_bytes=$(bin_width_by_rule rice)
echo "--- Bin size by Rice: $bin_size_bytes bytes"
dump_the_histogram

bin_size_bytes=$(bin_width_by_rule sturges)
echo "--- Bin size by Sturges: $bin_size_bytes bytes"
dump_the_histogram

# Fixed widths matching common filesystem block sizes.
for bin_size_bytes in 1024 4096 16384; do
  echo "--- Bin size fixed: $bin_size_bytes bytes"
  dump_the_histogram
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment