Skip to content

Instantly share code, notes, and snippets.

@arrogantrabbit
Last active April 18, 2023 05:17
Show Gist options
  • Save arrogantrabbit/2d853ce959872c7b351d59247dec21e6 to your computer and use it in GitHub Desktop.
Save arrogantrabbit/2d853ce959872c7b351d59247dec21e6 to your computer and use it in GitHub Desktop.
Create ASCII histogram of the file sizes in the folder hierarchy
#!/bin/zsh
# Create ASCII histogram of the file sizes in the folder hierarchy
#
# Usage: <this script> /path/to/directory
#
# This part validates the argument and collects the sizes. It sets:
#   target            - directory to scan (first argument)
#   max_width_columns - longest histogram bar, in characters
#   sizes             - size in bytes of every regular file under $target,
#                       one per line
target="$1"
max_width_columns=120
if [ -z "$target" ]; then
  # Diagnostics go to stderr so they never mix with histogram output.
  echo "Usage: $0 /path/to/directory" >&2
  exit 1
fi
# Check if the directory exists
if [ ! -d "$target" ]; then
  echo "Error: Directory '$target' not found" >&2
  exit 1
fi
# Print the histogram header
echo "File size distribution histogram for '$target'"
# BSD/macOS stat takes a format via -f; GNU and busybox stat treat -f as
# "file system mode" and take the format via -c. Probe once and pick the
# form that works here.
if stat -f '%z' / >/dev/null 2>&1; then
  sizes=$(find "$target" -type f -exec stat -f '%z' {} +)
else
  sizes=$(find "$target" -type f -exec stat -c '%s' {} +)
fi
# Alternative to get actual disk size -- find "$target" -type f -exec du {} + |
# Render an ASCII histogram of the byte sizes held in $sizes.
#
# Globals read:
#   sizes             - newline-separated file sizes in bytes; blank lines
#                       are ignored
#   bin_size_bytes    - width of each bin in bytes
#   max_width_columns - longest bar to draw, in stars (120 when unset)
# Output (stdout):
#   One row per non-empty bin: "<bound> <unit> | <count> | <stars>".
#   <bound> is the exclusive upper edge of the bin (or the size itself when
#   bin_size_bytes is 1), shown in B, kiB or MiB. When the fullest bin holds
#   more files than max_width_columns, bars are scaled down, a line stating
#   the scale is printed first, and bins holding fewer files than one star
#   are omitted. Nothing is printed when $sizes contains no sizes.
dump_the_histogram()
{
  local width="${max_width_columns:-120}"
  local counts

  # Pass 1: count files per bin and emit "<bound> <count>" sorted by bound.
  counts=$(printf '%s\n' "$sizes" | awk -v bin_size_bytes="$bin_size_bytes" '
    NF {
      count[int($1 / bin_size_bytes)]++
    }
    END {
      # Label each bin with its upper edge; with 1-byte bins the label is the size itself.
      edge = (bin_size_bytes > 1) ? 1 : 0
      for (bin in count)
        printf("%.0f %d\n", (bin + edge) * bin_size_bytes, count[bin])
    }' | sort -n)

  # An empty tree yields no bins, so there is nothing to draw.
  [ -n "$counts" ] || return 0

  # Pass 2: find the fullest bin, then draw every row scaled against it.
  # Doing the scaling inside awk avoids depending on bc and its -e option.
  printf '%s\n' "$counts" | awk -v width="$width" '
    BEGIN {
      width += 0
      most = 0
    }
    {
      bound[NR] = $1
      files[NR] = $2
      if ($2 + 0 > most)
        most = $2 + 0
    }
    END {
      scale = 1
      if (most > width) {
        scale = most / width
        printf("One star represents about %.0f files. Omitting results with counts smaller than that\n", scale)
      }
      for (row = 1; row <= NR; row++) {
        size = bound[row]
        count = files[row]
        if (scale > count)
          continue
        units = "B "
        if (size > 16 * 1024) {
          units = "kiB"
          size = size / 1024
          if (size > 16 * 1024) {
            units = "MiB"
            size = size / 1024
          }
        }
        printf("%10d %s | %10d | ", size, units, count)
        # Multiply before dividing so the fullest bin is exactly width stars wide.
        stars = (most > width) ? int(count * width / most) : count
        for (i = 0; i < stars; i++)
          printf("*")
        printf("\n")
      }
    }'
}
# https://en.wikipedia.org/wiki/Histogram
# The Rice and Sturges rules each give a NUMBER of bins for n samples, not a
# width. The bin width used below is therefore the largest file size divided
# by that number, plus one byte so the largest file still lands in the last
# bin.
# Blank lines are skipped so an empty tree counts as zero files, not one.
data_stats=$(printf '%s\n' "$sizes" | awk '
NF { n++; if ($1 + 0 > max) max = $1 + 0 }
END { printf("%d %.0f\n", n, max) }')
data_count=${data_stats%% *}
max_size=${data_stats##* }

# Rice: about 2 * n^(1/3) bins. "^" is the POSIX awk power operator; "**" is
# an extension that not every awk accepts.
bin_size_bytes=$(awk -v count="$data_count" -v max="$max_size" 'BEGIN {
bins = int(2 * count ^ (1 / 3)) + 1
printf("%.0f\n", int(max / bins) + 1)
}')
echo "--- Bin size by Rice: $bin_size_bytes bytes"
dump_the_histogram

# Sturges: about 1 + log2(n) bins; with no samples fall back to a single bin
# instead of taking log(0).
bin_size_bytes=$(awk -v count="$data_count" -v max="$max_size" 'BEGIN {
bins = (count >= 1) ? int(1 + log(count) / log(2)) : 1
printf("%.0f\n", int(max / bins) + 1)
}')
echo "--- Bin size by Sturges: $bin_size_bytes bytes"
dump_the_histogram

# Fixed bin widths for a practical view of small-file distributions.
for bin_size_bytes in 1024 4096 16384; do
  echo "--- Bin size fixed: $bin_size_bytes bytes"
  dump_the_histogram
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment