@TangentFoxy
Last active January 14, 2024 02:52
Generate statistics on file sizes within a directory. (Lua, using LuaFilesystem.)
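Note: root_directory is hard-coded to ".", so run the script from inside the directory you want to analyze. It expects LuaJIT (per the shebang) with the LuaFilesystem (lfs) module available, e.g. installed via LuaRocks.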
#!/usr/bin/env luajit
-- THIS IS BEING MAINTAINED AT https://github.com/TangentFoxy/.lua-files
-- GO THERE INSTEAD OF DOWNLOADING THIS FILE DIRECTLY.
-- Primarily written by ChatGPT using GPT-3.5, with corrections and modifications by me.
-- Do whatever the hell you want with it.
local lfs = require "lfs"
-- Function to get the filesize of a given file
function get_filesize(filepath)
  local file = io.open(filepath, "rb")
  if file then
    local size = file:seek("end")
    file:close()
    return size
  else
    return nil
  end
end
-- Function to recursively traverse directories and get file sizes
function traverse_directory(path)
  local total_size = 0
  local total_files = 0
  local file_sizes = {}
  for entry in lfs.dir(path) do
    if entry ~= "." and entry ~= ".." then
      local full_path = path .. "/" .. entry -- "/" works as a separator on both Windows and Unix
      local attributes = lfs.attributes(full_path)
      if attributes and attributes.mode == "file" then
        local size = get_filesize(full_path)
        if size then
          print(full_path, size, "bytes")
          table.insert(file_sizes, size)
          total_size = total_size + size
          total_files = total_files + 1
        else
          print(full_path, "File not found or inaccessible")
        end
      elseif attributes and attributes.mode == "directory" then
        local subdir_total_size, subdir_total_files, subdir_file_sizes = traverse_directory(full_path)
        total_size = total_size + subdir_total_size
        total_files = total_files + subdir_total_files
        while #subdir_file_sizes > 0 do
          table.insert(file_sizes, table.remove(subdir_file_sizes))
        end
      end
    end
  end
  return total_size, total_files, file_sizes
end
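-- Note: lfs.attributes follows symbolic links, so a symlink that points back into
-- a parent directory could make the recursion above loop indefinitely;
-- lfs.symlinkattributes could be used instead if link-aware traversal is needed.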
-- Function to calculate evenly spaced percentiles
function calculate_percentiles(data, num_percentiles)
  local result = {}
  table.sort(data)
  for i = 1, num_percentiles do
    local p = (i - 1) / (num_percentiles - 1) * 100
    local index = math.ceil(#data * p / 100)
    if index == 0 then index = 1 end
    result[i] = data[index]
  end
  return result
end
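-- A worked example of the nearest-rank lookup above: for data = {10, 20, 30, 40}
-- and num_percentiles = 3, the requested percentiles are 0, 50, and 100, which
-- map to indices 1 (clamped up from 0), 2, and 4, returning {10, 20, 40}.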
-- Function to print the percentiles table returned from calculate_percentiles
function print_percentiles(percentiles)
  for i, value in ipairs(percentiles) do
    local p = (i - 1) / (#percentiles - 1) * 100
    if p == 50 then
      print(p .. "th percentile (median):", value, "bytes")
    else
      print(p .. "th percentile:", value, "bytes")
    end
  end
end
-- Function to calculate mode
function calculate_mode(data)
  local freq_map = {}
  local max_freq = 0
  local modes = {}
  for _, value in ipairs(data) do
    freq_map[value] = (freq_map[value] or 0) + 1
    if freq_map[value] > max_freq then
      max_freq = freq_map[value]
    end
  end
  if max_freq == 1 then
    return modes, max_freq -- no mode
  end
  for value, freq in pairs(freq_map) do
    if freq == max_freq then
      table.insert(modes, value)
    end
  end
  table.sort(modes)
  return modes, max_freq
end
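-- Example: for sizes {5, 5, 7, 9, 9} both 5 and 9 are modes with frequency 2;
-- for {1, 2, 3} every size occurs once, so an empty table is returned (no mode).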
-- Function to print mode results
function print_mode_results(modes, max_freq)
  if #modes == 0 then
    print("No mode found.")
  elseif #modes == 1 then
    print("Mode:", modes[1], "bytes")
  else
    print("Multiple modes:")
    for i, mode in ipairs(modes) do
      print("Mode " .. i .. ":", mode, "bytes")
    end
  end
  print("Frequency:", max_freq)
end
-- Function to calculate the sample standard deviation
function calculate_standard_deviation(data)
  local n = #data
  local sum = 0
  local sum_of_squared_deviations = 0
  if n < 2 then
    return 0 -- the sample standard deviation is undefined for fewer than two values
  end
  -- Calculate mean
  for _, value in ipairs(data) do
    sum = sum + value
  end
  local mean = sum / n
  -- Calculate sum of squared deviations
  for _, value in ipairs(data) do
    local deviation = value - mean
    sum_of_squared_deviations = sum_of_squared_deviations + deviation^2
  end
  -- Calculate standard deviation
  local variance = sum_of_squared_deviations / (n - 1)
  local standard_deviation = math.sqrt(variance)
  return standard_deviation
end
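-- The formula above is the sample standard deviation,
-- s = sqrt(sum((x_i - mean)^2) / (n - 1)), i.e. with Bessel's correction,
-- which is why fewer than two files short-circuits to 0 instead of dividing by zero.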
-- Function to calculate a histogram
function calculate_histogram(data, num_bins)
  local histogram = {}
  local min_value = math.min(unpack(data))
  local max_value = math.max(unpack(data))
  local bin_width = (max_value - min_value) / num_bins
  for i = 1, num_bins do
    local bin_start = min_value + (i - 1) * bin_width
    local bin_end = bin_start + bin_width
    histogram[i] = {bin_start, bin_end, 0}
  end
  for _, value in ipairs(data) do
    local bin_index = math.floor((value - min_value) / bin_width) + 1
    if bin_index <= num_bins then
      histogram[bin_index][3] = histogram[bin_index][3] + 1
    else
      -- the maximum value always computes to bin num_bins + 1, so clamp it into the last bin
      histogram[num_bins][3] = histogram[num_bins][3] + 1
    end
  end
  return histogram
end
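-- Example: with min_value 0, max_value 100, and 10 bins, bin_width is 10, so a
-- 35-byte file maps to bin math.floor(35 / 10) + 1 = 4 (covering 30-40), while the
-- 100-byte maximum computes to bin 11 and is clamped into bin 10 above.
-- (Also note that math.min/math.max via unpack may fail with "too many results to
-- unpack" on very large file lists.)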
-- Function to print histogram results (plain text; currently unused)
function print_histogram(histogram)
  for i, bin in ipairs(histogram) do
    local bin_start, bin_end, count = unpack(bin)
    print(string.format("%.2f - %.2f:", bin_start, bin_end), count, "files")
  end
end
-- Function to print histogram results with logarithmic scaling and aligned graphical representation
function print_histogram_graphical(histogram, graph_width)
  local max_count = 0
  -- Find the maximum count to determine the scale
  for _, bin in ipairs(histogram) do
    local count = bin[3]
    if count > max_count then
      max_count = count
    end
  end
  local max_log_scaled = math.log(max_count + 1) -- Add 1 to avoid log(0)
  -- Print the histogram with graphical representation and aligned text data
  for _, bin in ipairs(histogram) do
    local bin_start, bin_end, count = unpack(bin)
    local log_scaled_count = math.log(count + 1) -- Add 1 to avoid log(0)
    local scaled_width = math.floor((log_scaled_count / max_log_scaled) * graph_width)
    local bar = string.rep("#", scaled_width)
    local empty_spaces = string.rep(" ", graph_width - scaled_width) -- Add empty spaces for alignment
    print(string.format("[%s%s] %.2f - %.2f: %d files", bar, empty_spaces, bin_start, bin_end, count))
  end
end
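-- Example of the log scaling above: with max_count = 999 and graph_width = 40, a
-- bin holding 99 files gets floor(log(100) / log(1000) * 40) = 26 '#' characters,
-- roughly two thirds of the fullest bin's bar despite holding about a tenth as many files.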
local root_directory = "." -- bodge to work in-place
local total_size, total_files, total_file_sizes = traverse_directory(root_directory)

if total_files > 0 then
  print("")
  print(total_files, "files found.")

  local average_size = total_size / total_files
  print("Average (mean) file size:", average_size, "bytes")

  local standard_deviation = calculate_standard_deviation(total_file_sizes)
  print("Standard deviation:", standard_deviation)

  local mode_results, max_freq = calculate_mode(total_file_sizes)
  print_mode_results(mode_results, max_freq)

  local num_bins = math.ceil(math.sqrt(total_files)) -- Square Root Rule: number of bins = ceil(sqrt(n))
  local histogram_results = calculate_histogram(total_file_sizes, num_bins)
  print_histogram_graphical(histogram_results, 40)

  local percentiles = calculate_percentiles(total_file_sizes, 11)
  print_percentiles(percentiles)
else
  print("No files found.")
end