Skip to content

Instantly share code, notes, and snippets.

@berceanu
Created September 11, 2023 15:02
Show Gist options
  • Save berceanu/7047c51f85ca026628de2c089c38c84f to your computer and use it in GitHub Desktop.
Save berceanu/7047c51f85ca026628de2c089c38c84f to your computer and use it in GitHub Desktop.
Estimate optimal number of histogram bins based on method of Shimazaki and Shinomoto.
"""
Shimazaki, Hideaki, and Shigeru Shinomoto.
"A method for selecting the bin size of a time histogram."
Neural computation 19.6, 1503 (2007).
https://doi.org/10.1162/neco.2007.19.6.1503
"""
import argparse
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from openpmd_resampler.reader import ParticleDataReader
def compute_cost_function(random_numbers, num_bins, weights):
min_val, max_val = np.min(random_numbers), np.max(random_numbers)
bin_sizes = (max_val - min_val) / num_bins
cost_values = np.zeros(np.size(bin_sizes))
for i in range(np.size(num_bins)):
bin_edges = np.linspace(min_val, max_val, num_bins[i] + 1)
event_counts = np.histogram(random_numbers, bin_edges, weights=weights)[0]
mean_event_count = np.mean(event_counts)
variance_event_count = (
np.sum((event_counts - mean_event_count) ** 2) / num_bins[i]
)
cost_values[i] = (2 * mean_event_count - variance_event_count) / (
bin_sizes[i] ** 2
)
return cost_values, bin_sizes
def select_optimal_bin_size(cost_values):
min_cost = np.min(cost_values)
optimal_bin_size_index = np.where(cost_values == min_cost)[0][0]
return optimal_bin_size_index
def plot_results(
random_numbers, num_bins, cost_values, bin_sizes, optimal_bin_size_index, weights
):
min_val, max_val = np.min(random_numbers), np.max(random_numbers)
bin_edges = np.linspace(min_val, max_val, num_bins[optimal_bin_size_index] + 1)
plt.figure(figsize=(12, 4))
plt.subplot(121)
plt.hist(random_numbers, bin_edges, weights=weights)
plt.title("Histogram")
plt.ylabel("Frequency")
plt.xlabel("Value")
plt.subplot(122)
plt.plot(bin_sizes, cost_values, ".", alpha=0.5)
plt.plot(bin_sizes[optimal_bin_size_index], np.min(cost_values), "*r")
plt.title("Cost Function")
plt.xlabel("Bin Size")
plt.ylabel("Cost Function Value")
plt.savefig("OptimalHistogramAndCostFunction.png")
def main():
# Parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("opmd_path", type=str, help="Path to the OpenPMD file")
args = parser.parse_args()
opmd_path = Path(args.opmd_path)
##############################
# Create the dataframe
df = ParticleDataReader.from_file(
opmd_path, particle_species_name="e_highGamma"
) # or "e_all"
random_numbers = df["momentum_y_mev_c"]
weights = df["weights"]
min_num_bins = 100
max_num_bins = 4000
num_bins = np.arange(min_num_bins, max_num_bins)
cost_values, bin_sizes = compute_cost_function(random_numbers, num_bins, weights)
optimal_bin_size_index = select_optimal_bin_size(cost_values)
actual_num_bins = num_bins[optimal_bin_size_index]
print(f"Actual number of bins: {actual_num_bins}")
plot_results(
random_numbers,
num_bins,
cost_values,
bin_sizes,
optimal_bin_size_index,
weights,
)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment