Created
September 11, 2023 15:02
-
-
Save berceanu/7047c51f85ca026628de2c089c38c84f to your computer and use it in GitHub Desktop.
Estimate optimal number of histogram bins based on method of Shimazaki and Shinomoto.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Shimazaki, Hideaki, and Shigeru Shinomoto. | |
"A method for selecting the bin size of a time histogram." | |
Neural computation 19.6, 1503 (2007). | |
https://doi.org/10.1162/neco.2007.19.6.1503 | |
""" | |
import argparse | |
from pathlib import Path | |
import matplotlib.pyplot as plt | |
import numpy as np | |
from openpmd_resampler.reader import ParticleDataReader | |
def compute_cost_function(random_numbers, num_bins, weights): | |
min_val, max_val = np.min(random_numbers), np.max(random_numbers) | |
bin_sizes = (max_val - min_val) / num_bins | |
cost_values = np.zeros(np.size(bin_sizes)) | |
for i in range(np.size(num_bins)): | |
bin_edges = np.linspace(min_val, max_val, num_bins[i] + 1) | |
event_counts = np.histogram(random_numbers, bin_edges, weights=weights)[0] | |
mean_event_count = np.mean(event_counts) | |
variance_event_count = ( | |
np.sum((event_counts - mean_event_count) ** 2) / num_bins[i] | |
) | |
cost_values[i] = (2 * mean_event_count - variance_event_count) / ( | |
bin_sizes[i] ** 2 | |
) | |
return cost_values, bin_sizes | |
def select_optimal_bin_size(cost_values): | |
min_cost = np.min(cost_values) | |
optimal_bin_size_index = np.where(cost_values == min_cost)[0][0] | |
return optimal_bin_size_index | |
def plot_results( | |
random_numbers, num_bins, cost_values, bin_sizes, optimal_bin_size_index, weights | |
): | |
min_val, max_val = np.min(random_numbers), np.max(random_numbers) | |
bin_edges = np.linspace(min_val, max_val, num_bins[optimal_bin_size_index] + 1) | |
plt.figure(figsize=(12, 4)) | |
plt.subplot(121) | |
plt.hist(random_numbers, bin_edges, weights=weights) | |
plt.title("Histogram") | |
plt.ylabel("Frequency") | |
plt.xlabel("Value") | |
plt.subplot(122) | |
plt.plot(bin_sizes, cost_values, ".", alpha=0.5) | |
plt.plot(bin_sizes[optimal_bin_size_index], np.min(cost_values), "*r") | |
plt.title("Cost Function") | |
plt.xlabel("Bin Size") | |
plt.ylabel("Cost Function Value") | |
plt.savefig("OptimalHistogramAndCostFunction.png") | |
def main(): | |
# Parse command line arguments | |
parser = argparse.ArgumentParser() | |
parser.add_argument("opmd_path", type=str, help="Path to the OpenPMD file") | |
args = parser.parse_args() | |
opmd_path = Path(args.opmd_path) | |
############################## | |
# Create the dataframe | |
df = ParticleDataReader.from_file( | |
opmd_path, particle_species_name="e_highGamma" | |
) # or "e_all" | |
random_numbers = df["momentum_y_mev_c"] | |
weights = df["weights"] | |
min_num_bins = 100 | |
max_num_bins = 4000 | |
num_bins = np.arange(min_num_bins, max_num_bins) | |
cost_values, bin_sizes = compute_cost_function(random_numbers, num_bins, weights) | |
optimal_bin_size_index = select_optimal_bin_size(cost_values) | |
actual_num_bins = num_bins[optimal_bin_size_index] | |
print(f"Actual number of bins: {actual_num_bins}") | |
plot_results( | |
random_numbers, | |
num_bins, | |
cost_values, | |
bin_sizes, | |
optimal_bin_size_index, | |
weights, | |
) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment