Skip to content

Instantly share code, notes, and snippets.

@DGrady
Last active November 26, 2019 19:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DGrady/e35ce473733a8cd7c18f80fca56bb27e to your computer and use it in GitHub Desktop.
Save DGrady/e35ce473733a8cd7c18f80fca56bb27e to your computer and use it in GitHub Desktop.
Histogram based on frequency or count data
import numpy as np
import pandas as pd
def frequency_histogram(
        data: pd.DataFrame,
        n_bins=20,
        bins=None,
        log_bins=False,
        normalize=False,
        for_bar=True):
    """Bin pre-aggregated (value, frequency) rows into a histogram table.

    The first column of ``data`` holds the values to bin and the second
    column holds the frequency (count or weight) attached to each value.
    Any further columns are ignored.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose first two columns are the values and their frequencies.
    n_bins : int
        Number of bins to generate when ``bins`` is not given.
    bins
        Passed to ``pandas.cut`` as-is. When given, ``n_bins`` and
        ``log_bins`` are not used.
    log_bins : bool
        Generate logarithmically spaced edges, starting at the power of ten
        at or below the smallest value and ending at the largest value.
        Otherwise edges are linearly spaced between the smallest and the
        largest value.
    normalize : bool
        Divide the per-bin totals by their overall sum.
        CAUTION: this makes ``sum(bin height)`` equal 1; it is *not* a
        density (``sum(bin width * bin height)`` is not 1).
    for_bar : bool
        Return left edges and bin widths, which is the shape
        ``matplotlib.pyplot.bar`` wants.
        CAUTION: you still need to use ``plt.bar(..., align='edge')``.

    Returns
    -------
    pd.DataFrame
        One row per bin (empty bins included, with a total of 0). Columns
        are ``edge_left``, then ``edge_width`` (``for_bar=True``) or
        ``edge_right`` (``for_bar=False``), then the per-bin frequency total
        under the name of the frequency column of ``data``.

    Raises
    ------
    ValueError
        If ``log_bins`` is requested (and ``bins`` is not given) while the
        smallest value is not strictly positive.
    """
    x, f, *_ = data.columns
    xmin, xmax = data[x].min(), data[x].max()

    if bins is None:
        if log_bins:
            # ``not xmin > 0`` also rejects NaN (e.g. an empty column).
            if not xmin > 0:
                raise ValueError(
                    'log_bins=True requires strictly positive values in the '
                    'first column; got minimum {!r}'.format(xmin))
            bins = np.logspace(
                np.floor(np.log10(xmin)), np.log10(xmax), num=n_bins + 1)
            # 10 ** log10(xmax) can round to just below xmax, which would
            # leave the largest value outside the last bin. Pin the edge.
            bins[-1] = xmax
        else:
            bins = np.linspace(xmin, xmax, num=n_bins + 1)

    # ``retbins=True`` hands back the edges that were really used. The
    # interval labels themselves are rounded for display and have their
    # first left edge nudged down by ``include_lowest``.
    assignments, used_bins = pd.cut(
        data[x], bins, include_lowest=True, retbins=True)

    # ``observed=False`` keeps empty bins so the totals line up one-to-one
    # with the bin edges regardless of the pandas default.
    totals = data.groupby(assignments, observed=False)[f].sum()

    if isinstance(used_bins, pd.IntervalIndex):
        left, right = used_bins.left, used_bins.right
    else:
        used_bins = np.asarray(used_bins)
        left, right = used_bins[:-1], used_bins[1:]

    edges = pd.DataFrame(
        {'edge_left': np.asarray(left), 'edge_right': np.asarray(right)},
        columns=['edge_left', 'edge_right'])
    result = edges.join(totals.reset_index(drop=True))

    if for_bar:
        # Give a result that's useful for ``matplotlib.pyplot.bar``: left
        # edges and bin widths.
        result['edge_right'] = result['edge_right'] - result['edge_left']
        result = result.rename(columns={'edge_right': 'edge_width'})

    if normalize:
        # TODO CAUTION This won't produce a properly normalized density - we
        # actually want sum(bin width * bin height) to be 1, but this only
        # ensures sum(bin height) is 1.
        result[f] = result[f] / result[f].sum()

    return result
def histogram_logarithmic(a, w, bins=10):
    """Weighted density histogram of ``a`` on logarithmically spaced bins.

    Parameters
    ----------
    a : array-like
        Sample values; must all be strictly positive.
    w : array-like
        Weight of each sample, passed to ``numpy.histogram`` as ``weights``.
    bins : int
        Number of bins. Edges run from the power of ten at or below the
        smallest sample up to the largest sample.

    Returns
    -------
    height, left_edge, width : tuple of np.ndarray
        Per-bin density (``numpy.histogram`` with ``density=True``), the
        left edge of each bin, and the width of each bin.

    Raises
    ------
    ValueError
        If the smallest sample is not strictly positive.
    """
    a = np.asarray(a)
    amin, amax = a.min(), a.max()
    # ``not amin > 0`` also rejects NaN.
    if not amin > 0:
        raise ValueError(
            'histogram_logarithmic requires strictly positive values; '
            'got minimum {!r}'.format(amin))

    edge = np.logspace(np.floor(np.log10(amin)), np.log10(amax), num=bins + 1)
    # 10 ** log10(amax) can round to just below amax, which would push the
    # largest sample outside the last bin. Pin the edge to the true maximum.
    edge[-1] = amax

    height, edge = np.histogram(a, bins=edge, weights=w, density=True)
    width = np.diff(edge)
    left_edge = edge[:-1]
    return height, left_edge, width
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment