Last active
November 26, 2019 19:12
-
-
Save DGrady/e35ce473733a8cd7c18f80fca56bb27e to your computer and use it in GitHub Desktop.
Histogram based on frequency or count data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
def frequency_histogram(
        data: pd.DataFrame,
        n_bins=20,
        bins=None,
        log_bins=False,
        normalize=False,
        for_bar=True) -> pd.DataFrame:
    """Bin pre-aggregated (value, frequency) data into a histogram table.

    The first column of ``data`` holds the values to bin and the second
    column holds the count or frequency recorded for each value; any
    further columns are ignored.

    Parameters
    ----------
    data
        Frame whose first two columns are the values and their frequencies.
    n_bins
        Number of bins to generate when ``bins`` is not supplied.
    bins
        Explicit bins, passed straight to ``pandas.cut``.  When given,
        ``n_bins`` and ``log_bins`` are not used.
    log_bins
        When true (and ``bins`` is ``None``), generate logarithmically
        spaced edges running from the power of ten at or below the minimum
        value up to the maximum value.
    normalize
        When true, divide every bin total by the sum of all bin totals.
    for_bar
        When true, replace the ``edge_right`` column by ``edge_width``
        (right edge minus left edge), which is the shape expected by
        ``matplotlib.pyplot.bar``.

    Returns
    -------
    pandas.DataFrame
        One row per bin (empty bins included) with the columns
        ``edge_left``, ``edge_width`` (or ``edge_right`` when ``for_bar``
        is false) and the summed frequency column, which keeps its original
        name.  The edges are read from the interval labels produced by
        ``pandas.cut``, so they can differ slightly from the exact bin
        edges (label rounding, and the lowest edge being nudged down by
        ``include_lowest``).

    Raises
    ------
    ValueError
        If ``log_bins`` is requested and the smallest value is not strictly
        positive.
    """
    x, f, *_ = data.columns

    if bins is None:
        xmin, xmax = data[x].min(), data[x].max()
        if log_bins:
            # ``not xmin > 0`` also rejects NaN (e.g. an empty column).
            if not xmin > 0:
                raise ValueError(
                    "log_bins requires strictly positive values; "
                    f"minimum of column {x!r} is {xmin!r}"
                )
            bins = np.logspace(
                np.floor(np.log10(xmin)), np.log10(xmax), num=n_bins + 1)
            # 10 ** log10(xmax) may round just below xmax; the bins are
            # right-closed, so that would silently drop the largest rows.
            bins[-1] = xmax
        else:
            bins = np.linspace(xmin, xmax, num=n_bins + 1)

    assignments = pd.cut(data[x], bins, include_lowest=True)

    # observed=False keeps empty bins in the result so that the totals line
    # up row-for-row with the full list of bin edges built below.
    totals = data.groupby(assignments, observed=False)[f].sum()

    edges = (
        totals.index   # `totals` has a Categorical index
        .categories    # the categories are represented as an ``IntervalIndex``
        .to_tuples()   # convert the ``IntervalIndex`` to a regular index of tuples
        .tolist()      # convert the regular index to a list of tuples
    )
    edges = pd.DataFrame(edges, columns=['edge_left', 'edge_right'])
    result = edges.join(totals.reset_index(drop=True))

    if for_bar:
        # Give a result that's useful for ``matplotlib.pyplot.bar`` that
        # has left edges and bin widths.
        #
        # CAUTION You still need to use `plt.bar( ... , align='edge')`
        result['edge_right'] = result['edge_right'] - result['edge_left']
        result = result.rename(columns={'edge_right': 'edge_width'})

    if normalize:
        # TODO CAUTION This won't produce a properly normalized density - we
        # actually want sum(bin width * bin height) to be 1, but this only
        # ensures sum(bin height) is 1
        result[f] = result[f] / result[f].sum()

    return result
def histogram_logarithmic(a, w, bins=10):
    """Compute a weighted density histogram over logarithmically spaced bins.

    Parameters
    ----------
    a
        Array-like of strictly positive sample values.
    w
        Weights, one per sample, forwarded to ``numpy.histogram``.
    bins
        Number of bins.  The edges run from the power of ten at or below
        ``min(a)`` up to ``max(a)``.

    Returns
    -------
    tuple
        ``(height, left_edge, width)``: the density value of each bin
        (``numpy.histogram`` with ``density=True``), the left edge of each
        bin and the width of each bin.

    Raises
    ------
    ValueError
        If the smallest sample is not strictly positive.
    """
    a = np.asarray(a)
    amin, amax = a.min(), a.max()
    if not amin > 0:
        raise ValueError(
            "histogram_logarithmic requires strictly positive samples; "
            f"minimum is {amin!r}"
        )

    edge = np.logspace(np.floor(np.log10(amin)), np.log10(amax), num=bins + 1)
    # 10 ** log10(amax) may round just below amax, and numpy.histogram
    # discards samples beyond the last edge; pin the edge to the true maximum
    # so the largest samples are always counted.
    edge[-1] = amax

    height, edge = np.histogram(a, bins=edge, weights=w, density=True)
    width = np.diff(edge)
    left_edge = edge[:-1]
    return height, left_edge, width
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment