Skip to content

Instantly share code, notes, and snippets.

@lkilcher
Last active September 11, 2023 09:33
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lkilcher/8722204056c117bc1ff0edc5b4b4a29e to your computer and use it in GitHub Desktop.
Save lkilcher/8722204056c117bc1ff0edc5b4b4a29e to your computer and use it in GitHub Desktop.
import numpy as np
import matplotlib.pyplot as plt
def map_hist(x, y, h, bins):
xi = np.digitize(x, bins[0]) - 1
yi = np.digitize(y, bins[1]) - 1
inds = np.ravel_multi_index((xi, yi),
(len(bins[0]) - 1, len(bins[1]) - 1),
mode='clip')
vals = h.flatten()[inds]
bads = ((x < bins[0][0]) | (x > bins[0][-1]) |
(y < bins[1][0]) | (y > bins[1][-1]))
vals[bads] = np.NaN
return vals
def scatter_hist2d(x, y,
s=20, marker=u'o',
mode='mountain',
bins=10, range=None,
normed=False, weights=None, # np.histogram2d args
ax=None, dens_func=None,
**kwargs):
"""
Make a scattered-histogram plot.
Parameters
----------
x, y : array_like, shape (n, )
Input data
s : scalar or array_like, shape (n, ), optional, default: 20
size in points^2.
marker : `~matplotlib.markers.MarkerStyle`, optional, default: 'o'
See `~matplotlib.markers` for more information on the different
styles of markers scatter supports. `marker` can be either
an instance of the class or the text shorthand for a particular
marker.
mode: [None | 'mountain' | 'valley' | 'clip']
Possible values are:
- None : The points are plotted as one scatter object, in the
order in-which they are specified at input.
- 'mountain' : The points are sorted/plotted in the order of
the number of points in their 'bin'. This means that points
in the highest density will be plotted on-top of others. This
cleans-up the edges a bit, the points near the edges will
overlap.
- 'valley' : The reverse order of 'mountain'. The low density
bins are plotted on top of the high ones.
bins : int or array_like or [int, int] or [array, array], optional
The bin specification:
* If int, the number of bins for the two dimensions (nx=ny=bins).
* If array_like, the bin edges for the two dimensions
(x_edges=y_edges=bins).
* If [int, int], the number of bins in each dimension
(nx, ny = bins).
* If [array, array], the bin edges in each dimension
(x_edges, y_edges = bins).
* A combination [int, array] or [array, int], where int
is the number of bins and array is the bin edges.
range : array_like, shape(2,2), optional
The leftmost and rightmost edges of the bins along each dimension
(if not specified explicitly in the `bins` parameters):
``[[xmin, xmax], [ymin, ymax]]``. All values outside of this range
will be considered outliers and not tallied in the histogram.
normed : bool, optional
If False, returns the number of samples in each bin. If True,
returns the bin density ``bin_count / sample_count / bin_area``.
weights : array_like, shape(N,), optional
An array of values ``w_i`` weighing each sample ``(x_i, y_i)``.
Weights are normalized to 1 if `normed` is True. If `normed` is
False, the values of the returned histogram are equal to the sum of
the weights belonging to the samples falling into each bin.
ax : an axes instance to plot into.
dens_func : function or callable (default: None)
A function that modifies (inputs and returns) the dens
values (e.g., np.log10). The default is to not modify the
values.
kwargs : these are all passed on to scatter.
Returns
-------
paths : `~matplotlib.collections.PathCollection`
The scatter instance.
"""
if ax is None:
ax = plt.gca()
h, xe, ye = np.histogram2d(x, y, bins=bins,
range=range, normed=normed,
weights=weights)
# bins = (xe, ye)
dens = map_hist(x, y, h, bins=(xe, ye))
if dens_func is not None:
dens = dens_func(dens)
iorder = slice(None) # No ordering by default
if mode == 'mountain':
iorder = np.argsort(dens)
elif mode == 'valley':
iorder = np.argsort(dens)[::-1]
x = x[iorder]
y = y[iorder]
dens = dens[iorder]
return ax.scatter(x, y,
s=s, c=dens,
marker=marker,
**kwargs)
def scatter_hexbin(
x, y, s=20, C=None, marker='o',
gridsize=100, mode='mountain',
xscale='linear', yscale='linear', extent=None,
cmap=None, norm=None, vmin=None, vmax=None,
reduce_C_function=np.mean,
ax=None,
**kwargs):
"""
Make a scatter plot where points are grouped into 'hexagonal bins'.
If *C* is *None*, the color-value of the points in each bin is
determined by the number of points in the hexagon (i.e., a
hexagonal histogram). Otherwise, *C* specifies values at the
coordinate (x[i], y[i]). For each hexagon, these values are
reduced using *reduce_C_function*. The 'color-values' are then
mapped to colors using the specified colormap.
Parameters
----------
x, y : array-like
The data positions. *x* and *y* must be of the same length.
s : array-like, default: 20
The marker size in points**2. Default is 20.
C : array-like, optional
If given, these values are accumulated in the bins. Otherwise,
every point has a value of 1. Must be of the same length as *x*
and *y*.
marker : MarkerStyle, default: 'o'
The marker style. This is passed directly to scatter.
gridsize : int or (int, int), default: 100
If a single int, the number of hexagons in the *x*-direction.
The number of hexagons in the *y*-direction is chosen such that
the hexagons are approximately regular.
Alternatively, if a tuple (*nx*, *ny*), the number of hexagons
in the *x*-direction and the *y*-direction.
mode: [None | 'mountain' | 'valley']
Possible values are:
- None : The points are plotted as one scatter object, in the
order in-which they are specified at input. This leaves the
edges of hexbins somewhat chaotic.
- 'mountain' : The points are sorted/plotted in the order of
the number of points in their 'bin'. This means that points
in the highest density will be plotted on-top of others. This
cleans-up the edges a bit, the points near the edges will
overlap.
- 'valley' : The reverse order of 'mountain'. The low density
bins are plotted on top of the high ones.
xscale : {'linear', 'log'}, default: 'linear'
Use a linear or log10 scale on the horizontal axis.
yscale : {'linear', 'log'}, default: 'linear'
Use a linear or log10 scale on the vertical axis.
extent : 4-tuple of float, default: *None*
The limits of the bins (xmin, xmax, ymin, ymax).
The default assigns the limits based on
*gridsize*, *x*, *y*, *xscale* and *yscale*.
If *xscale* or *yscale* is set to 'log', the limits are
expected to be the exponent for a power of 10. E.g. for
x-limits of 1 and 50 in 'linear' scale and y-limits
of 10 and 1000 in 'log' scale, enter (1, 50, 1, 3).
Returns
-------
paths : `~matplotlib.collections.PathCollection`
The scatter instance.
Other Parameters
----------------
cmap : str or `~matplotlib.colors.Colormap`, default: :rc:`image.cmap`
The Colormap instance or registered colormap name used to map
the bin values to colors.
norm : `~matplotlib.colors.Normalize`, optional
The Normalize instance scales the bin values to the canonical
colormap range [0, 1] for mapping to colors. By default, the data
range is mapped to the colorbar range using linear scaling.
vmin, vmax : float, default: None
The colorbar range. If *None*, suitable min/max values are
automatically chosen by the `.Normalize` instance (defaults to
the respective min/max values of the bins in case of the default
linear scaling).
It is an error to use *vmin*/*vmax* when *norm* is given.
reduce_C_function : callable, default: `numpy.mean`
The function to aggregate *C* within the bins. It is ignored if
*C* is not given. This must have the signature::
def reduce_C_function(C: array) -> float
Commonly used functions are:
- `numpy.mean`: average of the points
- `numpy.sum`: integral of the point values
- `numpy.amax`: value taken from the largest point
**kwargs : These are all passed on to scatter
Notes
-----
For best results, you typically have to make the marker size
(``s``) much smaller than the hexbin size. Setting
edgecolor='none' can also be helpful to reduce markersize.
See Also
--------
scatter : a scatter plot.
hexbin : 2D hexagonal binning plot of points x, y.
hist2d : A 2D histogram
scatter_hist2d : a 2D 'scatter histogram' with rectangular bins.
"""
if ax is None:
ax = plt.gca()
# Much of this was copied <matplotlib>/lib/matplotlib/axes/_axes.py
# Set the size of the hexagon grid
if np.iterable(gridsize):
nx, ny = gridsize
else:
nx = gridsize
ny = int(nx / np.sqrt(3))
# Count the number of data in each hexagon
x = np.asarray(x, float)
y = np.asarray(y, float)
# Will be log()'d if necessary, and then rescaled.
tx = x
ty = y
if xscale == 'log':
if np.any(x <= 0.0):
raise ValueError("x contains non-positive values, so can not "
"be log-scaled")
tx = np.log10(tx)
if yscale == 'log':
if np.any(y <= 0.0):
raise ValueError("y contains non-positive values, so can not "
"be log-scaled")
ty = np.log10(ty)
if extent is not None:
xmin, xmax, ymin, ymax = extent
else:
xmin, xmax = (tx.min(), tx.max()) if len(x) else (0, 1)
ymin, ymax = (ty.min(), ty.max()) if len(y) else (0, 1)
#####
# Start hexagon magic
nx1 = nx + 1
ny1 = ny + 1
nx2 = nx
ny2 = ny
n = nx1 * ny1 + nx2 * ny2
# In the x-direction, the hexagons exactly cover the region from
# xmin to xmax. Need some padding to avoid roundoff errors.
padding = 1.e-9 * (xmax - xmin)
xmin -= padding
xmax += padding
sx = (xmax - xmin) / nx
sy = (ymax - ymin) / ny
# Positions in hexagon index coordinates.
ix = (tx - xmin) / sx
iy = (ty - ymin) / sy
ix1 = np.round(ix).astype(int)
iy1 = np.round(iy).astype(int)
ix2 = np.floor(ix).astype(int)
iy2 = np.floor(iy).astype(int)
# flat indices, plus one so that out-of-range points go to position 0.
i1 = np.where((0 <= ix1) & (ix1 < nx1) & (0 <= iy1) & (iy1 < ny1),
ix1 * ny1 + iy1 + 1, 0)
i2 = np.where((0 <= ix2) & (ix2 < nx2) & (0 <= iy2) & (iy2 < ny2),
ix2 * ny2 + iy2 + 1, 0)
d1 = (ix - ix1) ** 2 + 3.0 * (iy - iy1) ** 2
d2 = (ix - ix2 - 0.5) ** 2 + 3.0 * (iy - iy2 - 0.5) ** 2
# Which points are in the 'first set' vs. 'second set'?
bdist = (d1 < d2)
# This is the index of the hexagon 'number'
ihex = i2 + nx1 * ny1 - 1
ihex[bdist] = i1[bdist] - 1
# End hexagon magic
#####
if C is None: # [1:] drops out-of-range points.
counts1 = np.bincount(i1[bdist], minlength=1 + nx1 * ny1)[1:]
counts2 = np.bincount(i2[~bdist], minlength=1 + nx2 * ny2)[1:]
accum = np.concatenate([counts1, counts2]).astype(float)
C = np.ones(len(x))
else:
# store the C values in a list per hexagon index
Cs_at_i1 = [[] for _ in range(1 + nx1 * ny1)]
Cs_at_i2 = [[] for _ in range(1 + nx2 * ny2)]
for i in range(len(x)):
if bdist[i]:
Cs_at_i1[i1[i]].append(C[i])
else:
Cs_at_i2[i2[i]].append(C[i])
accum = np.array(
[reduce_C_function(acc) if len(acc) > mincnt else np.nan
for Cs_at_i in [Cs_at_i1, Cs_at_i2]
for acc in Cs_at_i[1:]], # [1:] drops out-of-range points.
float)
good_idxs = ~np.isnan(accum)
offsets = np.zeros((n, 2), float)
offsets[:nx1 * ny1, 0] = np.repeat(np.arange(nx1), ny1)
offsets[:nx1 * ny1, 1] = np.tile(np.arange(ny1), nx1)
offsets[nx1 * ny1:, 0] = np.repeat(np.arange(nx2) + 0.5, ny2)
offsets[nx1 * ny1:, 1] = np.tile(np.arange(ny2), nx2) + 0.5
offsets[:, 0] *= sx
offsets[:, 1] *= sy
offsets[:, 0] += xmin
offsets[:, 1] += ymin
# remove accumulation bins with no data
offsets = offsets[good_idxs, :]
accum = accum[good_idxs]
cvals = accum[ihex]
iorder = slice(None) # No ordering by default
if mode == 'mountain':
iorder = np.argsort(cvals)
elif mode == 'valley':
iorder = np.argsort(cvals)[::-1]
x = x[iorder]
y = y[iorder]
cvals = cvals[iorder]
return ax.scatter(x, y,
s=s, c=cvals, marker=marker,
cmap=None, norm=None, vmin=None, vmax=None,
**kwargs)
if __name__ == '__main__':
import matplotlib as mpl
mpl.use('TkAgg')
plt.ion()
randgen = np.random.RandomState(84309242)
npoint = 10000
x = randgen.randn(npoint)
y = 2 * randgen.randn(npoint) + x
extent = [-10, 10, -10, 10]
bins = np.linspace(extent[0], extent[1], 50)
hexbin_gridsize = [50, 20]
fig, axs = plt.subplots(3, 2, figsize=[6, 8],
gridspec_kw=dict(hspace=0.3),
sharex=True, sharey=True)
ax = axs[0, 0]
ax.plot(x, y, '.', color='b', )
ax.set_title("Traditional Scatterplot")
ax = axs[1, 0]
ax.hist2d(x, y, bins=[bins, bins])
ax.set_title("Traditional 2-D Histogram")
ax = axs[2, 0]
scatter_hist2d(x, y, bins=[ bins, bins], ax=ax, s=5, edgecolor='none')
ax.set_title("Scatter histogram combined!")
axs[0,1].set_visible(False)
ax = axs[1, 1]
ax.hexbin(x, y, gridsize=hexbin_gridsize, extent=extent, linewidths=0.2)
ax.set_title("A 'hexbin' histogram")
ax = axs[2, 1]
scatter_hexbin(x, y, gridsize=hexbin_gridsize, extent=extent, ax=ax, s=5, edgecolor='none')
ax.set_title("Scatter hexbin combined!")
ax.set_xlim([-10,10])
ax.set_ylim([-5,5])
fig.text(0.5, 0.02, "Note: The hexbin (right) doesn't have as much 'striping'\n"
"as a rectangular 2-D histogram (left).", ha='center', va='bottom', size='large')
fig.savefig('ScatterHist_Example.png', dpi=200)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment