From f2da1f00b8ea16e32096f9bf1f323468730317fe Mon Sep 17 00:00:00 2001 From: Jody Klymak Date: Sun, 3 Dec 2023 08:54:22 -0800 Subject: [PATCH] DOC: normalizing histograms --- galleries/examples/statistics/hist.py | 4 +- .../examples/statistics/histogram_features.py | 60 ----- .../statistics/histogram_normalization.py | 255 ++++++++++++++++++ 3 files changed, 257 insertions(+), 62 deletions(-) delete mode 100644 galleries/examples/statistics/histogram_features.py create mode 100644 galleries/examples/statistics/histogram_normalization.py diff --git a/galleries/examples/statistics/hist.py b/galleries/examples/statistics/hist.py index 9e1a308153ef..8b06093913df 100644 --- a/galleries/examples/statistics/hist.py +++ b/galleries/examples/statistics/hist.py @@ -36,6 +36,8 @@ axs[0].hist(dist1, bins=n_bins) axs[1].hist(dist2, bins=n_bins) +plt.show() + # %% # Updating histogram colors @@ -99,8 +101,6 @@ # We can also define custom numbers of bins for each axis axs[2].hist2d(dist1, dist2, bins=(80, 10), norm=colors.LogNorm()) -plt.show() - # %% # # .. admonition:: References diff --git a/galleries/examples/statistics/histogram_features.py b/galleries/examples/statistics/histogram_features.py deleted file mode 100644 index 21f74fd1ac44..000000000000 --- a/galleries/examples/statistics/histogram_features.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -============================================== -Some features of the histogram (hist) function -============================================== - -In addition to the basic histogram, this demo shows a few optional features: - -* Setting the number of data bins. -* The *density* parameter, which normalizes bin heights so that the integral of - the histogram is 1. The resulting histogram is an approximation of the - probability density function. - -Selecting different bin counts and sizes can significantly affect the shape -of a histogram. The Astropy docs have a great section_ on how to select these -parameters. - -.. 
_section: http://docs.astropy.org/en/stable/visualization/histogram.html -""" - -import matplotlib.pyplot as plt -import numpy as np - -rng = np.random.default_rng(19680801) - -# example data -mu = 106 # mean of distribution -sigma = 17 # standard deviation of distribution -x = rng.normal(loc=mu, scale=sigma, size=420) - -num_bins = 42 - -fig, ax = plt.subplots() - -# the histogram of the data -n, bins, patches = ax.hist(x, num_bins, density=True) - -# add a 'best fit' line -y = ((1 / (np.sqrt(2 * np.pi) * sigma)) * - np.exp(-0.5 * (1 / sigma * (bins - mu))**2)) -ax.plot(bins, y, '--') -ax.set_xlabel('Value') -ax.set_ylabel('Probability density') -ax.set_title('Histogram of normal distribution sample: ' - fr'$\mu={mu:.0f}$, $\sigma={sigma:.0f}$') - -# Tweak spacing to prevent clipping of ylabel -fig.tight_layout() -plt.show() - -# %% -# -# .. admonition:: References -# -# The use of the following functions, methods, classes and modules is shown -# in this example: -# -# - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist` -# - `matplotlib.axes.Axes.set_title` -# - `matplotlib.axes.Axes.set_xlabel` -# - `matplotlib.axes.Axes.set_ylabel` diff --git a/galleries/examples/statistics/histogram_normalization.py b/galleries/examples/statistics/histogram_normalization.py new file mode 100644 index 000000000000..9418b7af002b --- /dev/null +++ b/galleries/examples/statistics/histogram_normalization.py @@ -0,0 +1,255 @@ +""" +.. redirect-from:: /gallery/statistics/histogram_features + +=================================== +Histogram bins, density, and weight +=================================== + +The `.Axes.hist` method can flexibly create histograms in a few different ways, +which is flexible and helpful, but can also lead to confusion. 
In particular, +you can: + +- bin the data as you want, either with an automatically chosen number of + bins, or with fixed bin edges, +- normalize the histogram so that its integral is one, +- and assign weights to the data points, so that each data point affects the + count in its bin differently. + +The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results, +therefore users should consult the numpy documentation for a definitive guide. + +Histograms are created by defining bin edges, and taking a dataset of values +and sorting them into the bins, and counting or summing how much data is in +each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3 +bins: +""" + +import matplotlib.pyplot as plt +import numpy as np + +rng = np.random.default_rng(19680801) + +xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3]) +xbins = np.array([1, 2, 3, 4]) + +# changing the style of the histogram bars just to make it +# very clear where the boundaries of the bins are: +style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3} + +fig, ax = plt.subplots() +ax.hist(xdata, bins=xbins, **style) + +# plot the xdata locations on the x axis: +ax.plot(xdata, 0*xdata, 'd') +ax.set_ylabel('Number per bin') +ax.set_xlabel('x bins (dx=1.0)') + +# %% +# Modifying bins +# ============== +# +# Changing the bin size changes the shape of this sparse histogram, so it's a +# good idea to choose bins with some care with respect to your data. Here we +# make the bins half as wide.
+ +xbins = np.arange(1, 4.5, 0.5) + +fig, ax = plt.subplots() +ax.hist(xdata, bins=xbins, **style) +ax.plot(xdata, 0*xdata, 'd') +ax.set_ylabel('Number per bin') +ax.set_xlabel('x bins (dx=0.5)') + +# %% +# We can also let numpy (via Matplotlib) choose the bins automatically, or +# specify a number of bins to choose automatically: + +fig, ax = plt.subplot_mosaic([['auto', 'n4']], + sharex=True, sharey=True, layout='constrained') + +ax['auto'].hist(xdata, **style) +ax['auto'].plot(xdata, 0*xdata, 'd') +ax['auto'].set_ylabel('Number per bin') +ax['auto'].set_xlabel('x bins (auto)') + +ax['n4'].hist(xdata, bins=4, **style) +ax['n4'].plot(xdata, 0*xdata, 'd') +ax['n4'].set_xlabel('x bins ("bins=4")') + +# %% +# Normalizing histograms: density and weight +# ========================================== +# +# Counts-per-bin is the default length of each bar in the histogram. However, +# we can also normalize the bar lengths as a probability density function using +# the ``density`` parameter: + +fig, ax = plt.subplots() +ax.hist(xdata, bins=xbins, density=True, **style) +ax.set_ylabel('Probability density [$V^{-1}$])') +ax.set_xlabel('x bins (dx=0.5 $V$)') + +# %% +# This normalization can be a little hard to interpret when just exploring the +# data. The value attached to each bar is divided by the total number of data +# points *and* the width of the bin, and thus the values *integrate* to one +# when integrating across the full range of data. +# e.g. :: +# +# density = counts / (sum(counts) * np.diff(bins)) +# np.sum(density * np.diff(bins)) == 1 +# +# This normalization is how `probability density functions +# <https://en.wikipedia.org/wiki/Probability_density_function>`_ are defined in +# statistics.
If :math:`X` is a random variable on :math:`x`, then :math:`f_X` +# is the probability density function if :math:`P[a<X<b] = \int_a^b f_X dx`. +# If the units of x are Volts, then the units of :math:`f_X` are :math:`V^{-1}` +# or probability per change in voltage. +# +# The usefulness of this normalization is a little more clear when we draw from +# a known distribution and try to compare with theory. So, choose 1000 points +# from a `normal distribution +# <https://en.wikipedia.org/wiki/Normal_distribution>`_, and also calculate the +# known probability density function: + +xdata = rng.normal(size=1000) +xpdf = np.arange(-4, 4, 0.1) +pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2) + +# %% +# If we don't use ``density=True``, we need to scale the expected probability +# distribution function by both the length of the data and the width of the +# bins: + +fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') +dx = 0.1 +xbins = np.arange(-4, 4, dx) +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') + +# scale and plot the expected pdf: +ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$') +ax['False'].set_ylabel('Count per bin') +ax['False'].set_xlabel('x bins [V]') +ax['False'].legend() + +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') +ax['True'].plot(xpdf, pdf, label='$f_X(x)$') +ax['True'].set_ylabel('Probability density [$V^{-1}$]') +ax['True'].set_xlabel('x bins [$V$]') +ax['True'].legend() + +# %% +# One advantage of using the density is therefore that the shape and amplitude +# of the histogram do not depend on the size of the bins. Consider an +# extreme case where the bins do not have the same width. In this example, the +# bins below ``x=-1.25`` are six times wider than the rest of the bins.
By +# normalizing by density, we preserve the shape of the distribution, whereas if +# we do not, then the wider bins have much higher counts than the thinner bins: + +fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') +dx = 0.1 +xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)]) +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') +ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$') +ax['False'].set_ylabel('Count per bin') +ax['False'].set_xlabel('x bins [V]') +ax['False'].legend() + +ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density') +ax['True'].plot(xpdf, pdf, label='$f_X(x)$') +ax['True'].set_ylabel('Probability density [$V^{-1}$]') +ax['True'].set_xlabel('x bins [$V$]') +ax['True'].legend() + +# %% +# Similarly, if we want to compare histograms with different bin widths, we may +# want to use ``density=True``: + +fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained') + +# expected PDF +ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k') + +for nn, dx in enumerate([0.1, 0.4, 1.2]): + xbins = np.arange(-4, 4, dx) + # expected histogram: + ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}') + ax['False'].hist(xdata, bins=xbins, density=False, histtype='step') + + ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx) + +# Labels: +ax['False'].set_xlabel('x bins [$V$]') +ax['False'].set_ylabel('Count per bin') +ax['True'].set_ylabel('Probability density [$V^{-1}$]') +ax['True'].set_xlabel('x bins [$V$]') +ax['True'].legend(fontsize='small', title='bin width:') + +# %% +# Sometimes people want to normalize so that the sum of counts is one. This is +# analogous to a `probability mass function +# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete +# variable where the sum of probabilities for all the values equals one. Using +# ``hist``, we can get this normalization if we set the *weights* to 1/N.
+# Note that the amplitude of this normalized histogram still depends on +# width and/or number of the bins: + +fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3)) + +for nn, dx in enumerate([0.1, 0.4, 1.2]): + xbins = np.arange(-4, 4, dx) + ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)), + histtype='step', label=f'{dx}') +ax.set_xlabel('x bins [$V$]') +ax.set_ylabel('Bin count / N') +ax.legend(fontsize='small', title='bin width:') + +# %% +# The value of normalizing histograms is comparing two distributions that have +# different sized populations. Here we compare the distribution of ``xdata`` +# with a population of 1000, and ``xdata2`` with 100 members. + +xdata2 = rng.normal(size=100) + +fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']], + layout='constrained', figsize=(8, 4)) + +xbins = np.arange(-4, 4, 0.25) + +ax['no_norm'].hist(xdata, bins=xbins, histtype='step') +ax['no_norm'].hist(xdata2, bins=xbins, histtype='step') +ax['no_norm'].set_ylabel('Counts') +ax['no_norm'].set_xlabel('x bins [$V$]') +ax['no_norm'].set_title('No normalization') + +ax['density'].hist(xdata, bins=xbins, histtype='step', density=True) +ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True) +ax['density'].set_ylabel('Probability density [$V^{-1}$]') +ax['density'].set_title('Density=True') +ax['density'].set_xlabel('x bins [$V$]') + +ax['weight'].hist(xdata, bins=xbins, histtype='step', + weights=1 / len(xdata) * np.ones(len(xdata)), + label='N=1000') +ax['weight'].hist(xdata2, bins=xbins, histtype='step', + weights=1 / len(xdata2) * np.ones(len(xdata2)), + label='N=100') +ax['weight'].set_xlabel('x bins [$V$]') +ax['weight'].set_ylabel('Counts / N') +ax['weight'].legend(fontsize='small') +ax['weight'].set_title('Weight = 1/N') + +plt.show() + +# %% +# +# .. 
admonition:: References +# +# The use of the following functions, methods, classes and modules is shown +# in this example: +# +# - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist` +# - `matplotlib.axes.Axes.set_title` +# - `matplotlib.axes.Axes.set_xlabel` +# - `matplotlib.axes.Axes.set_ylabel` +# - `matplotlib.axes.Axes.legend` pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy