{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# GoF from binned cost functions\n", "\n", "The builtin cost functions for binned data, `BinnedNLL` and `ExtendedBinnedNLL` have a minimum value which is asymptotically chi2-distributed and thus can be used as a goodness-of-fit statistic. This example shows, that one still needs a large number of entries in each bin to reach the asymptotic regime." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%config InlineBackend.figure_formats = ['svg']\n", "from iminuit import Minuit\n", "from iminuit.cost import BinnedNLL, ExtendedBinnedNLL\n", "import numpy as np\n", "from numba_stats import norm, expon\n", "import matplotlib.pyplot as plt\n", "import joblib\n", "from scipy.stats import chi2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate(n, seed):\n", " rng = np.random.default_rng(seed)\n", " s = rng.normal(1, 0.1, size=rng.poisson(n))\n", " b = rng.exponential(size=rng.poisson(n))\n", " x = np.append(s, b)\n", " return x[(x > 0) & (x < 2)]\n", "\n", "\n", "x = generate(1000, 1)\n", "plt.hist(x, bins=20, range=(0, 2));" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "@joblib.delayed\n", "def run(n, seed):\n", " x = generate(n, seed)\n", " xrange = (0, 2)\n", " w, xe = np.histogram(x, bins=20, range=xrange)\n", "\n", " def model1(x, z, mu, sigma, tau):\n", " return z * norm.cdf(x, mu, sigma) / np.diff(norm.cdf(xrange, mu, sigma)) + (\n", " 1 - z\n", " ) * expon.cdf(x, 0, tau) / np.diff(expon.cdf(xrange, 0, tau))\n", "\n", " def model2(x, s, b, mu, sigma, tau):\n", " return s * n * norm.cdf(x, mu, sigma) + b * n * expon.cdf(x, 0, tau)\n", "\n", " m = [\n", " Minuit(BinnedNLL(w, xe, model1), z=0.5, mu=0.5, sigma=0.5, tau=0.5),\n", " Minuit(ExtendedBinnedNLL(w, xe, model2), s=1, b=1, mu=0.5, sigma=0.5, tau=0.5),\n", " ]\n", " for mi in m:\n", " mi.limits[\"mu\"] = (0, 2)\n", " mi.limits[\"sigma\", \"tau\"] = (0.1, None)\n", " m[0].limits[\"z\"] = (0, 1)\n", " m[1].limits[\"s\", \"b\"] = (0, None)\n", " r = []\n", " for mi in m:\n", " mi.migrad()\n", " if mi.valid:\n", " pvalue = chi2(mi.fcn._fcn.ndata - mi.nfit).sf(mi.fval)\n", " r.append(pvalue)\n", " else:\n", " r.append(np.nan)\n", " return r\n", "\n", "\n", "pvalues = {}\n", "for n in (20, 100, 1000, 10000):\n", " pvalues[n] = np.array(joblib.Parallel(-1)(run(n, i) for i in range(500)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(2, 4, figsize=(8, 4), constrained_layout=True)\n", "for i, (ni, vi) in enumerate(pvalues.items()):\n", " ax[0, i].hist(vi[:, 0])\n", " ax[1, i].hist(vi[:, 1])\n", " ax[0, i].set_title(\n", " f\"n = {ni}, failed fits = {np.sum(np.isnan(vi[:, 0]))}\", fontsize=\"small\"\n", " )\n", " ax[1, i].set_title(\n", " f\"n = {ni}, failed fits = {np.sum(np.isnan(vi[:, 1]))}\", fontsize=\"small\"\n", " )\n", "fig.supxlabel(\"pvalue\");" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The top row shows the p-values of the test statistic for `BinnedNLL` and the bottom row for `ExtendedBinnedNLL`. If the test statistic was perfectly chi-square-distributed, the p-value distribution should be uniform." ] } ], "metadata": { "interpreter": { "hash": "bdbf20ff2e92a3ae3002db8b02bd1dd1b287e934c884beb29a73dced9dbd0fa3" }, "kernelspec": { "display_name": "Python 3.8.12 ('venv': venv)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }