GoF from binned cost functions
The builtin cost functions for binned data, BinnedNLL
and ExtendedBinnedNLL
have a minimum value which is asymptotically chi2-distributed and thus can be used as a goodness-of-fit statistic. This example shows, that one still needs a large number of entries in each bin to reach the asymptotic regime.
[1]:
%config InlineBackend.figure_formats = ['svg']
from iminuit import Minuit
from iminuit.cost import BinnedNLL, ExtendedBinnedNLL
import numpy as np
from numba_stats import norm, expon
import matplotlib.pyplot as plt
import joblib
from scipy.stats import chi2
[2]:
def generate(n, seed):
rng = np.random.default_rng(seed)
s = rng.normal(1, 0.1, size=rng.poisson(n))
b = rng.exponential(size=rng.poisson(n))
x = np.append(s, b)
return x[(x > 0) & (x < 2)]
x = generate(1000, 1)
plt.hist(x, bins=20, range=(0, 2));
[3]:
@joblib.delayed
def run(n, seed):
x = generate(n, seed)
xrange = (0, 2)
w, xe = np.histogram(x, bins=20, range=xrange)
def model1(x, z, mu, sigma, tau):
return z * norm.cdf(x, mu, sigma) / np.diff(norm.cdf(xrange, mu, sigma)) + (
1 - z
) * expon.cdf(x, 0, tau) / np.diff(expon.cdf(xrange, 0, tau))
def model2(x, s, b, mu, sigma, tau):
return s * n * norm.cdf(x, mu, sigma) + b * n * expon.cdf(x, 0, tau)
m = [
Minuit(BinnedNLL(w, xe, model1), z=0.5, mu=0.5, sigma=0.5, tau=0.5),
Minuit(ExtendedBinnedNLL(w, xe, model2), s=1, b=1, mu=0.5, sigma=0.5, tau=0.5),
]
for mi in m:
mi.limits["mu"] = (0, 2)
mi.limits["sigma", "tau"] = (0.1, None)
m[0].limits["z"] = (0, 1)
m[1].limits["s", "b"] = (0, None)
r = []
for mi in m:
mi.migrad()
if mi.valid:
pvalue = chi2(mi.fcn._fcn.ndata - mi.nfit).sf(mi.fval)
r.append(pvalue)
else:
r.append(np.nan)
return r
pvalues = {}
for n in (20, 100, 1000, 10000):
pvalues[n] = np.array(joblib.Parallel(-1)(run(n, i) for i in range(500)))
[4]:
fig, ax = plt.subplots(2, 4, figsize=(8, 4), constrained_layout=True)
for i, (ni, vi) in enumerate(pvalues.items()):
ax[0, i].hist(vi[:, 0])
ax[1, i].hist(vi[:, 1])
ax[0, i].set_title(
f"n = {ni}, failed fits = {np.sum(np.isnan(vi[:, 0]))}", fontsize="small"
)
ax[1, i].set_title(
f"n = {ni}, failed fits = {np.sum(np.isnan(vi[:, 1]))}", fontsize="small"
)
fig.supxlabel("pvalue");
The top row shows the p-values of the test statistic for BinnedNLL
and the bottom row for ExtendedBinnedNLL
. If the test statistic was perfectly chi-square-distributed, the p-value distribution should be uniform.