Feature
Proposal:
I propose promoting the KDE statistics recipe to be an actual part of the statistics module.
See: https://docs.python.org/3.13/library/statistics.html#kernel-density-estimation
My thought is that it is an essential part of statistical reasoning about sample data.
See: https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf
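For readers who have not used the technique, a kernel density estimate just centers a scaled kernel on each observation and averages the results. A minimal sketch of that idea, using statistics.NormalDist as the Gaussian kernel (the helper name kde_gauss and the printed value are illustrative only):

from statistics import NormalDist

def kde_gauss(sample, h):
    "Average of normal bumps, one centered on each data point, scaled by h."
    kernel = NormalDist(0.0, h).pdf
    n = len(sample)
    return lambda x: sum(kernel(x - x_i) for x_i in sample) / n

f_hat = kde_gauss([-2.1, -1.3, -0.4, 1.9, 5.1, 6.2], h=1.5)
print(round(f_hat(-1.0), 3))    # about 0.125 for this sample and bandwidth

This is essentially the recipe currently shown in the statistics documentation; the working draft below generalizes it to other kernels and adds a fast path for kernels with finite support.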
Discussion of this feature:
This was discussed and reviewed with the module author, who gave it his full support:

This looks really great, both as a concept and the code itself. It's a good showcase for match, and an important statistics function. Thank you for working on this.
Working Draft:
from typing import Sequence, Callable
from bisect import bisect_left, bisect_right
from math import sqrt, exp, cos, pi, cosh


def kde(sample: Sequence[float], h: float, kernel_function: str='gauss',
        ) -> Callable[[float], float]:
    """Kernel Density Estimation.

    Creates a continuous probability density function from discrete samples.

    The basic idea is to smooth the data using a kernel function
    to help draw inferences about a population from a sample.

    The degree of smoothing is controlled by the scaling parameter h
    which is called the bandwidth.  Smaller values emphasize local
    features while larger values give smoother results.

    Kernel functions that give some weight to every sample point:

        gauss or normal
        logistic
        sigmoid

    Kernel functions that only give weight to sample points within the bandwidth:

        rectangular or uniform
        triangular
        epanechnikov or parabolic
        biweight or quartic
        triweight
        cosine

    Example
    -------

    Given a sample of six data points, estimate the probability density at
    evenly spaced points from -6 to 10:

        >>> sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
        >>> f_hat = kde(sample, h=1.5)
        >>> total = 0.0
        >>> for x in range(-6, 11):
        ...     density = f_hat(x)
        ...     total += density
        ...     plot = ' ' * int(density * 400) + 'x'
        ...     print(f'{x:2}: {density:.3f} {plot}')
        ...
        -6: 0.002 x
        -5: 0.009    x
        -4: 0.031             x
        -3: 0.070                             x
        -2: 0.111                                             x
        -1: 0.125                                                   x
         0: 0.110                                            x
         1: 0.086                                   x
         2: 0.068                            x
         3: 0.059                        x
         4: 0.066                           x
         5: 0.082                                 x
         6: 0.082                                 x
         7: 0.058                        x
         8: 0.028            x
         9: 0.009    x
        10: 0.002 x
        >>> round(total, 3)
        0.999

    References
    ----------

    Kernel density estimation and its application:
    https://www.itm-conferences.org/articles/itmconf/pdf/2018/08/itmconf_sam2018_00037.pdf

    Kernel functions in common use:
    https://en.wikipedia.org/wiki/Kernel_(statistics)#Kernel_functions_in_common_use

    Interactive graphical demonstration and exploration:
    https://demonstrations.wolfram.com/KernelDensityEstimation/

    """
    kernel: Callable[[float], float]
    support: float | None

    match kernel_function:

        case 'gauss' | 'normal':
            c = 1 / sqrt(2 * pi)
            kernel = lambda t: c * exp(-1/2 * t * t)
            support = None

        case 'logistic':
            # 1.0 / (exp(t) + 2.0 + exp(-t))
            kernel = lambda t: 1/2 / (1.0 + cosh(t))
            support = None

        case 'sigmoid':
            # (2/pi) / (exp(t) + exp(-t))
            c = 1 / pi
            kernel = lambda t: c / cosh(t)
            support = None

        case 'rectangular' | 'uniform':
            kernel = lambda t: 1/2
            support = 1.0

        case 'triangular':
            kernel = lambda t: 1.0 - abs(t)
            support = 1.0

        case 'epanechnikov' | 'parabolic':
            kernel = lambda t: 3/4 * (1.0 - t * t)
            support = 1.0

        case 'biweight' | 'quartic':
            kernel = lambda t: 15/16 * (1.0 - t * t) ** 2
            support = 1.0

        case 'triweight':
            kernel = lambda t: 35/32 * (1.0 - t * t) ** 3
            support = 1.0

        case 'cosine':
            c1 = pi / 4
            c2 = pi / 2
            kernel = lambda t: c1 * cos(c2 * t)
            support = 1.0

        case _:
            raise ValueError(f'Unknown kernel function: {kernel_function!r}')

    n = len(sample)

    if support is None:

        def pdf(x: float) -> float:
            return sum(kernel((x - x_i) / h) for x_i in sample) / (n * h)

    else:

        # Only sample points within the kernel's support can contribute,
        # so sort once and use bisection to restrict the summation.
        sample = sorted(sample)
        bandwidth = h * support

        def pdf(x: float) -> float:
            i = bisect_left(sample, x - bandwidth)
            j = bisect_right(sample, x + bandwidth)
            supported = sample[i : j]
            return sum(kernel((x - x_i) / h) for x_i in supported) / (n * h)

    return pdf


def _test() -> None:

    from statistics import NormalDist

    def kde_normal(sample: Sequence[float], h: float) -> Callable[[float], float]:
        "Create a continuous probability density function from a sample."
        # Smooth the sample with a normal distribution kernel scaled by h.
        kernel_h = NormalDist(0.0, h).pdf
        n = len(sample)
        def pdf(x: float) -> float:
            return sum(kernel_h(x - x_i) for x_i in sample) / n
        return pdf

    D = NormalDist(250, 50)
    data = D.samples(100)
    h = 30

    pd = D.pdf
    fg = kde(data, h, 'gauss')
    fg2 = kde_normal(data, h)
    fr = kde(data, h, 'rectangular')
    ft = kde(data, h, 'triangular')
    fb = kde(data, h, 'biweight')
    fe = kde(data, h, 'epanechnikov')
    fc = kde(data, h, 'cosine')
    fl = kde(data, h, 'logistic')
    fs = kde(data, h, 'sigmoid')

    def show(x: float) -> None:
        for func in (pd, fg, fg2, fr, ft, fb, fe, fc, fl, fs):
            print(func(x))
        print()

    show(197)
    show(255)
    show(320)

    def integrate(func: Callable[[float], float], low: float, high: float,
                  steps: int=1000) -> float:
        width = (high - low) / steps
        xarr = (low + width * i for i in range(steps))
        return sum(map(func, xarr)) * width

    print('\nIntegrals:')
    for func in (pd, fg, fg2, fr, ft, fb, fe, fc, fl, fs):
        print(integrate(func, 0, 500, steps=10_000))


if __name__ == '__main__':
    import doctest
    _test()
    print(doctest.testmod())
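A quick usage note on the finite-support branch: because the sample is sorted once and bisected per evaluation, kernels such as 'triangular' only touch the sample points that fall within h of x, and the estimate is exactly zero wherever no point is that close. A small illustrative check, assuming the kde draft above is in scope (the sample and bandwidth are taken from the doctest):

sample = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
f_tri = kde(sample, h=1.5, kernel_function='triangular')

print(f_tri(3.5))              # 0.0 -- no sample point lies within h of x = 3.5
print(round(f_tri(-1.0), 3))   # 0.185 -- three points fall inside the window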