%matplotlib inline
from scipy.stats import expon
import numpy as np
from scipy.special import kolmogorov
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 6)
def show(sample):
    return ", ".join("{:.4f}".format(x) for x in sample)
The exponential distribution (with parameter $\lambda > 0$) is the distribution given by $$\mathbb{P}( X \in [a, b]) = \int_a^b \lambda e ^{- \lambda x} \, dx $$
For each $\lambda$, this defines a different distribution on the set of non-negative real numbers.
Its distribution function is given by $$F(x) = \mathbb{P} ( X \in (-\infty, x]) = \int_{0}^x \lambda e^{-\lambda s} \, ds = 1 - e^{-\lambda x} \quad \text{ for } x \ge 0$$
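As a quick sanity check (a minimal sketch; the grid of test points is arbitrary), the formula matches scipy's built-in exponential CDF with $\lambda = 1$:
# 1 - exp(-x) should match scipy's expon.cdf for lambda = 1
xs_check = np.linspace(0, 5, 6)
print(show(1 - np.exp(-xs_check)))
print(show(expon.cdf(xs_check)))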
# take 100 samples from the exponential distribution with lambda = 1
distro = expon
sample = distro.rvs(size=100)
def empirical_cdf(sample, plotting=True):
    """Return the jump points and step heights of the empirical CDF."""
    N = len(sample)
    rng = max(sample) - min(sample)
    if plotting:
        # pad the range on both sides so the flat tails of the step plot are visible
        xs = np.concatenate([np.array([min(sample) - rng/3]), np.sort(sample), np.array([max(sample) + rng/3])])
        ys = np.append(np.arange(N+1)/N, 1)
    else:
        xs = np.sort(sample)
        ys = np.arange(1, N+1)/N
    return (xs, ys)
subsample = sample[:2]
print(subsample)
plt.title("Plot of $y = \mathrm{emp}("+show(subsample)+")(x)$")
plt.step(*empirical_cdf(subsample), where="post")
subsample = sample[:3]
print(subsample)
plt.title("Plot of $y = \mathrm{emp}("+show(subsample)+")(x)$")
plt.step(*empirical_cdf(subsample), where="post")
subsample = sample[:4]
print(subsample)
plt.title("Plot of $y = \mathrm{emp}("+show(subsample)+")(x)$")
plt.step(*empirical_cdf(subsample), where="post")
xs, ys = empirical_cdf(sample)
xs_ = np.linspace(min(xs), max(xs), 1000)
plt.plot(xs_, distro.cdf(xs_))
plt.step(xs, ys, where="post")
Write $\mathrm{emp}(x_1, \dots x_n) (x)$ for the empirical distribution function:
$$ \mathrm{emp}(x_1, \dots x_n) (x) = \frac{1}{n} \sum_{i=1}^n \mathbb{1}_{x_i \le x}$$
Let $X_i$ be i.i.d. random variables with distribution function $F$, and fix $x \in \mathbb{R}$.
The strong law of large numbers: $$ \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \to 0 \quad \text{ a.s. } $$ The central limit theorem (here $\sigma^2 = F(x)(1 - F(x))$, the variance of the Bernoulli indicator $\mathbb{1}_{X_i \le x}$): $$ \sqrt{n} \left( \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \right) \Rightarrow N(0, \sigma^2) $$ The Glivenko-Cantelli theorem: $$ \sup_{x} \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| \to 0 \quad \text{ a.s. }$$ Kolmogorov's theorem (for continuous $F$): $$ \sqrt{n} \left( \sup_x \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| \right) \Rightarrow \ ? $$
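A quick numerical illustration of Glivenko-Cantelli (a sketch; the sample sizes are arbitrary): the supremum distance shrinks as the sample grows.
# the sup-distance between empirical and true CDF shrinks as n grows
for n in [10, 100, 1000, 10000]:
    s = np.sort(expon.rvs(size=n))
    print("n = {:5d}: sup distance = {:.4f}".format(n, max(np.abs(np.arange(1, n + 1) / n - expon.cdf(s)))))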
In what follows, assume $F$ is continuous and strictly increasing (hence invertible).
Lemma (the probability integral transform): $F(X)$ is uniformly distributed: $$\mathbb{P} ( F(X) \leq u ) = \mathbb{P} ( X \leq F^{-1} (u) ) = F(F^{-1}(u)) = u \quad \text{ for } 0 \le u \le 1 $$
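A quick empirical check of the lemma (a sketch; the sample size is arbitrary): pushing exponential draws through their own CDF should give a flat histogram.
# F(X) should be approximately uniform on [0, 1]
u = expon.cdf(expon.rvs(size=10000))
plt.title("Histogram of $F(X)$ for exponential $X$")
plt.hist(u, bins=20, density=True)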
Definition (the Kolmogorov-Smirnov statistic):
$$ KS(X_1, \dots X_n) = \sqrt{n} \left( \sup_{x \in \mathbb{R}} \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| \right)$$
It is a random variable, and its distribution is called the Kolmogorov distribution, $K_n$.
Theorem: $K_n$ does not depend on the distribution of $X_i$.
Proof: \begin{align} \sup_{x \in \mathbb{R}} \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| &= \sup_{u \in [0,1]} \big| \mathrm{emp}(X_1, \dots X_n) (F^{-1}(u)) - F(F^{-1}(u)) \big| \\ &= \sup_{u \in [0,1]} \bigg| \frac{1}{n} \sum_{i=1}^n \mathbb{1}_{X_i \le F^{-1}(u)} - F(F^{-1}(u)) \bigg| \\ &= \sup_{u \in [0,1]} \bigg| \frac{1}{n} \sum_{i=1}^n \mathbb{1}_{F(X_i) \le u} - F(F^{-1}(u)) \bigg| \\ &= \sup_{u \in [0,1]} \big| \mathrm{emp}(F(X_1), \dots, F(X_n)) (u) - F(F^{-1}(u)) \big| \\ &= \sup_{u \in [0,1]} \big| \mathrm{emp}(F(X_1), \dots, F(X_n)) (u) - u \big| \end{align} By the lemma, the $F(X_i)$ are i.i.d. uniform on $[0,1]$, so the right-hand side has the same distribution whatever $F$ is. $\square$
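To see the theorem in action, here is a small Monte Carlo sketch (the sample size $n = 50$, the repetition count, and the normal comparison distribution are all arbitrary choices): statistics computed from exponential samples and from normal samples, each tested against its own true CDF, should have matching histograms.
from scipy.stats import norm

# K_n is distribution-free: the statistic has the same law whether we sample
# from an exponential or a normal, as long as we test against the true CDF
def ks_stat(sample, cdf):
    n = len(sample)
    return np.sqrt(n) * max(np.abs(np.arange(1, n + 1) / n - cdf(np.sort(sample))))

n, reps = 50, 2000
ks_exp = [ks_stat(expon.rvs(size=n), expon.cdf) for _ in range(reps)]
ks_norm = [ks_stat(norm.rvs(size=n), norm.cdf) for _ in range(reps)]
plt.title("$K_{50}$ from exponential vs. normal samples")
plt.hist(ks_exp, bins=30, density=True, alpha=0.5, label="exponential")
plt.hist(ks_norm, bins=30, density=True, alpha=0.5, label="normal")
plt.legend()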
Theorem (Kolmogorov): The limiting distribution $K_\infty$ (in the weak sense) of $K_n$ has CDF $$\mathbb{P}(K_\infty \leq x)=1-2\sum_{k=1}^\infty (-1)^{k-1} e^{-2k^2 x^2}$$ Proof: it involves a stronger form of the central limit theorem (Donsker's theorem); Kolmogorov's original 1933 proof is more elementary.
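We can check this series against scipy (a sketch; truncating at 100 terms is an arbitrary but safe cutoff given the rapid $e^{-2k^2x^2}$ decay):
# the truncated series should match 1 - kolmogorov(x),
# since scipy.special.kolmogorov returns the survival function P(K > x)
def kolmogorov_cdf(x, terms=100):
    k = np.arange(1, terms + 1)
    return 1 - 2 * np.sum((-1) ** (k - 1) * np.exp(-2 * k**2 * x**2))

for x in [0.5, 1.0, 1.5, 2.0]:
    print("x = {}: series = {:.6f}, scipy = {:.6f}".format(x, kolmogorov_cdf(x), 1 - kolmogorov(x)))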
xs = np.linspace(-1, 3, 10000)
# scipy.special.kolmogorov gives the survival function, so the CDF is 1 - kolmogorov(x);
# differentiate numerically to get the PDF (scipy.misc.derivative is removed in recent SciPy)
plt.title("PDF of the Kolmogorov distribution")
plt.plot(xs, np.gradient(1 - kolmogorov(xs), xs))
Suppose you draw 1000 samples from an exponential($\lambda = 1$) distribution, compute the empirical CDF, find the largest distance from $1 - e^{-x}$, and scale by $\sqrt{1000}$. The resulting quantity is a draw from $K_{1000}$.
Now suppose that your hypothesis is that you are drawing from an exponential($\lambda = 1$) distribution, but you are not certain. You compute the Kolmogorov-Smirnov statistic and see whether it is a likely outcome under the $K_{1000}$ distribution.
There is no simple closed form for $K_{1000}$, but $1000 \approx \infty$, so we can pretend it's a draw from $K_\infty$.
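We can test this approximation directly (a sketch; 2000 replications is an arbitrary choice), reusing the ks_stat helper defined above:
# empirical CDF of simulated K_1000 draws vs the limiting Kolmogorov CDF
n, reps = 1000, 2000
stats = np.array([ks_stat(expon.rvs(size=n), expon.cdf) for _ in range(reps)])
grid = np.linspace(0.2, 2.5, 200)
plt.title("Simulated $K_{1000}$ vs. $K_\\infty$")
plt.plot(grid, 1 - kolmogorov(grid), label="$K_\\infty$ CDF")
plt.step(*empirical_cdf(stats, plotting=False), where="post", label="simulated")
plt.legend()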
N = 1000
sample = expon.rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
# compare the step heights i/N with F at the jump points
# (the exact sup also checks (i-1)/N, which differs by at most 1/N)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))
print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
N = 1000
sample = expon.rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))
print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
N = 1000
sample = expon.rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))
print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Now sample from an exponential($\lambda = 1.2$) distribution (note that scipy parametrizes by the scale $1/\lambda$), test against the hypothesis that $\lambda = 1$, and see what happens.
N = 1000
sample = expon(scale=1/1.2).rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))
print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
N = 1000
sample = expon(scale=1/1.2).rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))
print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
N = 1000
sample = expon(scale=1/1.2).rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))
print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
The Kolmogorov-Smirnov statistic can thus be used to detect that your sample is not coming from the distribution you think it is.
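To quantify that claim, here is a rough sketch (the 5% level and 200 repetitions are arbitrary choices) estimating how often the test rejects under the true and the wrong parameter, reusing ks_stat from above:
# rejection rate at the 5% level: about 5% under the null, near 100% under lambda = 1.2
def p_value(dist, n=1000):
    return kolmogorov(ks_stat(dist.rvs(size=n), expon.cdf))

reps = 200
for name, dist in [("lambda = 1.0", expon), ("lambda = 1.2", expon(scale=1/1.2))]:
    rejections = sum(p_value(dist) < 0.05 for _ in range(reps))
    print("{}: rejected {} of {} samples".format(name, rejections, reps))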