In [1]:
%matplotlib inline
from scipy.stats import expon
import numpy as np
from scipy.special import kolmogorov
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (10, 6)

def show(sample):
    return ", ".join("{:.4f}".format(x) for x in sample)

Intro

The exponential distribution with parameter $\lambda > 0$ is the distribution given by $$\mathbb{P}( X \in [a, b]) = \int_a^b \lambda e^{-\lambda x} \, dx \quad \text{ for } 0 \le a \le b$$

For each $\lambda$, this defines a different distribution on the set of non-negative real numbers.

Its distribution function is given by $$F(x) = \mathbb{P} ( X \le x) = \int_0^x \lambda e^{-\lambda s} \, ds = 1 - e^{-\lambda x} \quad \text{ for } x \ge 0$$
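
As a quick sanity check, SciPy's expon (with its default scale=1, i.e. $\lambda = 1$) should match this formula; a minimal sketch:

In [ ]:
# expon.cdf should agree with 1 - exp(-x) for lambda = 1
xs = np.linspace(0, 5, 6)
print(np.allclose(expon.cdf(xs), 1 - np.exp(-xs)))  # expect: True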

In [2]:
# take 100 samples from the exponential distribution with lambda = 1
distro = expon
sample = distro.rvs(size=100)

def empirical_cdf(sample, plotting=True):
    N = len(sample)
    rng = max(sample) - min(sample)
    if plotting:
        # pad a third of the range on each side so the flat tails
        # (y = 0 on the left, y = 1 on the right) show up in the step plot
        xs = np.concatenate([np.array([min(sample)-rng/3]), np.sort(sample), np.array([max(sample)+rng/3])])
        ys = np.append(np.arange(N+1)/N, 1)
    else:
        # just the jump points: ys[i] is the fraction of the sample <= xs[i]
        xs = np.sort(sample)
        ys = np.arange(1, N+1)/N
    return (xs, ys)
In [3]:
subsample = sample[:2]
print(subsample)
plt.title(r"Plot of $y = \mathrm{emp}(" + show(subsample) + ")(x)$")
plt.step(*empirical_cdf(subsample), where="post")
[ 0.17347635  0.49504531]
[figure: step plot of the empirical CDF of the two-point subsample]
In [4]:
subsample = sample[:3]
print(subsample)
plt.title(r"Plot of $y = \mathrm{emp}(" + show(subsample) + ")(x)$")
plt.step(*empirical_cdf(subsample), where="post")
[ 0.17347635  0.49504531  0.59395951]
[figure: step plot of the empirical CDF of the three-point subsample]
In [5]:
subsample = sample[:4]
print(subsample)
plt.title(r"Plot of $y = \mathrm{emp}(" + show(subsample) + ")(x)$")
plt.step(*empirical_cdf(subsample), where="post")
[ 0.17347635  0.49504531  0.59395951  0.45773609]
[figure: step plot of the empirical CDF of the four-point subsample]
In [6]:
xs, ys = empirical_cdf(sample)
xs_ = np.linspace(min(xs), max(xs), 1000)
plt.plot(xs_, distro.cdf(xs_))
plt.step(xs, ys, where="post")
[figure: empirical CDF of the full sample (steps) overlaid on the exponential CDF (smooth curve)]

Write $\mathrm{emp}(x_1, \dots x_n) (x)$ for the empirical distribution function:

$$ \mathrm{emp}(x_1, \dots x_n) (x) = \frac{1}{n} \sum_{i=1}^n \mathbb{1}_{x_i \le x}$$
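
The definition transcribes directly into code; emp below is a throwaway helper (not used elsewhere in this notebook):

In [ ]:
def emp(sample, x):
    # fraction of the sample points that are <= x
    return np.mean(np.asarray(sample) <= x)

print(emp([0.2, 0.5, 0.6], 0.55))  # 2 of 3 points are <= 0.55, so 0.6667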

Let $X_1, X_2, \dots$ be i.i.d. random variables with distribution function $F$, and let $x \in \mathbb{R}$.

The strong law of large numbers: $$ \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \to 0 \quad \text{ a. s. } $$

The central limit theorem: $$ \sqrt{n} \left( \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \right) \Rightarrow N(0, \sigma^2) \quad \text{ with } \sigma^2 = F(x)(1 - F(x)) $$

The Glivenko-Cantelli theorem: $$ \sup_{x} \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| \to 0 \quad \text{ a. s. }$$

Kolmogorov's theorem: $$ \sqrt{n} \left( \sup_x \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| \right) \Rightarrow ? $$ (for continuous $F$)
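
A quick empirical illustration of Glivenko-Cantelli, reusing empirical_cdf from above (the printed distances vary from run to run, but should shrink as $n$ grows):

In [ ]:
for n in [10, 100, 1000, 10000]:
    xs, ys = empirical_cdf(expon.rvs(size=n), plotting=False)
    print(n, max(np.abs(ys - expon.cdf(xs))))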

In what follows, $F$ is continuous and strictly increasing (so that $F^{-1}$ exists).

Lemma (the probability integral transform): $F(X)$ is uniformly distributed on $[0, 1]$: $$\mathbb{P} ( F(X) \leq u ) = \mathbb{P} ( X \leq F^{-1} (u) ) = F(F^{-1}(u)) = u \quad \text{ for } 0 \le u \le 1 $$
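
An empirical check of the lemma: pushing exponential draws through their own CDF should produce something that looks uniform on $[0, 1]$:

In [ ]:
u = expon.cdf(expon.rvs(size=100000))
# for a uniform sample: min ~ 0, mean ~ 0.5, max ~ 1
print("{:.3f} {:.3f} {:.3f}".format(u.min(), u.mean(), u.max()))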

Definition (the Kolmogorov-Smirnov statistic):

$$ KS(X_1, \dots X_n) = \sqrt{n} \left( \sup_{x \in \mathbb{R}} \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| \right)$$

It is a random variable and its distribution is called the Kolmogorov distribution, $K_n$.

Theorem: the distribution $K_n$ does not depend on the (continuous) distribution of the $X_i$.

Proof: substitute $x = F^{-1}(u)$; since the $F(X_i)$ are i.i.d. uniform by the lemma, the resulting expression has the same distribution whatever $F$ is:

\begin{align} \sup_{x \in \mathbb{R}} \big| \mathrm{emp}(X_1, \dots X_n) (x) - F(x) \big| &= \sup_{u \in [0,1]} \big| \mathrm{emp}(X_1, \dots X_n) (F^{-1}(u)) - F(F^{-1}(u)) \big| \\ &= \sup_{u \in [0,1]} \bigg| \frac{1}{n} \sum_{i=1}^n \mathbb{1}_{X_i \le F^{-1}(u)} - F(F^{-1}(u)) \bigg| \\ &= \sup_{u \in [0,1]} \bigg| \frac{1}{n} \sum_{i=1}^n \mathbb{1}_{F(X_i) \le u} - F(F^{-1}(u)) \bigg| \\ &= \sup_{u \in [0,1]} \big| \mathrm{emp}(F(X_1), \dots, F(X_n)) (u) - F(F^{-1}(u)) \big| \\ &= \sup_{u \in [0,1]} \big| \mathrm{emp}(F(X_1), \dots, F(X_n)) (u) - u \big| \\ \end{align}
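
In code: the sup-distance of the sample against $F$ and the sup-distance of the transformed sample $F(X_i)$ against the uniform CDF $u \mapsto u$ are exactly equal:

In [ ]:
s = expon.rvs(size=50)
xs, ys = empirical_cdf(s, plotting=False)
us, vs = empirical_cdf(expon.cdf(s), plotting=False)
d1 = max(np.abs(ys - expon.cdf(xs)))  # distance against F
d2 = max(np.abs(vs - us))             # distance against the uniform CDF, u -> u
print(np.isclose(d1, d2))  # expect: True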

Theorem (Kolmogorov): The limiting distribution $K_\infty$ (in the weak sense) of $K_n$ has CDF: $$\mathbb{P}(K_\infty \leq x)=1-2\sum_{k=1}^\infty (-1)^{k-1} e^{-2k^2 x^2}$$

Proof: the modern proof uses a stronger form of the central limit theorem (Donsker's theorem); Kolmogorov's original 1933 proof is more elementary.
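
SciPy's kolmogorov is the survival function $\mathbb{P}(K_\infty > x)$, so truncating the series above should agree with 1 - kolmogorov(x):

In [ ]:
x = 1.0
series = 1 - 2 * sum((-1)**(k - 1) * np.exp(-2 * k**2 * x**2) for k in range(1, 100))
print(np.isclose(series, 1 - kolmogorov(x)))  # expect: True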

In [7]:
xs = np.linspace(-1, 3, 10000)
plt.title("PDF of the Kolmogorov distribution")
# the CDF is 1 - kolmogorov(x); differentiate it numerically to get the PDF
plt.plot(xs, np.gradient(1 - kolmogorov(xs), xs))
[figure: PDF of the Kolmogorov distribution]

Suppose you draw 1000 samples from an exponential($\lambda = 1$) distribution, compute the empirical CDF, find the furthest distance from $1 - e^{-x}$, and multiply by $\sqrt{1000}$. The resulting quantity is a draw from $K_{1000}$.

Now suppose that your hypothesis is that you are drawing from an exponential($\lambda = 1$) distribution, but you are not certain. You calculate the Kolmogorov-Smirnov statistic and see whether it is a likely outcome of a $K_{1000}$ distribution.

There is no simple closed form for $K_{1000}$, but for $n$ this large $K_n$ is well approximated by $K_\infty$, so we can treat the statistic as a draw from $K_\infty$.

In [8]:
N = 1000
sample = expon.rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
# sup-distance evaluated at the tops of the jumps; the exact sup would also
# compare F(x_i) against the left limits (i-1)/N, but the difference is negligible here
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))

print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
# kolmogorov(x) is the survival function P(K_inf > x), i.e. the p-value
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Kolmogorov-Smirnov statistic: 0.7726
p-value: 0.5893
In [9]:
N = 1000
sample = expon.rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))

print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Kolmogorov-Smirnov statistic: 0.5151
p-value: 0.9535
In [10]:
N = 1000
sample = expon.rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))

print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Kolmogorov-Smirnov statistic: 0.7366
p-value: 0.6497
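
For reference, SciPy packages this whole procedure as scipy.stats.kstest; note its statistic is the unscaled sup-distance $D_n$ (no $\sqrt{n}$ factor), while the p-value should be close to the one computed above:

In [ ]:
from scipy.stats import kstest
print(kstest(expon.rvs(size=1000), expon.cdf))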

Now change the distribution

Sample from an exponential distribution with $\lambda = 1.2$, test against the hypothesis that $\lambda = 1$, and see what happens.
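
One wrinkle: SciPy parameterizes expon by scale $= 1/\lambda$, so $\lambda = 1.2$ corresponds to expon(scale=1/1.2):

In [ ]:
# expon(scale=1/1.2).cdf(x) should equal 1 - exp(-1.2 x)
print(np.isclose(expon(scale=1/1.2).cdf(1.0), 1 - np.exp(-1.2)))  # expect: True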

In [11]:
N = 1000
sample = expon(scale=1/1.2).rvs(size=N)  # scale = 1/lambda
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))

print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Kolmogorov-Smirnov statistic: 3.4215
p-value: 0.0000
In [12]:
N = 1000
sample = expon(scale=1/1.2).rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))

print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Kolmogorov-Smirnov statistic: 2.9100
p-value: 0.0000
In [13]:
N = 1000
sample = expon(scale=1/1.2).rvs(size=N)
xs, ys = empirical_cdf(sample, plotting=False)
k_s_statistic = np.sqrt(N) * max(np.abs(ys - expon.cdf(xs)))

print("Kolmogorov-Smirnov statistic: {:.4f}".format(k_s_statistic))
print("p-value: {:.4f}".format(kolmogorov(k_s_statistic)))
Kolmogorov-Smirnov statistic: 2.2681
p-value: 0.0001

Conclusion

The Kolmogorov-Smirnov statistic can be used to detect that your sample is not coming from the distribution you think it is.