150 lines
8.2 KiB
Plaintext
150 lines
8.2 KiB
Plaintext
[/
|
|
Copyright (c) 2019 Nick Thompson
|
|
Use, modification and distribution are subject to the
|
|
Boost Software License, Version 1.0. (See accompanying file
|
|
LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
]
|
|
|
|
[section:anderson_darling The Anderson-Darling Test]
|
|
|
|
[heading Synopsis]
|
|
|
|
```
|
|
#include <boost/math/statistics/anderson_darling.hpp>
|
|
|
|
namespace boost { namespace math { namespace statistics {
|
|
|
|
template<class RandomAccessContainer>
|
|
auto anderson_darling_normality_statistic(RandomAccessContainer const & v,
|
|
typename RandomAccessContainer::value_type mu = std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN(),
|
|
typename RandomAccessContainer::value_type sd = std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN());
|
|
|
|
}}}
|
|
```
|
|
|
|
[heading Background]
|
|
|
|
The Anderson-Darling test for normality asks whether a given sequence of numbers is drawn from a normal distribution. It does so by computing an integral over the empirical cumulative distribution function.
|
|
The test statistic /A/[super 2] is given by
|
|
|
|
[$../graphs/anderson_darling_definition.svg]
|
|
|
|
where /F/[sub /n/] is the empirical cumulative distribution and /F/ is the CDF of the normal distribution.
|
|
|
|
The value returned by the routine is /A/[super 2].
|
|
|
|
If /A/[super 2]\/n converges to zero as /n/ goes to infinity, then the hypothesis that the data is normally distributed is supported by the test.
|
|
|
|
If /A/[super 2]\/n converges to a finite positive value as /n/ goes to infinity, then the hypothesis is not supported by the test.
|
|
|
|
An example usage is demonstrated below:
|
|
|
|
```
|
|
#include <algorithm>
#include <vector>
|
|
#include <random>
|
|
#include <iostream>
|
|
#include <boost/math/statistics/anderson_darling.hpp>
|
|
using boost::math::statistics::anderson_darling_normality_statistic;
|
|
std::random_device rd;
|
|
std::normal_distribution<double> dis(0, 1);
|
|
std::vector<double> v(8192);
|
|
for (auto & x : v) { x = dis(rd); }
|
|
std::sort(v.begin(), v.end());
|
|
double presumed_mean = 0;
|
|
double presumed_standard_deviation = 1;
|
|
double Asq = anderson_darling_normality_statistic(v, presumed_mean, presumed_standard_deviation);
|
|
std::cout << "A^2/n = " << Asq/v.size() << "\n";
|
|
5.39e-05 // should be small . . .
|
|
// Now use an incorrect hypothesis:
|
|
presumed_mean = 4;
|
|
Asq = anderson_darling_normality_statistic(v, presumed_mean, presumed_standard_deviation);
|
|
std::cout << "A^2/n = " << Asq/v.size() << "\n";
|
|
7.41 // should be somewhat large . . .
|
|
```
|
|
|
|
The Anderson-Darling normality test requires sorted data.
|
|
If the data are not sorted an exception is thrown.
|
|
|
|
If you simply wish to know whether or not data is normally distributed, and not whether it is normally distributed with a presumed mean and standard deviation,
|
|
then you can call the function without the final two arguments, and the mean and standard deviation will be estimated from the data themselves:
|
|
|
|
```
|
|
double Asq = anderson_darling_normality_statistic(v);
|
|
```
|
|
|
|
The following graph demonstrates the convergence of the test statistic.
|
|
Each data point represents a vector of length /n/ which is filled with normally distributed data.
|
|
The test statistic is computed over this vector, divided by /n/, and passed to the natural logarithm.
|
|
This exhibits the (admittedly slow) convergence of the integral to zero when the hypothesis is true.
|
|
|
|
[$../graphs/anderson_darling_simulation.svg]
|
|
|
|
|
|
[heading Performance]
|
|
|
|
```
|
|
---------------------------------------------------------------
|
|
Benchmark Time
|
|
---------------------------------------------------------------
|
|
AndersonDarlingNormalityTest<float>/8 224 ns bytes_per_second=136.509M/s
|
|
AndersonDarlingNormalityTest<float>/16 435 ns bytes_per_second=140.254M/s
|
|
AndersonDarlingNormalityTest<float>/32 898 ns bytes_per_second=135.995M/s
|
|
AndersonDarlingNormalityTest<float>/64 1773 ns bytes_per_second=137.675M/s
|
|
AndersonDarlingNormalityTest<float>/128 3455 ns bytes_per_second=141.338M/s
|
|
AndersonDarlingNormalityTest<float>/256 7001 ns bytes_per_second=139.488M/s
|
|
AndersonDarlingNormalityTest<float>/512 13996 ns bytes_per_second=139.551M/s
|
|
AndersonDarlingNormalityTest<float>/1024 28129 ns bytes_per_second=138.868M/s
|
|
AndersonDarlingNormalityTest<float>/2048 55723 ns bytes_per_second=140.206M/s
|
|
AndersonDarlingNormalityTest<float>/4096 112008 ns bytes_per_second=139.501M/s
|
|
AndersonDarlingNormalityTest<float>/8192 224643 ns bytes_per_second=139.11M/s
|
|
AndersonDarlingNormalityTest<float>/16384 450320 ns bytes_per_second=138.791M/s
|
|
AndersonDarlingNormalityTest<float>/32768 896409 ns bytes_per_second=139.45M/s
|
|
AndersonDarlingNormalityTest<float>/65536 1797800 ns bytes_per_second=139.058M/s
|
|
AndersonDarlingNormalityTest<float>/131072 3604995 ns bytes_per_second=138.698M/s
|
|
AndersonDarlingNormalityTest<float>/262144 7235625 ns bytes_per_second=138.207M/s
|
|
AndersonDarlingNormalityTest<float>/524288 14502815 ns bytes_per_second=137.904M/s
|
|
AndersonDarlingNormalityTest<float>/1048576 29058087 ns bytes_per_second=137.659M/s
|
|
AndersonDarlingNormalityTest<float>/2097152 58470439 ns bytes_per_second=136.824M/s
|
|
AndersonDarlingNormalityTest<float>/4194304 117476365 ns bytes_per_second=136.201M/s
|
|
AndersonDarlingNormalityTest<float>/8388608 239887895 ns bytes_per_second=133.397M/s
|
|
AndersonDarlingNormalityTest<float>/16777216 488787211 ns bytes_per_second=130.94M/s
|
|
AndersonDarlingNormalityTest<float>_BigO 28.96 N 28.96 N
|
|
AndersonDarlingNormalityTest<double>/8 470 ns bytes_per_second=129.733M/s
|
|
AndersonDarlingNormalityTest<double>/16 911 ns bytes_per_second=133.989M/s
|
|
AndersonDarlingNormalityTest<double>/32 1773 ns bytes_per_second=137.723M/s
|
|
AndersonDarlingNormalityTest<double>/64 3368 ns bytes_per_second=144.966M/s
|
|
AndersonDarlingNormalityTest<double>/128 6627 ns bytes_per_second=147.357M/s
|
|
AndersonDarlingNormalityTest<double>/256 12458 ns bytes_per_second=156.777M/s
|
|
AndersonDarlingNormalityTest<double>/512 23060 ns bytes_per_second=169.395M/s
|
|
AndersonDarlingNormalityTest<double>/1024 44529 ns bytes_per_second=175.45M/s
|
|
AndersonDarlingNormalityTest<double>/2048 88735 ns bytes_per_second=176.087M/s
|
|
AndersonDarlingNormalityTest<double>/4096 175583 ns bytes_per_second=177.978M/s
|
|
AndersonDarlingNormalityTest<double>/8192 348042 ns bytes_per_second=179.577M/s
|
|
AndersonDarlingNormalityTest<double>/16384 701439 ns bytes_per_second=178.206M/s
|
|
AndersonDarlingNormalityTest<double>/32768 1394597 ns bytes_per_second=179.262M/s
|
|
AndersonDarlingNormalityTest<double>/65536 2777943 ns bytes_per_second=179.994M/s
|
|
AndersonDarlingNormalityTest<double>/131072 5571455 ns bytes_per_second=179.487M/s
|
|
AndersonDarlingNormalityTest<double>/262144 11161456 ns bytes_per_second=179.193M/s
|
|
AndersonDarlingNormalityTest<double>/524288 22048950 ns bytes_per_second=181.417M/s
|
|
AndersonDarlingNormalityTest<double>/1048576 44094409 ns bytes_per_second=181.429M/s
|
|
AndersonDarlingNormalityTest<double>/2097152 88300185 ns bytes_per_second=181.199M/s
|
|
AndersonDarlingNormalityTest<double>/4194304 176140378 ns bytes_per_second=181.678M/s
|
|
AndersonDarlingNormalityTest<double>/8388608 352102955 ns bytes_per_second=181.769M/s
|
|
AndersonDarlingNormalityTest<double>/16777216 706160246 ns bytes_per_second=181.267M/s
|
|
AndersonDarlingNormalityTest<double>_BigO 42.06 N
|
|
```
|
|
|
|
[heading Caveats]
|
|
|
|
Some authors, including [@https://www.itl.nist.gov/div898/handbook/eda/section3/eda35e.htm NIST], give the following definition of the Anderson-Darling test statistic:
|
|
|
|
[$../graphs/alternative_anderson_darling_definition.svg]
|
|
|
|
This is an approximation to the quadrature sum we use as our definition.
|
|
Boost.Math /does not compute this quantity/.
|
|
(However, with a sufficiently large amount of data the two definitions seem to agree to two digits, so the importance of making a clear distinction between the two is unclear.)
|
|
Our computation of the Anderson-Darling test statistic agrees with Mathematica.
|
|
|
|
[endsect]
|
|
[/section:anderson_darling]
|