365 lines
15 KiB
C++
365 lines
15 KiB
C++
// geometric_examples.cpp
|
|
|
|
// Copyright Paul A. Bristow 2010.
|
|
|
|
// Use, modification and distribution are subject to the
|
|
// Boost Software License, Version 1.0.
|
|
// (See accompanying file LICENSE_1_0.txt
|
|
// or copy at http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
// This file is written to be included from a Quickbook .qbk document.
|
|
// It can still be compiled by the C++ compiler, and run.
|
|
// Any output can also be added here as comment or included or pasted in elsewhere.
|
|
// Caution: this file contains Quickbook markup as well as code
|
|
// and comments: don't change any of the special comment markups!
|
|
|
|
// Examples of using the geometric distribution.
|
|
|
|
//[geometric_eg1_1
|
|
/*`
|
|
For this example, we will opt to #define two macros to control
|
|
the error and discrete handling policies.
|
|
For this simple example, we want to avoid throwing
|
|
an exception (the default policy) and just return infinity.
|
|
We want to treat the distribution as if it was continuous,
|
|
so we choose a discrete_quantile policy of real,
|
|
rather than the default policy integer_round_outwards.
|
|
*/
|
|
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
|
|
#define BOOST_MATH_DISCRETE_QUANTILE_POLICY real
|
|
/*`
|
|
[caution It is vital to #include distributions etc *after* the above #defines]
|
|
After that we need some includes to provide easy access to the negative binomial distribution,
|
|
and we need some std library iostream, of course.
|
|
*/
|
|
#include <boost/math/distributions/geometric.hpp>
|
|
// for geometric_distribution
|
|
using ::boost::math::geometric_distribution; //
|
|
using ::boost::math::geometric; // typedef provides default type is double.
|
|
using ::boost::math::pdf; // Probability mass function.
|
|
using ::boost::math::cdf; // Cumulative density function.
|
|
using ::boost::math::quantile;
|
|
|
|
#include <boost/math/distributions/negative_binomial.hpp>
|
|
// for negative_binomial_distribution
|
|
using boost::math::negative_binomial; // typedef provides default type is double.
|
|
|
|
#include <boost/math/distributions/normal.hpp>
|
|
// for negative_binomial_distribution
|
|
using boost::math::normal; // typedef provides default type is double.
|
|
|
|
#include <iostream>
|
|
using std::cout; using std::endl;
|
|
using std::noshowpoint; using std::fixed; using std::right; using std::left;
|
|
#include <iomanip>
|
|
using std::setprecision; using std::setw;
|
|
|
|
#include <limits>
|
|
using std::numeric_limits;
|
|
//] [geometric_eg1_1]
|
|
|
|
int main()
|
|
{
|
|
cout <<"Geometric distribution example" << endl;
|
|
cout << endl;
|
|
|
|
cout.precision(4); // But only show a few for this example.
|
|
try
|
|
{
|
|
//[geometric_eg1_2
|
|
/*`
|
|
It is always sensible to use try and catch blocks because defaults policies are to
|
|
throw an exception if anything goes wrong.
|
|
|
|
Simple try'n'catch blocks (see below) will ensure that you get a
|
|
helpful error message instead of an abrupt (and silent) program abort.
|
|
|
|
[h6 Throwing a dice]
|
|
The Geometric distribution describes the probability (/p/) of a number of failures
|
|
to get the first success in /k/ Bernoulli trials.
|
|
(A [@http://en.wikipedia.org/wiki/Bernoulli_distribution Bernoulli trial]
|
|
is one with only two possible outcomes, success of failure,
|
|
and /p/ is the probability of success).
|
|
|
|
Suppose an 'fair' 6-face dice is thrown repeatedly:
|
|
*/
|
|
double success_fraction = 1./6; // success_fraction (p) = 0.1666
|
|
// (so failure_fraction is 1 - success_fraction = 5./6 = 1- 0.1666 = 0.8333)
|
|
|
|
/*`If the dice is thrown repeatedly until the *first* time a /three/ appears.
|
|
The probablility distribution of the number of times it is thrown *not* getting a /three/
|
|
(/not-a-threes/ number of failures to get a /three/)
|
|
is a geometric distribution with the success_fraction = 1/6 = 0.1666[recur].
|
|
|
|
We therefore start by constructing a geometric distribution
|
|
with the one parameter success_fraction, the probability of success.
|
|
*/
|
|
geometric g6(success_fraction); // type double by default.
|
|
/*`
|
|
To confirm, we can echo the success_fraction parameter of the distribution.
|
|
*/
|
|
cout << "success fraction of a six-sided dice is " << g6.success_fraction() << endl;
|
|
/*`So the probability of getting a three at the first throw (zero failures) is
|
|
*/
|
|
cout << pdf(g6, 0) << endl; // 0.1667
|
|
cout << cdf(g6, 0) << endl; // 0.1667
|
|
/*`Note that the cdf and pdf are identical because the is only one throw.
|
|
If we want the probability of getting the first /three/ on the 2nd throw:
|
|
*/
|
|
cout << pdf(g6, 1) << endl; // 0.1389
|
|
|
|
/*`If we want the probability of getting the first /three/ on the 1st or 2nd throw
|
|
(allowing one failure):
|
|
*/
|
|
cout << "pdf(g6, 0) + pdf(g6, 1) = " << pdf(g6, 0) + pdf(g6, 1) << endl;
|
|
/*`Or more conveniently, and more generally,
|
|
we can use the Cumulative Distribution Function CDF.*/
|
|
|
|
cout << "cdf(g6, 1) = " << cdf(g6, 1) << endl; // 0.3056
|
|
|
|
/*`If we allow many more (12) throws, the probability of getting our /three/ gets very high:*/
|
|
cout << "cdf(g6, 12) = " << cdf(g6, 12) << endl; // 0.9065 or 90% probability.
|
|
/*`If we want to be much more confident, say 99%,
|
|
we can estimate the number of throws to be this sure
|
|
using the inverse or quantile.
|
|
*/
|
|
cout << "quantile(g6, 0.99) = " << quantile(g6, 0.99) << endl; // 24.26
|
|
/*`Note that the value returned is not an integer:
|
|
if you want an integer result you should use either floor, round or ceil functions,
|
|
or use the policies mechanism.
|
|
|
|
See __understand_dis_quant.
|
|
|
|
The geometric distribution is related to the negative binomial
|
|
__spaces `negative_binomial_distribution(RealType r, RealType p);` with parameter /r/ = 1.
|
|
So we could get the same result using the negative binomial,
|
|
but using the geometric the results will be faster, and may be more accurate.
|
|
*/
|
|
negative_binomial nb(1, success_fraction);
|
|
cout << pdf(nb, 1) << endl; // 0.1389
|
|
cout << cdf(nb, 1) << endl; // 0.3056
|
|
/*`We could also the complement to express the required probability
|
|
as 1 - 0.99 = 0.01 (and get the same result):
|
|
*/
|
|
cout << "quantile(complement(g6, 1 - p)) " << quantile(complement(g6, 0.01)) << endl; // 24.26
|
|
/*`
|
|
Note too that Boost.Math geometric distribution is implemented as a continuous function.
|
|
Unlike other implementations (for example R) it *uses* the number of failures as a *real* parameter,
|
|
not as an integer. If you want this integer behaviour, you may need to enforce this by
|
|
rounding the parameter you pass, probably rounding down, to the nearest integer.
|
|
For example, R returns the success fraction probability for all values of failures
|
|
from 0 to 0.999999 thus:
|
|
[pre
|
|
__spaces R> formatC(pgeom(0.0001,0.5, FALSE), digits=17) " 0.5"
|
|
] [/pre]
|
|
So in Boost.Math the equivalent is
|
|
*/
|
|
geometric g05(0.5); // Probability of success = 0.5 or 50%
|
|
// Output all potentially significant digits for the type, here double.
|
|
|
|
#ifdef BOOST_NO_CXX11_NUMERIC_LIMITS
|
|
int max_digits10 = 2 + (boost::math::policies::digits<double, boost::math::policies::policy<> >() * 30103UL) / 100000UL;
|
|
cout << "BOOST_NO_CXX11_NUMERIC_LIMITS is defined" << endl;
|
|
#else
|
|
int max_digits10 = std::numeric_limits<double>::max_digits10;
|
|
#endif
|
|
cout << "Show all potentially significant decimal digits std::numeric_limits<double>::max_digits10 = "
|
|
<< max_digits10 << endl;
|
|
cout.precision(max_digits10); //
|
|
|
|
cout << cdf(g05, 0.0001) << endl; // returns 0.5000346561579232, not exact 0.5.
|
|
/*`To get the R discrete behaviour, you simply need to round with,
|
|
for example, the `floor` function.
|
|
*/
|
|
cout << cdf(g05, floor(0.0001)) << endl; // returns exactly 0.5
|
|
/*`
|
|
[pre
|
|
`> formatC(pgeom(0.9999999,0.5, FALSE), digits=17) [1] " 0.25"`
|
|
`> formatC(pgeom(1.999999,0.5, FALSE), digits=17)[1] " 0.25" k = 1`
|
|
`> formatC(pgeom(1.9999999,0.5, FALSE), digits=17)[1] "0.12500000000000003" k = 2`
|
|
] [/pre]
|
|
shows that R makes an arbitrary round-up decision at about 1e7 from the next integer above.
|
|
This may be convenient in practice, and could be replicated in C++ if desired.
|
|
|
|
[h6 Surveying customers to find one with a faulty product]
|
|
A company knows from warranty claims that 2% of their products will be faulty,
|
|
so the 'success_fraction' of finding a fault is 0.02.
|
|
It wants to interview a purchaser of faulty products to assess their 'user experience'.
|
|
|
|
To estimate how many customers they will probably need to contact
|
|
in order to find one who has suffered from the fault,
|
|
we first construct a geometric distribution with probability 0.02,
|
|
and then chose a confidence, say 80%, 95%, or 99% to finding a customer with a fault.
|
|
Finally, we probably want to round up the result to the integer above using the `ceil` function.
|
|
(We could also use a policy, but that is hardly worthwhile for this simple application.)
|
|
|
|
(This also assumes that each customer only buys one product:
|
|
if customers bought more than one item,
|
|
the probability of finding a customer with a fault obviously improves.)
|
|
*/
|
|
cout.precision(5);
|
|
geometric g(0.02); // On average, 2 in 100 products are faulty.
|
|
double c = 0.95; // 95% confidence.
|
|
cout << " quantile(g, " << c << ") = " << quantile(g, c) << endl;
|
|
|
|
cout << "To be " << c * 100
|
|
<< "% confident of finding we customer with a fault, need to survey "
|
|
<< ceil(quantile(g, c)) << " customers." << endl; // 148
|
|
c = 0.99; // Very confident.
|
|
cout << "To be " << c * 100
|
|
<< "% confident of finding we customer with a fault, need to survey "
|
|
<< ceil(quantile(g, c)) << " customers." << endl; // 227
|
|
c = 0.80; // Only reasonably confident.
|
|
cout << "To be " << c * 100
|
|
<< "% confident of finding we customer with a fault, need to survey "
|
|
<< ceil(quantile(g, c)) << " customers." << endl; // 79
|
|
|
|
/*`[h6 Basket Ball Shooters]
|
|
According to Wikipedia, average pro basket ball players get
|
|
[@http://en.wikipedia.org/wiki/Free_throw free throws]
|
|
in the baskets 70 to 80 % of the time,
|
|
but some get as high as 95%, and others as low as 50%.
|
|
Suppose we want to compare the probabilities
|
|
of failing to get a score only on the first or on the fifth shot?
|
|
To start we will consider the average shooter, say 75%.
|
|
So we construct a geometric distribution
|
|
with success_fraction parameter 75/100 = 0.75.
|
|
*/
|
|
cout.precision(2);
|
|
geometric gav(0.75); // Shooter averages 7.5 out of 10 in the basket.
|
|
/*`What is probability of getting 1st try in the basket, that is with no failures? */
|
|
cout << "Probability of score on 1st try = " << pdf(gav, 0) << endl; // 0.75
|
|
/*`This is, of course, the success_fraction probability 75%.
|
|
What is the probability that the shooter only scores on the fifth shot?
|
|
So there are 5-1 = 4 failures before the first success.*/
|
|
cout << "Probability of score on 5th try = " << pdf(gav, 4) << endl; // 0.0029
|
|
/*`Now compare this with the poor and the best players success fraction.
|
|
We need to constructing new distributions with the different success fractions,
|
|
and then get the corresponding probability density functions values:
|
|
*/
|
|
geometric gbest(0.95);
|
|
cout << "Probability of score on 5th try = " << pdf(gbest, 4) << endl; // 5.9e-6
|
|
geometric gmediocre(0.50);
|
|
cout << "Probability of score on 5th try = " << pdf(gmediocre, 4) << endl; // 0.031
|
|
/*`So we can see the very much smaller chance (0.000006) of 4 failures by the best shooters,
|
|
compared to the 0.03 of the mediocre.*/
|
|
|
|
/*`[h6 Estimating failures]
|
|
Of course one man's failure is an other man's success.
|
|
So a fault can be defined as a 'success'.
|
|
|
|
If a fault occurs once after 100 flights, then one might naively say
|
|
that the risk of fault is obviously 1 in 100 = 1/100, a probability of 0.01.
|
|
|
|
This is the best estimate we can make, but while it is the truth,
|
|
it is not the whole truth,
|
|
for it hides the big uncertainty when estimating from a single event.
|
|
"One swallow doesn't make a summer."
|
|
To show the magnitude of the uncertainty, the geometric
|
|
(or the negative binomial) distribution can be used.
|
|
|
|
If we chose the popular 95% confidence in the limits, corresponding to an alpha of 0.05,
|
|
because we are calculating a two-sided interval, we must divide alpha by two.
|
|
*/
|
|
double alpha = 0.05;
|
|
double k = 100; // So frequency of occurrence is 1/100.
|
|
cout << "Probability is failure is " << 1/k << endl;
|
|
double t = geometric::find_lower_bound_on_p(k, alpha/2);
|
|
cout << "geometric::find_lower_bound_on_p(" << int(k) << ", " << alpha/2 << ") = "
|
|
<< t << endl; // 0.00025
|
|
t = geometric::find_upper_bound_on_p(k, alpha/2);
|
|
cout << "geometric::find_upper_bound_on_p(" << int(k) << ", " << alpha/2 << ") = "
|
|
<< t << endl; // 0.037
|
|
/*`So while we estimate the probability is 0.01, it might lie between 0.0003 and 0.04.
|
|
Even if we relax our confidence to alpha = 90%, the bounds only contract to 0.0005 and 0.03.
|
|
And if we require a high confidence, they widen to 0.00005 to 0.05.
|
|
*/
|
|
alpha = 0.1; // 90% confidence.
|
|
t = geometric::find_lower_bound_on_p(k, alpha/2);
|
|
cout << "geometric::find_lower_bound_on_p(" << int(k) << ", " << alpha/2 << ") = "
|
|
<< t << endl; // 0.0005
|
|
t = geometric::find_upper_bound_on_p(k, alpha/2);
|
|
cout << "geometric::find_upper_bound_on_p(" << int(k) << ", " << alpha/2 << ") = "
|
|
<< t << endl; // 0.03
|
|
|
|
alpha = 0.01; // 99% confidence.
|
|
t = geometric::find_lower_bound_on_p(k, alpha/2);
|
|
cout << "geometric::find_lower_bound_on_p(" << int(k) << ", " << alpha/2 << ") = "
|
|
<< t << endl; // 5e-005
|
|
t = geometric::find_upper_bound_on_p(k, alpha/2);
|
|
cout << "geometric::find_upper_bound_on_p(" << int(k) << ", " << alpha/2 << ") = "
|
|
<< t << endl; // 0.052
|
|
/*`In real life, there will usually be more than one event (fault or success),
|
|
when the negative binomial, which has the neccessary extra parameter, will be needed.
|
|
*/
|
|
|
|
/*`As noted above, using a catch block is always a good idea,
|
|
even if you hope not to use it!
|
|
*/
|
|
}
|
|
catch(const std::exception& e)
|
|
{ // Since we have set an overflow policy of ignore_error,
|
|
// an overflow exception should never be thrown.
|
|
std::cout << "\nMessage from thrown exception was:\n " << e.what() << std::endl;
|
|
/*`
|
|
For example, without a ignore domain error policy,
|
|
if we asked for ``pdf(g, -1)`` for example,
|
|
we would get an unhelpful abort, but with a catch:
|
|
[pre
|
|
Message from thrown exception was:
|
|
Error in function boost::math::pdf(const exponential_distribution<double>&, double):
|
|
Number of failures argument is -1, but must be >= 0 !
|
|
] [/pre]
|
|
*/
|
|
//] [/ geometric_eg1_2]
|
|
}
|
|
return 0;
|
|
} // int main()
|
|
|
|
|
|
/*
|
|
Output is:
|
|
|
|
Geometric distribution example
|
|
|
|
success fraction of a six-sided dice is 0.1667
|
|
0.1667
|
|
0.1667
|
|
0.1389
|
|
pdf(g6, 0) + pdf(g6, 1) = 0.3056
|
|
cdf(g6, 1) = 0.3056
|
|
cdf(g6, 12) = 0.9065
|
|
quantile(g6, 0.99) = 24.26
|
|
0.1389
|
|
0.3056
|
|
quantile(complement(g6, 1 - p)) 24.26
|
|
0.5000346561579232
|
|
0.5
|
|
quantile(g, 0.95) = 147.28
|
|
To be 95% confident of finding we customer with a fault, need to survey 148 customers.
|
|
To be 99% confident of finding we customer with a fault, need to survey 227 customers.
|
|
To be 80% confident of finding we customer with a fault, need to survey 79 customers.
|
|
Probability of score on 1st try = 0.75
|
|
Probability of score on 5th try = 0.0029
|
|
Probability of score on 5th try = 5.9e-006
|
|
Probability of score on 5th try = 0.031
|
|
Probability is failure is 0.01
|
|
geometric::find_lower_bound_on_p(100, 0.025) = 0.00025
|
|
geometric::find_upper_bound_on_p(100, 0.025) = 0.037
|
|
geometric::find_lower_bound_on_p(100, 0.05) = 0.00051
|
|
geometric::find_upper_bound_on_p(100, 0.05) = 0.03
|
|
geometric::find_lower_bound_on_p(100, 0.005) = 5e-005
|
|
geometric::find_upper_bound_on_p(100, 0.005) = 0.052
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|