mirror of
				https://github.com/saitohirga/WSJT-X.git
				synced 2025-10-20 15:40:24 -04:00 
			
		
		
		
	
		
			
	
	
		
			230 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			230 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|  | [section:hypergeometric_dist Hypergeometric Distribution] | ||
|  | 
 | ||
|  | ``#include <boost/math/distributions/hypergeometric.hpp>`` | ||
|  | 
 | ||
|  |    namespace boost{ namespace math{ | ||
|  | 
 | ||
|  |    template <class RealType = double, | ||
|  |              class ``__Policy``   = ``__policy_class`` > | ||
|  |    class hypergeometric_distribution; | ||
|  | 
 | ||
|  |    template <class RealType, class Policy> | ||
|  |    class hypergeometric_distribution | ||
|  |    { | ||
|  |    public: | ||
|  |       typedef RealType value_type; | ||
|  |       typedef Policy   policy_type; | ||
|  |       // Construct: | ||
|  |       hypergeometric_distribution(unsigned r, unsigned n, unsigned N); | ||
|  |       // Accessors: | ||
|  |       unsigned total()const; | ||
|  |       unsigned defective()const; | ||
|  |       unsigned sample_count()const; | ||
|  |    }; | ||
|  | 
 | ||
|  |    typedef hypergeometric_distribution<> hypergeometric; | ||
|  | 
 | ||
|  |    }} // namespaces | ||
|  | 
 | ||
|  | The hypergeometric distribution describes the number of "events" /k/ | ||
|  | from a sample /n/ drawn from a total population /N/ ['without replacement]. | ||
|  | 
 | ||
|  | Imagine we have a sample of /N/ objects of which /r/ are "defective" | ||
|  | and N-r are "not defective" | ||
|  | (the terms "success\/failure" or "red\/blue" are also used).  If we sample /n/ | ||
|  | items /without replacement/ then what is the probability that exactly | ||
|  | /k/ items in the sample are defective?  The answer is given by the pdf of the | ||
|  | hypergeometric distribution `f(k; r, n, N)`, whilst the probability of | ||
|  | /k/ defectives or fewer is given by F(k; r, n, N), where F(k) is the | ||
|  | CDF of the hypergeometric distribution. | ||
|  | 
 | ||
|  | [note Unlike almost all of the other distributions in this library, | ||
|  | the hypergeometric distribution is strictly discrete: it can not be | ||
|  | extended to real valued arguments of its parameters or random variable.] | ||
|  | 
 | ||
|  | The following graph shows how the distribution changes as the proportion | ||
|  | of "defective" items changes, while keeping the population and sample sizes | ||
|  | constant: | ||
|  | 
 | ||
|  | [graph hypergeometric_pdf_1] | ||
|  | 
 | ||
|  | Note that since the distribution is symmetrical in parameters /n/ and /r/, if we | ||
|  | change the sample size and keep the population and proportion "defective" the same | ||
|  | then we obtain basically the same graphs: | ||
|  | 
 | ||
|  | [graph hypergeometric_pdf_2] | ||
|  | 
 | ||
|  | [h4 Member Functions] | ||
|  | 
 | ||
|  |    hypergeometric_distribution(unsigned r, unsigned n, unsigned N); | ||
|  | 
 | ||
|  | Constructs a hypergeometric distribution with a population of /N/ objects, | ||
|  | of which /r/ are defective, and from which /n/ are sampled. | ||
|  | 
 | ||
|  |    unsigned total()const; | ||
|  | 
 | ||
|  | Returns the total number of objects /N/. | ||
|  | 
 | ||
|  |    unsigned defective()const; | ||
|  | 
 | ||
|  | Returns the number of objects /r/ in population /N/ which are defective. | ||
|  | 
 | ||
|  |    unsigned sample_count()const; | ||
|  | 
 | ||
|  | Returns the number of objects /n/ which are sampled from the population /N/. | ||
|  | 
 | ||
|  | [h4 Non-member Accessors] | ||
|  | 
 | ||
|  | All the [link math_toolkit.dist_ref.nmp usual non-member accessor functions] | ||
|  | that are generic to all distributions are supported: __usual_accessors. | ||
|  | 
 | ||
|  | The domain of the random variable is the unsigned integers in the range | ||
|  | \[max(0, n + r - N), min(n, r)\].  A __domain_error is raised if the | ||
|  | random variable is outside this range, or is not an integral value. | ||
|  | 
 | ||
|  | [caution | ||
|  | The quantile function will by default return an integer result that has been | ||
|  | /rounded outwards/.  That is to say lower quantiles (where the probability is | ||
|  | less than 0.5) are rounded downward, and upper quantiles (where the probability | ||
|  | is greater than 0.5) are rounded upwards.  This behaviour | ||
|  | ensures that if an X% quantile is requested, then /at least/ the requested | ||
|  | coverage will be present in the central region, and /no more than/ | ||
|  | the requested coverage will be present in the tails. | ||
|  | 
 | ||
|  | This behaviour can be changed so that the quantile functions are rounded | ||
|  | differently using | ||
|  | [link math_toolkit.pol_overview Policies].  It is strongly | ||
|  | recommended that you read the tutorial | ||
|  | [link math_toolkit.pol_tutorial.understand_dis_quant | ||
|  | Understanding Quantiles of Discrete Distributions] before | ||
|  | using the quantile function on the Hypergeometric distribution.  The | ||
|  | [link math_toolkit.pol_ref.discrete_quant_ref reference docs] | ||
|  | describe how to change the rounding policy | ||
|  | for these distributions. | ||
|  | 
 | ||
|  | However, note that the implementation method of the quantile function | ||
|  | always returns an integral value, therefore attempting to use a __Policy | ||
|  | that requires (or produces) a real valued result will result in a | ||
|  | compile time error. | ||
|  | ] [/ caution] | ||
|  | 
 | ||
|  | 
 | ||
|  | [h4 Accuracy] | ||
|  | 
 | ||
|  | For small N such that | ||
|  | `N < boost::math::max_factorial<RealType>::value` then table based | ||
|  | lookup of the results gives an accuracy to a few epsilon. | ||
|  | `boost::math::max_factorial<RealType>::value` is 170 at double or long double | ||
|  | precision. | ||
|  | 
 | ||
|  | For larger N such that `N < boost::math::prime(boost::math::max_prime)` | ||
|  | then only basic arithmetic is required for the calculation | ||
|  | and the accuracy is typically < 20 epsilon.  This takes care of N | ||
|  | up to 104729. | ||
|  | 
 | ||
|  | For `N > boost::math::prime(boost::math::max_prime)` then accuracy quickly | ||
|  | degrades, with 5 or 6 decimal digits being lost for N = 110000. | ||
|  | 
 | ||
|  | In general for very large N, the user should expect to lose log[sub 10]N | ||
|  | decimal digits of precision during the calculation, with the results | ||
|  | becoming meaningless for N >= 10[super 15]. | ||
|  | 
 | ||
|  | [h4 Testing] | ||
|  | 
 | ||
|  | There are three sets of tests: our implementation is tested against a table of values | ||
|  | produced by Mathematica's implementation of this distribution. We also sanity check | ||
|  | our implementation against some spot values computed using the online calculator | ||
|  | here [@http://stattrek.com/Tables/Hypergeometric.aspx http://stattrek.com/Tables/Hypergeometric.aspx]. | ||
|  | Finally we test accuracy against some high precision test data using | ||
|  | this implementation and NTL::RR. | ||
|  | 
 | ||
|  | [h4 Implementation] | ||
|  | 
 | ||
|  | The PDF can be calculated directly using the formula: | ||
|  | 
 | ||
|  | [equation hypergeometric1] | ||
|  | 
 | ||
|  | However, this can only be used directly when the largest of the factorials | ||
|  | is guaranteed not to overflow the floating point representation used. | ||
|  | This formula is used directly when `N < max_factorial<RealType>::value` | ||
|  | in which case table lookup of the factorials gives a rapid and accurate | ||
|  | implementation method. | ||
|  | 
 | ||
|  | For larger /N/ the method described in | ||
|  | "An Accurate Computation of the Hypergeometric Distribution Function", | ||
|  | Trong Wu, ACM Transactions on Mathematical Software, Vol. 19, No. 1, | ||
|  | March 1993, Pages 33-43 is used.  The method relies on the fact that | ||
|  | there is an easy method for factorising a factorial into the product | ||
|  | of prime numbers: | ||
|  | 
 | ||
|  | [equation hypergeometric2] | ||
|  | 
 | ||
|  | Where p[sub i] is the i'th prime number, and e[sub i] is a small | ||
|  | positive integer or zero, which can be calculated via: | ||
|  | 
 | ||
|  | [equation hypergeometric3] | ||
|  | 
 | ||
|  | Further we can combine the factorials in the expression for the PDF | ||
|  | to yield the PDF directly as the product of prime numbers: | ||
|  | 
 | ||
|  | [equation hypergeometric4] | ||
|  | 
 | ||
|  | With this time the exponents e[sub i] being either positive, negative | ||
|  | or zero.  Indeed such a degree of cancellation occurs in the calculation | ||
|  | of the e[sub i] that many are zero, and typically most have a magnitude | ||
|  | or no more than 1 or 2. | ||
|  | 
 | ||
|  | Calculation of the product of the primes requires some care to prevent | ||
|  | numerical overflow, we use a novel recursive method which splits the | ||
|  | calculation into a series of sub-products, with a new sub-product | ||
|  | started each time the next multiplication would cause either overflow | ||
|  | or underflow.  The sub-products are stored in a linked list on the | ||
|  | program stack, and combined in an order that will guarantee no overflow | ||
|  | or unnecessary-underflow once the last sub-product has been calculated. | ||
|  | 
 | ||
|  | This method can be used as long as N is smaller than the largest prime | ||
|  | number we have stored in our table of primes (currently 104729).  The method | ||
|  | is relatively slow (calculating the exponents requires the most time), but | ||
|  | requires only a small number of arithmetic operations to | ||
|  | calculate the result (indeed there is no shorter method involving only basic | ||
|  | arithmetic once the exponents have been found), the method is therefore | ||
|  | much more accurate than the alternatives. | ||
|  | 
 | ||
|  | For much larger N, we can calculate the PDF from the factorials using | ||
|  | either lgamma, or by directly combining lanczos approximations to avoid | ||
|  | calculating via logarithms.  We use the latter method, as it is usually | ||
|  | 1 or 2 decimal digits more accurate than computing via logarithms with | ||
|  | lgamma.  However, in this area where N > 104729, the user should expect | ||
|  | to lose around log[sub 10]N decimal digits during the calculation in | ||
|  | the worst case. | ||
|  | 
 | ||
|  | The CDF and its complement is calculated by directly summing the PDF's. | ||
|  | We start by deciding whether the CDF, or its complement, is likely to be | ||
|  | the smaller of the two and then calculate the PDF at /k/ (or /k+1/ if we're | ||
|  | calculating the complement) and calculate successive PDF values via the | ||
|  | recurrence relations: | ||
|  | 
 | ||
|  | [equation hypergeometric5] | ||
|  | 
 | ||
|  | Until we either reach the end of the distributions domain, or the next | ||
|  | PDF value to be summed would be too small to affect the result. | ||
|  | 
 | ||
|  | The quantile is calculated in a similar manner to the CDF: we first guess | ||
|  | which end of the distribution we're nearer to, and then sum PDFs starting | ||
|  | from the end of the distribution this time, until we have some value /k/ that | ||
|  | gives the required CDF. | ||
|  | 
 | ||
|  | The median is simply the quantile at 0.5, and the remaining properties are | ||
|  | calculated via: | ||
|  | 
 | ||
|  | [equation hypergeometric6] | ||
|  | 
 | ||
|  | [endsect] | ||
|  | 
 | ||
|  | [/ hypergeometric.qbk | ||
|  |   Copyright 2008 John Maddock. | ||
|  |   Distributed under the Boost Software License, Version 1.0. | ||
|  |   (See accompanying file LICENSE_1_0.txt or copy at | ||
|  |   http://www.boost.org/LICENSE_1_0.txt). | ||
|  | ] |