#include "Stats.h"
2
3//-----------------------------------------------------------------------------
4
5// If you want to compute these two statistics, uncomment the code and link with
6// the GSL library.
7
8/*
9extern "C"
10{
11 double gsl_sf_gamma_inc_P(const double a, const double x);
12 double gsl_sf_gamma_inc_Q(const double a, const double x);
13};
14
15// P-val for a set of binomial distributions
16
17void pval_binomial ( int * buckets, int len, int n, double p, double & sdev, double & pval )
18{
19 double c = 0;
20
21 double u = n*p;
22 double s = sqrt(n*p*(1-p));
23
24 for(int i = 0; i < len; i++)
25 {
26 double x = buckets[i];
27
28 double n = (x-u)/s;
29
30 c += n*n;
31 }
32
33 sdev = sqrt(c / len);
34
35 pval = gsl_sf_gamma_inc_P( len/2, c/2 );
36}
37
38// P-val for a histogram - K keys distributed between N buckets
39// Note the (len-1) due to the degree-of-freedom reduction
40
41void pval_pearson ( int * buckets, int len, int keys, double & sdev, double & pval )
42{
43 double c = 0;
44
45 double n = keys;
46 double p = 1.0 / double(len);
47
48 double u = n*p;
49 double s = sqrt(n*p*(1-p));
50
51 for(int i = 0; i < len; i++)
52 {
53 double x = buckets[i];
54
55 double n = (x-u)/s;
56
57 c += n*n;
58 }
59
60 sdev = sqrt(c / len);
61
62 pval = gsl_sf_gamma_inc_P( (len-1)/2, c/2 );
63}
64*/
65
66//----------------------------------------------------------------------------
67
68double erf2 ( double x )
69{
70 const double a1 = 0.254829592;
71 const double a2 = -0.284496736;
72 const double a3 = 1.421413741;
73 const double a4 = -1.453152027;
74 const double a5 = 1.061405429;
75 const double p = 0.3275911;
76
77 double sign = 1;
78 if(x < 0) sign = -1;
79
80 x = abs(x);
81
82 double t = 1.0/(1.0 + p*x);
83 double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x);
84
85 return sign*y;
86}
87
88double normal_cdf ( double u, double s2, double x )
89{
90 x = (x - u) / sqrt(2*s2);
91
92 double c = (1 + erf2(x)) / 2;
93
94 return c;
95}
96
97double binom_cdf ( double n, double p, double k )
98{
99 double u = n*p;
100 double s2 = n*p*(1-p);
101
102 return normal_cdf(u,s2,k);
103}
104
105// return the probability that a random variable from distribution A is greater than a random variable from distribution B
106
107double comparenorms ( double uA, double sA, double uB, double sB )
108{
109 double c = 1.0 - normal_cdf(uA-uB,sA*sA+sB*sB,0);
110
111 return c;
112}
113
114// convert beta distribution to normal distribution approximation
115
116void beta2norm ( double a, double b, double & u, double & s )
117{
118 u = a / (a+b);
119
120 double t1 = a*b;
121 double t2 = a+b;
122 double t3 = t2*t2*(t2+1);
123
124 s = sqrt( t1 / t3 );
125}
126
127#pragma warning(disable : 4189)
128
129double comparecoins ( double hA, double tA, double hB, double tB )
130{
131 double uA,sA,uB,sB;
132
133 beta2norm(hA+1,tA+1,uA,sA);
134 beta2norm(hB+1,tB+1,uB,sB);
135
136 // this is not the right way to handle the discontinuity at 0.5, but i don't want to deal with truncated normal distributions...
137
138 if(uA < 0.5) uA = 1.0 - uA;
139 if(uB < 0.5) uB = 1.0 - uB;
140
141 return 1.0 - comparenorms(uA,sA,uB,sB);
142}
143
144// Binomial distribution using the normal approximation
145
146double binom2 ( double n, double p, double k )
147{
148 double u = n*p;
149 double s2 = n*p*(1-p);
150
151 double a = k-u;
152
153 const double pi = 3.14159265358979323846264338327950288419716939937510;
154
155 a = a*a / (-2.0*s2);
156 a = exp(a) / sqrt(s2*2.0*pi);
157
158 return a;
159}
160
// Expected lookup work per key when 'keycount' keys fall uniformly at random
// into 'bucketcount' buckets, where scanning a bucket that holds m keys costs
// 0 + 1 + ... + m = m*(m+1)/2 probe steps in total. The sum over the bucket-
// occupancy distribution is normalized by the average load to give per-key work.

double RandWork ( double bucketcount, double keycount )
{
  double avgload = keycount / bucketcount;

  double total = 0;

  if(avgload <= 16)
  {
    // if the load is low enough we can compute the expected work directly

    // p starts as the binomial probability of an empty bucket:
    // P(0) = ((bucketcount-1)/bucketcount)^keycount
    double p = pow((bucketcount-1)/bucketcount,keycount);

    double work = 0;

    for(double i = 0; i < 50; i++)
    {
      // After this add, work == 0+1+...+i == i*(i+1)/2 - the scan cost of a
      // bucket holding i keys.
      work += i;
      total += work * p;

      // Advance p from P(i) to P(i+1) using the binomial pmf ratio:
      // P(i+1)/P(i) = (keycount-i) / ((i+1)*(bucketcount-1))
      p *= (keycount-i) / ( (i+1) * (bucketcount-1) );
    }
  }
  else
  {
    // otherwise precision errors screw up the calculation, and so we fall back
    // to the normal approximation to the binomial distribution

    // Only sum where the pmf has non-negligible mass - loads within a factor
    // of 5 of the average.
    double min = avgload / 5.0;
    double max = avgload * 5.0;

    for(double i = min; i <= max; i++)
    {
      double p = binom2(keycount,1.0 / bucketcount,i);

      // (i*i+i)/2 is the same i*(i+1)/2 scan cost as in the exact branch.
      total += double((i*i+i) / 2) * p;
    }
  }

  return total / avgload;
}
201
202// Normalized standard deviation.
203
204double nsdev ( int * buckets, int len, int keys )
205{
206 double n = len;
207 double k = keys;
208 double p = 1.0/n;
209
210 double u = k*p;
211 double s = sqrt(k*p*(1-p));
212
213 double c = 0;
214
215 for(int i = 0; i < len; i++)
216 {
217 double d = buckets[i];
218
219 d = (d-u)/s;
220
221 c += d*d;
222 }
223
224 double nsd = sqrt(c / n);
225
226 return nsd;
227}
228
229
// Binomial coefficient C(n,k), computed as a product of ratios in floating
// point to postpone overflow.

double chooseK ( int n, int k )
{
  // Exploit the symmetry C(n,k) == C(n,n-k) to minimize iterations.
  if(k > (n - k)) k = n - k;

  double result = 1;

  // Multiply the k factors (n-i)/(i+1) for i = 0 .. k-1.
  for(int i = 0; i < k; i++)
  {
    const double factor = double(n-i) / double(i+1);

    result *= factor;
  }

  return result;
}
245
246double chooseUpToK ( int n, int k )
247{
248 double c = 0;
249
250 for(int i = 1; i <= k; i++)
251 {
252 c += chooseK(n,i);
253 }
254
255 return c;
256}
257
258//-----------------------------------------------------------------------------
259
260uint32_t bitrev ( uint32_t v )
261{
262 v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
263 v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
264 v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
265 v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
266 v = ( v >> 16 ) | ( v << 16);
267
268 return v;
269}
270
271//-----------------------------------------------------------------------------
272
273// Distribution "score"
274// TODO - big writeup of what this score means
275
276// Basically, we're computing a constant that says "The test distribution is as
277// uniform, RMS-wise, as a random distribution restricted to (1-X)*100 percent of
278// the bins. This makes for a nice uniform way to rate a distribution that isn't
279// dependent on the number of bins or the number of keys
280
281// (as long as # keys > # bins * 3 or so, otherwise random fluctuations show up
282// as distribution weaknesses)
283
284double calcScore ( std::vector<int> const & bins, int keys )
285{
286 double n = (int)bins.size();
287 double k = keys;
288
289 // compute rms value
290
291 double r = 0;
292
293 for(size_t i = 0; i < bins.size(); i++)
294 {
295 double b = bins[i];
296
297 r += b*b;
298 }
299
300 r = sqrt(r / n);
301
302 // compute fill factor
303
304 double f = (k*k - 1) / (n*r*r - k);
305
306 // rescale to (0,1) with 0 = good, 1 = bad
307
308 return 1 - (f / n);
309}
310
311
312//----------------------------------------------------------------------------
313
314void plot ( double n )
315{
316 double n2 = n * 1;
317
318 if(n2 < 0) n2 = 0;
319
320 n2 *= 100;
321
322 if(n2 > 64) n2 = 64;
323
324 int n3 = (int)floor(n2 + 0.5);
325
326 if(n3 == 0)
327 printf(".");
328 else
329 {
330 char x = '0' + char(n3);
331
332 if(x > '9') x = 'X';
333
334 printf("%c",x);
335 }
336}
337
338//-----------------------------------------------------------------------------