tanjent@gmail.com | 7e5c363 | 2010-11-02 00:50:04 +0000 | [diff] [blame^] | 1 | #include "Stats.h"
|
| 2 |
|
| 3 | //-----------------------------------------------------------------------------
|
| 4 |
|
| 5 | // If you want to compute these two statistics, uncomment the code and link with
|
| 6 | // the GSL library.
|
| 7 |
|
| 8 | /*
|
| 9 | extern "C"
|
| 10 | {
|
| 11 | double gsl_sf_gamma_inc_P(const double a, const double x);
|
| 12 | double gsl_sf_gamma_inc_Q(const double a, const double x);
|
| 13 | };
|
| 14 |
|
| 15 | // P-val for a set of binomial distributions
|
| 16 |
|
| 17 | void pval_binomial ( int * buckets, int len, int n, double p, double & sdev, double & pval )
|
| 18 | {
|
| 19 | double c = 0;
|
| 20 |
|
| 21 | double u = n*p;
|
| 22 | double s = sqrt(n*p*(1-p));
|
| 23 |
|
| 24 | for(int i = 0; i < len; i++)
|
| 25 | {
|
| 26 | double x = buckets[i];
|
| 27 |
|
| 28 | double n = (x-u)/s;
|
| 29 |
|
| 30 | c += n*n;
|
| 31 | }
|
| 32 |
|
| 33 | sdev = sqrt(c / len);
|
| 34 |
|
| 35 | pval = gsl_sf_gamma_inc_P( len/2, c/2 );
|
| 36 | }
|
| 37 |
|
| 38 | // P-val for a histogram - K keys distributed between N buckets
|
| 39 | // Note the (len-1) due to the degree-of-freedom reduction
|
| 40 |
|
| 41 | void pval_pearson ( int * buckets, int len, int keys, double & sdev, double & pval )
|
| 42 | {
|
| 43 | double c = 0;
|
| 44 |
|
| 45 | double n = keys;
|
| 46 | double p = 1.0 / double(len);
|
| 47 |
|
| 48 | double u = n*p;
|
| 49 | double s = sqrt(n*p*(1-p));
|
| 50 |
|
| 51 | for(int i = 0; i < len; i++)
|
| 52 | {
|
| 53 | double x = buckets[i];
|
| 54 |
|
| 55 | double n = (x-u)/s;
|
| 56 |
|
| 57 | c += n*n;
|
| 58 | }
|
| 59 |
|
| 60 | sdev = sqrt(c / len);
|
| 61 |
|
| 62 | pval = gsl_sf_gamma_inc_P( (len-1)/2, c/2 );
|
| 63 | }
|
| 64 | */
|
| 65 |
|
| 66 | //----------------------------------------------------------------------------
|
| 67 |
|
// Approximate the error function erf(x) using Abramowitz & Stegun
// formula 7.1.26 (max absolute error ~1.5e-7). Used to compute normal
// CDFs without linking a math library beyond <cmath>.

double erf2 ( double x )
{
  // polynomial coefficients for the rational approximation
  const double a1 =  0.254829592;
  const double a2 = -0.284496736;
  const double a3 =  1.421413741;
  const double a4 = -1.453152027;
  const double a5 =  1.061405429;
  const double p  =  0.3275911;

  // erf is an odd function: erf(-x) == -erf(x)
  double sign = 1;
  if(x < 0) sign = -1;

  // BUGFIX: was abs(x) — with only the C headers in scope that is the
  // integer overload, which truncates the argument to an int before
  // taking the absolute value. fabs keeps full double precision.
  x = fabs(x);

  double t = 1.0/(1.0 + p*x);
  double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x);

  return sign*y;
}
|
| 87 |
|
| 88 | double normal_cdf ( double u, double s2, double x )
|
| 89 | {
|
| 90 | x = (x - u) / sqrt(2*s2);
|
| 91 |
|
| 92 | double c = (1 + erf2(x)) / 2;
|
| 93 |
|
| 94 | return c;
|
| 95 | }
|
| 96 |
|
| 97 | double binom_cdf ( double n, double p, double k )
|
| 98 | {
|
| 99 | double u = n*p;
|
| 100 | double s2 = n*p*(1-p);
|
| 101 |
|
| 102 | return normal_cdf(u,s2,k);
|
| 103 | }
|
| 104 |
|
| 105 | // return the probability that a random variable from distribution A is greater than a random variable from distribution B
|
| 106 |
|
| 107 | double comparenorms ( double uA, double sA, double uB, double sB )
|
| 108 | {
|
| 109 | double c = 1.0 - normal_cdf(uA-uB,sA*sA+sB*sB,0);
|
| 110 |
|
| 111 | return c;
|
| 112 | }
|
| 113 |
|
| 114 | // convert beta distribution to normal distribution approximation
|
| 115 |
|
// Approximate a Beta(a,b) distribution by a normal with matching
// moments: mean u = a/(a+b), variance ab / ((a+b)^2 (a+b+1)).
// Outputs the mean in u and the standard deviation in s.
void beta2norm ( double a, double b, double & u, double & s )
{
  const double sum = a + b;

  u = a / sum;
  s = sqrt( (a*b) / (sum * sum * (sum + 1)) );
}
|
| 126 |
|
| 127 | #pragma warning(disable : 4189)
|
| 128 |
|
| 129 | double comparecoins ( double hA, double tA, double hB, double tB )
|
| 130 | {
|
| 131 | double uA,sA,uB,sB;
|
| 132 |
|
| 133 | beta2norm(hA+1,tA+1,uA,sA);
|
| 134 | beta2norm(hB+1,tB+1,uB,sB);
|
| 135 |
|
| 136 | // this is not the right way to handle the discontinuity at 0.5, but i don't want to deal with truncated normal distributions...
|
| 137 |
|
| 138 | if(uA < 0.5) uA = 1.0 - uA;
|
| 139 | if(uB < 0.5) uB = 1.0 - uB;
|
| 140 |
|
| 141 | return 1.0 - comparenorms(uA,sA,uB,sB);
|
| 142 | }
|
| 143 |
|
| 144 | // Binomial distribution using the normal approximation
|
| 145 |
|
// Binomial(n,p) PMF evaluated at k, using the normal (Gaussian)
// approximation: density exp(-d^2 / 2s^2) / sqrt(2*pi*s^2) where
// d = k - np and s^2 = np(1-p).
double binom2 ( double n, double p, double k )
{
  const double pi = 3.14159265358979323846264338327950288419716939937510;

  const double mean     = n*p;
  const double variance = n*p*(1-p);

  const double d = k - mean;

  return exp( (d*d) / (-2.0*variance) ) / sqrt(variance*2.0*pi);
}
|
| 160 |
|
| 161 | double RandWork ( double bucketcount, double keycount )
|
| 162 | {
|
| 163 | double avgload = keycount / bucketcount;
|
| 164 |
|
| 165 | double total = 0;
|
| 166 |
|
| 167 | if(avgload <= 16)
|
| 168 | {
|
| 169 | // if the load is low enough we can compute the expected work directly
|
| 170 |
|
| 171 | double p = pow((bucketcount-1)/bucketcount,keycount);
|
| 172 |
|
| 173 | double work = 0;
|
| 174 |
|
| 175 | for(double i = 0; i < 50; i++)
|
| 176 | {
|
| 177 | work += i;
|
| 178 | total += work * p;
|
| 179 |
|
| 180 | p *= (keycount-i) / ( (i+1) * (bucketcount-1) );
|
| 181 | }
|
| 182 | }
|
| 183 | else
|
| 184 | {
|
| 185 | // otherwise precision errors screw up the calculation, and so we fall back
|
| 186 | // to the normal approxmation to the binomial distribution
|
| 187 |
|
| 188 | double min = avgload / 5.0;
|
| 189 | double max = avgload * 5.0;
|
| 190 |
|
| 191 | for(double i = min; i <= max; i++)
|
| 192 | {
|
| 193 | double p = binom2(keycount,1.0 / bucketcount,i);
|
| 194 |
|
| 195 | total += double((i*i+i) / 2) * p;
|
| 196 | }
|
| 197 | }
|
| 198 |
|
| 199 | return total / avgload;
|
| 200 | }
|
| 201 |
|
| 202 | // Normalized standard deviation.
|
| 203 |
|
// Normalized standard deviation: the RMS of each bucket's z-score under
// the binomial model of keys thrown uniformly into len buckets.
double nsdev ( int * buckets, int len, int keys )
{
  const double n = len;
  const double k = keys;
  const double p = 1.0 / n;

  const double mean  = k*p;
  const double sigma = sqrt(k*p*(1-p));

  double sumsq = 0;

  for(int i = 0; i < len; i++)
  {
    const double z = (buckets[i] - mean) / sigma;

    sumsq += z*z;
  }

  return sqrt(sumsq / n);
}
|
| 228 |
|
| 229 |
|
// n choose k, computed iteratively in floating point so large n doesn't
// overflow an integer type. Returns an approximation of C(n,k).
double chooseK ( int n, int k )
{
  // Out-of-range k selects nothing. (Previously k > n or k < 0 fell
  // through the loop and incorrectly returned 1.)
  if(k < 0 || k > n) return 0;

  // Exploit the symmetry C(n,k) == C(n,n-k) to shorten the loop.
  if(k > (n - k)) k = n - k;

  double c = 1;

  for(int i = 0; i < k; i++)
  {
    // Multiply by (n-i)/(i+1); interleaving multiply and divide keeps
    // the intermediate values small.
    double t = double(n-i) / double(i+1);

    c *= t;
  }

  return c;
}
|
| 245 |
|
| 246 | double chooseUpToK ( int n, int k )
|
| 247 | {
|
| 248 | double c = 0;
|
| 249 |
|
| 250 | for(int i = 1; i <= k; i++)
|
| 251 | {
|
| 252 | c += chooseK(n,i);
|
| 253 | }
|
| 254 |
|
| 255 | return c;
|
| 256 | }
|
| 257 |
|
| 258 | //-----------------------------------------------------------------------------
|
| 259 |
|
// Reverse the bits of a 32-bit word by swapping progressively larger
// groups: adjacent bits, then pairs, nibbles, bytes, and finally the
// two halfwords.
uint32_t bitrev ( uint32_t v )
{
  const uint32_t masks[4] = { 0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF };

  for(int i = 0, shift = 1; i < 4; i++, shift <<= 1)
  {
    const uint32_t m = masks[i];

    v = ((v >> shift) & m) | ((v & m) << shift);
  }

  return (v >> 16) | (v << 16);
}
|
| 270 |
|
| 271 | //-----------------------------------------------------------------------------
|
| 272 |
|
| 273 | // Distribution "score"
|
| 274 | // TODO - big writeup of what this score means
|
| 275 |
|
| 276 | // Basically, we're computing a constant that says "The test distribution is as
|
| 277 | // uniform, RMS-wise, as a random distribution restricted to (1-X)*100 percent of
|
| 278 | // the bins. This makes for a nice uniform way to rate a distribution that isn't
|
| 279 | // dependent on the number of bins or the number of keys
|
| 280 |
|
| 281 | // (as long as # keys > # bins * 3 or so, otherwise random fluctuations show up
|
| 282 | // as distribution weaknesses)
|
| 283 |
|
// Distribution "score" for a histogram of keys over bins.size() bins —
// see the writeup above. 0 = indistinguishable from uniform random,
// 1 = bad.
double calcScore ( std::vector<int> const & bins, int keys )
{
  const double n = (int)bins.size();
  const double k = keys;

  // root-mean-square of the bin counts

  double sumsq = 0;

  for(size_t i = 0; i < bins.size(); i++)
  {
    const double b = bins[i];

    sumsq += b*b;
  }

  const double r = sqrt(sumsq / n);

  // fill factor

  const double f = (k*k - 1) / (n*r*r - k);

  // rescale to (0,1) with 0 = good, 1 = bad

  return 1 - (f / n);
}
|
| 310 |
|
| 311 |
|
| 312 | //----------------------------------------------------------------------------
|
| 313 |
|
// Print a one-character visualization of n: '.' when it rounds to zero,
// digits for increasing magnitude, and 'X' once it rounds past '9'.
// The value is clamped to [0, 0.64] before scaling by 100.
void plot ( double n )
{
  double scaled = n;

  if(scaled < 0) scaled = 0;

  scaled *= 100;

  if(scaled > 64) scaled = 64;

  // round to nearest integer level
  const int level = (int)floor(scaled + 0.5);

  if(level == 0)
  {
    printf(".");
  }
  else
  {
    char c = '0' + char(level);

    if(c > '9') c = 'X';

    printf("%c",c);
  }
}
|
| 337 |
|
| 338 | //-----------------------------------------------------------------------------
|