tanjent@gmail.com | 7e5c363 | 2010-11-02 00:50:04 +0000 | [diff] [blame^] | 1 | #include "Stats.h"
|
| 2 |
|
| 3 | //-----------------------------------------------------------------------------
|
| 4 |
|
| 5 | // If you want to compute these two statistics, uncomment the code and link with
|
| 6 | // the GSL library.
|
| 7 |
|
| 8 | /*
|
| 9 | extern "C"
|
| 10 | {
|
| 11 | double gsl_sf_gamma_inc_P(const double a, const double x);
|
| 12 | double gsl_sf_gamma_inc_Q(const double a, const double x);
|
| 13 | };
|
| 14 |
|
| 15 | // P-val for a set of binomial distributions
|
| 16 |
|
| 17 | void pval_binomial ( int * buckets, int len, int n, double p, double & sdev, double & pval )
|
| 18 | {
|
| 19 | double c = 0;
|
| 20 |
|
| 21 | double u = n*p;
|
| 22 | double s = sqrt(n*p*(1-p));
|
| 23 |
|
| 24 | for(int i = 0; i < len; i++)
|
| 25 | {
|
| 26 | double x = buckets[i];
|
| 27 |
|
| 28 | double n = (x-u)/s;
|
| 29 |
|
| 30 | c += n*n;
|
| 31 | }
|
| 32 |
|
| 33 | sdev = sqrt(c / len);
|
| 34 |
|
| 35 | pval = gsl_sf_gamma_inc_P( len/2, c/2 );
|
| 36 | }
|
| 37 |
|
| 38 | // P-val for a histogram - K keys distributed between N buckets
|
| 39 | // Note the (len-1) due to the degree-of-freedom reduction
|
| 40 |
|
| 41 | void pval_pearson ( int * buckets, int len, int keys, double & sdev, double & pval )
|
| 42 | {
|
| 43 | double c = 0;
|
| 44 |
|
| 45 | double n = keys;
|
| 46 | double p = 1.0 / double(len);
|
| 47 |
|
| 48 | double u = n*p;
|
| 49 | double s = sqrt(n*p*(1-p));
|
| 50 |
|
| 51 | for(int i = 0; i < len; i++)
|
| 52 | {
|
| 53 | double x = buckets[i];
|
| 54 |
|
| 55 | double n = (x-u)/s;
|
| 56 |
|
| 57 | c += n*n;
|
| 58 | }
|
| 59 |
|
| 60 | sdev = sqrt(c / len);
|
| 61 |
|
| 62 | pval = gsl_sf_gamma_inc_P( (len-1)/2, c/2 );
|
| 63 | }
|
| 64 | */
|
| 65 |
|
| 66 | //----------------------------------------------------------------------------
|
| 67 |
|
// Approximate the error function erf(x) using Abramowitz & Stegun
// formula 7.1.26 (max absolute error ~1.5e-7). Used to compute normal
// CDFs without linking a math library beyond <cmath>.

double erf2 ( double x )
{
  // polynomial coefficients for the rational approximation
  const double a1 =  0.254829592;
  const double a2 = -0.284496736;
  const double a3 =  1.421413741;
  const double a4 = -1.453152027;
  const double a5 =  1.061405429;
  const double p  =  0.3275911;

  // erf is an odd function: erf(-x) == -erf(x)
  double sign = 1;
  if(x < 0) sign = -1;

  // BUGFIX: was abs(x) — with only the C headers in scope that is the
  // integer overload, which truncates the argument to an int before
  // taking the absolute value. fabs keeps full double precision.
  x = fabs(x);

  double t = 1.0/(1.0 + p*x);
  double y = 1.0 - (((((a5*t + a4)*t) + a3)*t + a2)*t + a1)*t*exp(-x*x);

  return sign*y;
}
|
| 87 |
|
| 88 | double normal_cdf ( double u, double s2, double x )
|
| 89 | {
|
| 90 | x = (x - u) / sqrt(2*s2);
|
| 91 |
|
| 92 | double c = (1 + erf2(x)) / 2;
|
| 93 |
|
| 94 | return c;
|
| 95 | }
|
| 96 |
|
| 97 | double binom_cdf ( double n, double p, double k )
|
| 98 | {
|
| 99 | double u = n*p;
|
| 100 | double s2 = n*p*(1-p);
|
| 101 |
|
| 102 | return normal_cdf(u,s2,k);
|
| 103 | }
|
| 104 |
|
| 105 | // return the probability that a random variable from distribution A is greater than a random variable from distribution B
|
| 106 |
|
| 107 | double comparenorms ( double uA, double sA, double uB, double sB )
|
| 108 | {
|
| 109 | double c = 1.0 - normal_cdf(uA-uB,sA*sA+sB*sB,0);
|
| 110 |
|
| 111 | return c;
|
| 112 | }
|
| 113 |
|
| 114 | // convert beta distribution to normal distribution approximation
|
| 115 |
|
// Approximate a Beta(a,b) distribution by a normal with matching
// moments: mean u = a/(a+b), variance ab / ((a+b)^2 (a+b+1)).
// Outputs the mean in u and the standard deviation in s.
void beta2norm ( double a, double b, double & u, double & s )
{
  const double sum = a + b;

  u = a / sum;
  s = sqrt( (a*b) / (sum * sum * (sum + 1)) );
}
|
| 126 |
|
| 127 | #pragma warning(disable : 4189)
|
| 128 |
|
| 129 | double comparecoins ( double hA, double tA, double hB, double tB )
|
| 130 | {
|
| 131 | double uA,sA,uB,sB;
|
| 132 |
|
| 133 | beta2norm(hA+1,tA+1,uA,sA);
|
| 134 | beta2norm(hB+1,tB+1,uB,sB);
|
| 135 |
|
| 136 | // this is not the right way to handle the discontinuity at 0.5, but i don't want to deal with truncated normal distributions...
|
| 137 |
|
| 138 | if(uA < 0.5) uA = 1.0 - uA;
|
| 139 | if(uB < 0.5) uB = 1.0 - uB;
|
| 140 |
|
| 141 | return 1.0 - comparenorms(uA,sA,uB,sB);
|
| 142 | }
|
| 143 |
|
| 144 | // Binomial distribution using the normal approximation
|
| 145 |
|
// Binomial(n,p) PMF evaluated at k, using the normal (Gaussian)
// approximation: density exp(-d^2 / 2s^2) / sqrt(2*pi*s^2) where
// d = k - np and s^2 = np(1-p).
double binom2 ( double n, double p, double k )
{
  const double pi = 3.14159265358979323846264338327950288419716939937510;

  const double mean     = n*p;
  const double variance = n*p*(1-p);

  const double d = k - mean;

  return exp( (d*d) / (-2.0*variance) ) / sqrt(variance*2.0*pi);
}
|
| 160 |
|
| 161 | double RandWork ( double bucketcount, double keycount )
|
| 162 | {
|
| 163 | double avgload = keycount / bucketcount;
|
| 164 |
|
| 165 | double total = 0;
|
| 166 |
|
| 167 | if(avgload <= 16)
|
| 168 | {
|
| 169 | // if the load is low enough we can compute the expected work directly
|
| 170 |
|
| 171 | double p = pow((bucketcount-1)/bucketcount,keycount);
|
| 172 |
|
| 173 | double work = 0;
|
| 174 |
|
| 175 | for(double i = 0; i < 50; i++)
|
| 176 | {
|
| 177 | work += i;
|
| 178 | total += work * p;
|
| 179 |
|
| 180 | p *= (keycount-i) / ( (i+1) * (bucketcount-1) );
|
| 181 | }
|
| 182 | }
|
| 183 | else
|
| 184 | {
|
| 185 | // otherwise precision errors screw up the calculation, and so we fall back
|
| 186 | // to the normal approxmation to the binomial distribution
|
| 187 |
|
| 188 | double min = avgload / 5.0;
|
| 189 | double max = avgload * 5.0;
|
| 190 |
|
| 191 | for(double i = min; i <= max; i++)
|
| 192 | {
|
| 193 | double p = binom2(keycount,1.0 / bucketcount,i);
|
| 194 |
|
| 195 | total += double((i*i+i) / 2) * p;
|
| 196 | }
|
| 197 | }
|
| 198 |
|
| 199 | return total / avgload;
|
| 200 | }
|
| 201 |
|
| 202 | // Normalized standard deviation.
|
| 203 |
|
// Normalized standard deviation: the RMS of each bucket's z-score under
// the binomial model of keys thrown uniformly into len buckets.
double nsdev ( int * buckets, int len, int keys )
{
  const double n = len;
  const double k = keys;
  const double p = 1.0 / n;

  const double mean  = k*p;
  const double sigma = sqrt(k*p*(1-p));

  double sumsq = 0;

  for(int i = 0; i < len; i++)
  {
    const double z = (buckets[i] - mean) / sigma;

    sumsq += z*z;
  }

  return sqrt(sumsq / n);
}
|
| 228 |
|
| 229 |
|
// n choose k, computed iteratively in floating point so large n doesn't
// overflow an integer type. Returns an approximation of C(n,k).
double chooseK ( int n, int k )
{
  // Out-of-range k selects nothing. (Previously k > n or k < 0 fell
  // through the loop and incorrectly returned 1.)
  if(k < 0 || k > n) return 0;

  // Exploit the symmetry C(n,k) == C(n,n-k) to shorten the loop.
  if(k > (n - k)) k = n - k;

  double c = 1;

  for(int i = 0; i < k; i++)
  {
    // Multiply by (n-i)/(i+1); interleaving multiply and divide keeps
    // the intermediate values small.
    double t = double(n-i) / double(i+1);

    c *= t;
  }

  return c;
}
|
| 245 |
|
| 246 | double chooseUpToK ( int n, int k )
|
| 247 | {
|
| 248 | double c = 0;
|
| 249 |
|
| 250 | for(int i = 1; i <= k; i++)
|
| 251 | {
|
| 252 | c += chooseK(n,i);
|
| 253 | }
|
| 254 |
|
| 255 | return c;
|
| 256 | }
|
| 257 |
|
| 258 | //-----------------------------------------------------------------------------
|
| 259 |
|
// Reverse the bits of a 32-bit word by swapping progressively larger
// groups: adjacent bits, then pairs, nibbles, bytes, and finally the
// two halfwords.
uint32_t bitrev ( uint32_t v )
{
  const uint32_t masks[4] = { 0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF };

  for(int i = 0, shift = 1; i < 4; i++, shift <<= 1)
  {
    const uint32_t m = masks[i];

    v = ((v >> shift) & m) | ((v & m) << shift);
  }

  return (v >> 16) | (v << 16);
}
|
| 270 |
|
| 271 | //-----------------------------------------------------------------------------
|
| 272 |
|
| 273 | // Distribution "score"
|
| 274 | // TODO - big writeup of what this score means
|
| 275 |
|
| 276 | // Basically, we're computing a constant that says "The test distribution is as
|
| 277 | // uniform, RMS-wise, as a random distribution restricted to (1-X)*100 percent of
|
| 278 | // the bins. This makes for a nice uniform way to rate a distribution that isn't
|
| 279 | // dependent on the number of bins or the number of keys
|
| 280 |
|
| 281 | // (as long as # keys > # bins * 3 or so, otherwise random fluctuations show up
|
| 282 | // as distribution weaknesses)
|
| 283 |
|
// Distribution "score" for a histogram of keys over bins.size() bins —
// see the writeup above. 0 = indistinguishable from uniform random,
// 1 = bad.
double calcScore ( std::vector<int> const & bins, int keys )
{
  const double n = (int)bins.size();
  const double k = keys;

  // root-mean-square of the bin counts

  double sumsq = 0;

  for(size_t i = 0; i < bins.size(); i++)
  {
    const double b = bins[i];

    sumsq += b*b;
  }

  const double r = sqrt(sumsq / n);

  // fill factor

  const double f = (k*k - 1) / (n*r*r - k);

  // rescale to (0,1) with 0 = good, 1 = bad

  return 1 - (f / n);
}
|
| 310 |
|
| 311 |
|
| 312 | //----------------------------------------------------------------------------
|
| 313 |
|
// Print a one-character visualization of n: '.' when it rounds to zero,
// digits for increasing magnitude, and 'X' once it rounds past '9'.
// The value is clamped to [0, 0.64] before scaling by 100.
void plot ( double n )
{
  double scaled = n;

  if(scaled < 0) scaled = 0;

  scaled *= 100;

  if(scaled > 64) scaled = 64;

  // round to nearest integer level
  const int level = (int)floor(scaled + 0.5);

  if(level == 0)
  {
    printf(".");
  }
  else
  {
    char c = '0' + char(level);

    if(c > '9') c = 'X';

    printf("%c",c);
  }
}
|
| 337 |
|
| 338 | //-----------------------------------------------------------------------------
|