BayesianAnalysisToolkit: BCMath.cxx Source File

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2008-2012, Daniel Kollar, Kevin Kroeninger and Jing Liu
00003  * All rights reserved.
00004  *
00005  * For the licensing terms see doc/COPYING.
00006  */
00007 
00008 // ---------------------------------------------------------
00009 
00010 #include "BCMath.h"
00011 #include "BCLog.h"
00012 
00013 #include <math.h>
00014 #include <limits>
00015 #include <set>
00016 
00017 #include <TMath.h>
00018 #include <TF1.h>
00019 #include <TH1D.h>
00020 
00021 #include <Math/PdfFuncMathCore.h>
00022 
00023 namespace BCMath {
00024 
00025 static unsigned int nCacheFact = 10000;
00026 static double * logfact = 0;
00027 
00028 // ---------------------------------------------------------
00029 
00030 double LogGaus(double x, double mean, double sigma, bool norm)
00031 {
00032    // if we have a delta function, return fixed value
00033    if (sigma == 0.)
00034       return 0;
00035 
00036    // if sigma is negative use absolute value
00037    if (sigma < 0.)
00038       sigma *= -1.;
00039 
00040    double arg = (x - mean) / sigma;
00041    double result = -.5 * arg * arg;
00042 
00043    // check if we should add the normalization constant
00044    if (!norm)
00045       return result;
00046 
00047    // subtract the log of the denominator of the normalization constant
00048    // and return
00049    return result - log(sqrt(2. * M_PI) * sigma);
00050 }
00051 
00052 // ---------------------------------------------------------
00053 
00054 double LogPoisson(double x, double par)
00055 {
00056    if (par > 899)
00057       return LogGaus(x, par, sqrt(par), true);
00058 
00059    if (x < 0)
00060       return 0;
00061 
00062    if (x == 0.)
00063       return -par;
00064 
00065    return x * log(par) - par - ApproxLogFact(x);
00066 }
00067 
00068 // ---------------------------------------------------------
00069 
00070 double ApproxBinomial(int n, int k, double p)
00071 {
00072    return exp(LogApproxBinomial(n, k, p));
00073 }
00074 
00075 // ---------------------------------------------------------
00076 
00077 double LogApproxBinomial(int n, int k, double p)
00078 {
00079    // check p
00080    if (p == 0)
00081       return -1e99;
00082 
00083    else if (p == 1)
00084       return 0;
00085 
00086    // switch parameters if n < k
00087    if (n < k) {
00088       int a = n;
00089       n = k;
00090       k = a;
00091    }
00092 
00093    return LogBinomFactor(n, k) + (double) k * log(p) + (double) (n - k) * log(1. - p);
00094 }
00095 
00096 // ---------------------------------------------------------
00097 
00098 double LogBinomFactor(int n, int k)
00099 {
00100    // switch parameters if n < k
00101    if (n < k) {
00102       int a = n;
00103       n = k;
00104       k = a;
00105    }
00106 
00107    if (n == k || k == 0)
00108       return 0.;
00109    if (k == 1 || k == n - 1)
00110       return log((double) n);
00111 
00112    // if no approximation needed
00113    if ( n < BCMATH_NFACT_ALIMIT || (n < (int) BCMath::nCacheFact &&  (n - k) < 10) )
00114       return LogNoverK(n, k);
00115 
00116    // calculate final log(n over k) using approximations if necessary
00117    return ApproxLogFact((double)n) - ApproxLogFact((double)k) - ApproxLogFact((double)(n - k));
00118 }
00119 
00120 // ---------------------------------------------------------
00121 
00122 double ApproxLogFact(double x)
00123 {
00124    if (x > BCMath::nCacheFact)
00125       return x * log(x) - x + log(x * (1. + 4. * x * (1. + 2. * x))) / 6. + log(M_PI) / 2.;
00126 
00127    else
00128       return LogFact((int) x);
00129 }
00130 
00131 // ---------------------------------------------------------
00132 double LogFact(int n)
00133 {
00134    // return NaN for negative argument
00135    if (n<0)
00136       return std::numeric_limits<double>::quiet_NaN();
00137 
00138    // cache the factorials on first call
00139    if ( !BCMath::logfact) {
00140       BCMath::logfact = new double[BCMath::nCacheFact+1];
00141       double tmplogfact = 0;
00142       BCMath::logfact[0] = tmplogfact;
00143       for (unsigned int i=1; i<=BCMath::nCacheFact; i++) {
00144          tmplogfact += log((double) i);
00145          BCMath::logfact[i] = tmplogfact;
00146       }
00147    }
00148 
00149    // return cached value if available
00150    if (n <= (int) BCMath::nCacheFact)
00151       return BCMath::logfact[n];
00152 
00153    // calculate factorial starting from the highest cached value
00154    double ln(0.);
00155    if (BCMath::logfact)
00156       ln = BCMath::logfact[nCacheFact];
00157 
00158    for (int i = BCMath::nCacheFact+1; i <= n; i++)
00159       ln += log((double) i);
00160 
00161    return ln;
00162 }
00163 
00164 // ---------------------------------------------------------
00165 
00166 void CacheFactorial(unsigned int n)
00167 {
00168    nCacheFact = n;
00169 }
00170 
00171 // ---------------------------------------------------------
00172 
00173 double LogNoverK(int n, int k)
00174 {
00175    // switch parameters if n < k
00176    if (n < k) {
00177       int a = n;
00178       n = k;
00179       k = a;
00180    }
00181 
00182    if (n == k || k == 0)
00183       return 0.;
00184    if (k == 1 || k == n - 1)
00185       return log((double) n);
00186 
00187    int lmax = Max(k, n - k);
00188    int lmin = Min(k, n - k);
00189 
00190    double ln = 0.;
00191 
00192    for (int i = n; i > lmax; i--)
00193       ln += log((double) i);
00194    ln -= LogFact(lmin);
00195 
00196    return ln;
00197 }
00198 
00199 // ---------------------------------------------------------
00200 
00201 int Nint(double x)
00202 {
00203    // round to integer
00204    int i;
00205 
00206    if (x >= 0) {
00207       i = (int) (x + .5);
00208       if (x + .5 == (double) i && (i&1))
00209          i--;
00210    }
00211    else {
00212       i = int(x - 0.5);
00213       if (x - 0.5 == double(i) && (i&1))
00214          i++;
00215    }
00216 
00217    return i;
00218 }
00219 
00220 // ---------------------------------------------------------
00221 
00222 double rms(int n, const double *a)
00223 {
00224    if (n <= 0 || !a)
00225       return 0;
00226 
00227    double sum = 0., sum2 = 0.;
00228 
00229    for (int i = 0; i < n; i++) {
00230       sum += a[i];
00231       sum2 += a[i] * a[i];
00232    }
00233 
00234    double n1 = 1. / (double) n;
00235    double mean = sum * n1;
00236 
00237    return sqrt(fabs(sum2 * n1 - mean * mean));
00238 }
00239 
00240 // ---------------------------------------------------------
00241 
00242 double LogBreitWignerNonRel(double x, double mean, double Gamma, bool norm)
00243 {
00244    double bw = log(Gamma) - log((x - mean) * (x - mean) + Gamma*Gamma / 4.);
00245 
00246    if (norm)
00247       bw -= log(2. * M_PI);
00248 
00249    return bw;
00250 }
00251 
00252 // ---------------------------------------------------------
00253 
00254 double LogBreitWignerRel(double x, double mean, double Gamma)
00255 {
00256    return -log((x*x - mean*mean) * (x*x - mean*mean) + mean*mean * Gamma*Gamma);
00257 }
00258 
00259 // ---------------------------------------------------------
00260 
00261 double LogChi2(double x, int n)
00262 {
00263    if (x < 0) {
00264       BCLog::OutWarning("BCMath::LogChi2 : parameter cannot be negative!");
00265       return -1e99;
00266    }
00267 
00268    if (x == 0 && n == 1) {
00269       BCLog::OutWarning("BCMath::LogChi2 : returned value is infinity!");
00270       return 1e99;
00271    }
00272 
00273    double nOver2 = ((double) n) / 2.;
00274 
00275    return (nOver2 - 1.) * log(x) - x / 2. - nOver2 * log(2) - log(TMath::Gamma(nOver2));
00276 }
00277 
00278 // ---------------------------------------------------------
00279 double LogVoigtian(double x, double sigma, double gamma)
00280 {
00281    if (sigma <= 0 || gamma <= 0) {
00282       BCLog::OutWarning("BCMath::LogVoigtian : widths are negative or zero!");
00283       return -1e99;
00284    }
00285 
00286    return log(TMath::Voigt(x, sigma, gamma));
00287 }
00288 
00289 // ---------------------------------------------------------
00290 double SplitGaussian(double* x, double* par)
00291 {
00292    double mean = par[0]; 
00293    double sigmadown = par[1]; 
00294    double sigmaup = par[2];
00295 
00296    double sigma = sigmadown;
00297 
00298    if (x[0] > mean)
00299       sigma = sigmaup; 
00300    
00301    return 1.0/sqrt(2.0*M_PI)/sigma * exp(- (x[0]-mean)*(x[0]-mean)/2./sigma/sigma);
00302 }
00303 
00304 // ---------------------------------------------------------
00305 // wrapper with signature to construct a TF1
00306 double chi2(double *x, double *par)
00307 {
00308    return ROOT::Math::chisquared_pdf(x[0], par[0]);
00309 }
00310 
00311 // ---------------------------------------------------------
00312 void RandomChi2(std::vector<double> &randoms, int K)
00313 {
00314    // fixed upper cutoff to 1000, might be too small
00315    TF1 *f = new TF1("chi2", chi2, 0.0, 1000, 1);
00316    f->SetParameter(0, K);
00317    f->SetNpx(500);
00318    // uses inverse-transform method
00319    // fortunately CDF only built once
00320    for (unsigned int i = 0; i < randoms.size(); i++)
00321       randoms.at(i) = f->GetRandom();
00322 
00323    delete f;
00324 }
00325 
00326 // ---------------------------------------------------------
00327 TH1D * ECDF(const std::vector<double> & data)
00328 {
00329    int N = data.size();
00330 
00331    std::set<double> uniqueObservations;
00332    // sort and filter out multiple instances
00333    for (int i = 0; i < N; ++i)
00334       uniqueObservations.insert(data[i]);
00335 
00336    // extract lower edges for CDF histogram
00337    unsigned nUnique = uniqueObservations.size();
00338    std::vector<double> lowerEdges(nUnique);
00339 
00340    // traverse the set
00341    std::set<double>::iterator iter;
00342    int counter = 0;
00343    for (iter = uniqueObservations.begin(); iter != uniqueObservations.end(); ++iter) {
00344       lowerEdges[counter] = *iter;
00345       counter++;
00346    }
00347 
00348    // create histogram where
00349    // lower edge of first bin = min. data
00350    // upper edge of last bin = max. data
00351    TH1D * ECDF = new TH1D("ECDF", "Empirical cumulative distribution function", nUnique - 1, &lowerEdges[0]);
00352 
00353    // fill the data in to find multiplicities
00354    for (int i = 0; i < N; ++i)
00355       ECDF -> Fill(data[i]);
00356 
00357    // just in case, empty the underflow
00358    ECDF -> SetBinContent(0, 0.0);
00359 
00360    // construct the ecdf
00361    for (int nBin = 1; nBin <= ECDF->GetNbinsX(); nBin++) {
00362       double previousBin = ECDF -> GetBinContent(nBin - 1);
00363       // BCLog::OutDebug(Form("n_%d = %.2f", nBin, ECDF -> GetBinContent(nBin) ));
00364       // BCLog::OutDebug(Form("previous_%d = %.2f", nBin, previousBin));
00365       double thisBin = ECDF -> GetBinContent(nBin) / double(N);
00366       ECDF -> SetBinContent(nBin, thisBin + previousBin);
00367 
00368       // the uncertainty is only correctly estimated in the model
00369       ECDF -> SetBinError(nBin, 0.0);
00370    }
00371 
00372    // set the endpoint to 1, so all larger values are at CDF=1
00373    ECDF -> SetBinContent(ECDF->GetNbinsX() + 1, 1.);
00374 
00375    // adjust for nice plotting
00376    ECDF -> SetMinimum(0.);
00377    ECDF -> SetMaximum(1.);
00378 
00379    return ECDF;
00380 }
00381 
00382 // ---------------------------------------------------------
00383 
00384 std::vector<int> longestRuns(const std::vector<bool> &bitStream)
00385 {
00386    // initialize counter variables
00387    unsigned int maxRunAbove, maxRunBelow, currRun;
00388    maxRunAbove = 0;
00389    maxRunBelow = 0;
00390    currRun = 1;
00391    // set both entries to zero
00392    std::vector<int> runs(2, 0);
00393 
00394    if (bitStream.empty())
00395       return runs;
00396 
00397    // flag about kind of the currently considered run
00398    bool aboveRun = bitStream.at(0);
00399 
00400    // start at second variable
00401    std::vector<bool>::const_iterator iter = bitStream.begin();
00402    ++iter;
00403    while (iter != bitStream.end()) {
00404 
00405       // increase counter if run continues
00406       if (*(iter - 1) == *iter)
00407          currRun++;
00408       else {
00409          // compare terminated run to maximum
00410          if (aboveRun)
00411             maxRunAbove = Max(maxRunAbove, currRun);
00412          else
00413             maxRunBelow = Max(maxRunBelow, currRun);
00414          // set flag to run of opposite kind
00415          aboveRun = !aboveRun;
00416          // restart at length one
00417          currRun = 1;
00418       }
00419       // move to next bit
00420       ++iter;
00421    }
00422 
00423    // check last run
00424    if (aboveRun)
00425       maxRunAbove = Max(maxRunAbove, currRun);
00426    else
00427       maxRunBelow = Max(maxRunBelow, currRun);
00428 
00429    // save the longest runs
00430    runs.at(0) = maxRunBelow;
00431    runs.at(1) = maxRunAbove;
00432 
00433    return runs;
00434 }
00435 // ---------------------------------------------------------
00436 
00437 std::vector<double> longestRunsChi2(
00438       const std::vector<double>& yMeasured,
00439       const std::vector<double>& yExpected, const std::vector<double>& sigma)
00440 {
00441    //initialize counter variables
00442    double maxRunAbove, maxRunBelow, currRun;
00443    maxRunAbove = 0;
00444    maxRunBelow = 0;
00445    currRun = 0;
00446    //set both entries to zero
00447    std::vector<double> runs(2, 0);
00448 
00449    //check input size
00450    if (yMeasured.size() != yExpected.size() || yMeasured.size() != sigma.size()
00451          || yExpected.size() != sigma.size()) {
00452       //should throw exception
00453       return runs;
00454    }
00455 
00456    //exclude zero uncertainty
00457    //...
00458 
00459     int N = yMeasured.size();
00460     if ( N<=0)
00461        return runs;
00462    //BCLog::OutDebug(Form("N = %d", N));
00463 
00464 
00465    //flag about kind of the currently considered run
00466    double residue = (yMeasured.at(0) - yExpected.at(0)) / sigma.at(0);
00467    bool aboveRun = residue >= 0 ? true : false;
00468    currRun = residue * residue;
00469 
00470    //start at second variable
00471    for (int i = 1; i < N; i++) {
00472       residue = (yMeasured.at(i) - yExpected.at(i)) / sigma.at(i);
00473       //run continues
00474       if ((residue >= 0) == aboveRun) {
00475          currRun += residue * residue;
00476       } else {
00477          //compare terminated run to maximum
00478          if (aboveRun)
00479             maxRunAbove = Max(maxRunAbove, currRun);
00480          else
00481             maxRunBelow = Max(maxRunBelow, currRun);
00482          //set flag to run of opposite kind
00483          aboveRun = !aboveRun;
00484          //restart at current residual
00485          currRun = residue * residue;
00486       }
00487       //BCLog::OutDebug(Form("maxRunBelow = %g", maxRunBelow));
00488       //BCLog::OutDebug(Form("maxRunAbove = %g", maxRunAbove));
00489       //BCLog::OutDebug(Form("currRun = %g", currRun));
00490 
00491    }
00492 
00493    //BCLog::OutDebug(Form("maxRunBelow = %g", maxRunBelow));
00494    //BCLog::OutDebug(Form("maxRunAbove = %g", maxRunAbove));
00495    //BCLog::OutDebug(Form("currRun = %g", currRun));
00496 
00497    //check last run
00498    if (aboveRun)
00499       maxRunAbove = Max(maxRunAbove, currRun);
00500    else
00501       maxRunBelow = Max(maxRunBelow, currRun);
00502 
00503    //BCLog::OutDebug(Form("maxRunBelow = %g", maxRunBelow));
00504    //BCLog::OutDebug(Form("maxRunAbove = %g", maxRunAbove));
00505 
00506    //save the longest runs
00507    runs.at(0) = maxRunBelow;
00508    runs.at(1) = maxRunAbove;
00509 
00510    return runs;
00511 }
00512 // ---------------------------------------------------------
00513 double longestRunFrequency(unsigned longestObserved, unsigned int nTrials)
00514 {
00515    // can't observe run that's longer than the whole sequence
00516    if (longestObserved >= nTrials)
00517       return 0.;
00518 
00519    // return value
00520    double prob = 0.;
00521 
00522    // short cuts
00523    typedef unsigned int uint;
00524    uint Lobs = longestObserved;
00525    uint n = nTrials;
00526 
00527    // the result of the inner loop is the cond. P given r successes
00528    double conditionalProb;
00529 
00530    // first method: use the gamma function for the factorials: bit slower and more inaccurate
00531    // in fact may return NaN for n >= 1000
00532 
00533    //   double Gup, Gdown1, Gdown2, Gdown3;
00534    //
00535    //   for (uint r = 0; r <= n; r++) {
00536    //      conditionalProb = 0.0;
00537    //      for (uint i = 1; ( i <= n-r+1) && (i <= uint(r / double(Lobs + 1)) ); i++) {
00538    //
00539    //         Gup =  TMath::Gamma(1 - i * (Lobs + 1) + n);
00540    //         Gdown1 = TMath::Gamma(1 + i);
00541    //         Gdown2 = TMath::Gamma(2 - i + n - r);
00542    //         Gdown3 = TMath::Gamma(1 - i * (Lobs + 1) + r);
00543    //
00544    //         //consider the sign of contribution
00545    //         Gup = i%2 ? Gup : - Gup;
00546    //
00547    //         conditionalProb += Gup/(Gdown1 * Gdown2 * Gdown3);
00548    //
00550    //
00555    //      }
00556    //      prob += (1 + n - r)*conditionalProb;
00558    //   }
00559 
00560    // alternative using log factorial approximations, is faster and more accurate
00561 
00562    double tempLog = 0;
00563    for (uint r = 0; r <= n; r++) {
00564       conditionalProb = 0.0;
00565 
00566       for (uint i = 1; (i <= n - r + 1) && (i <= uint(r / double(Lobs + 1))); i++) {
00567          tempLog = ApproxLogFact(n - i * (Lobs + 1)) - ApproxLogFact(i)
00568                - ApproxLogFact(n - r - i + 1)
00569                - ApproxLogFact(r - i * (Lobs + 1));
00570          if (i % 2)
00571             conditionalProb += exp(tempLog);
00572          else
00573             conditionalProb -= exp(tempLog);
00574          //         printf("tempLog inside = %.2f",prob);
00575       }
00576       //      printf("tempLog outside = %.2f",prob);
00577       prob += (1 + n - r) * conditionalProb;
00578    }
00579 
00580    // Bernoulli probability of each permutation
00581    prob *= pow(2., -double(n));
00582 
00583    return prob;
00584 }
00585 
00586 double Rvalue(const std::vector<double> & chain_means, const std::vector<double> & chain_variances,
00587               const unsigned & chain_length, const bool & strict)  throw (std::invalid_argument, std::domain_error)
00588 {
00589     if (chain_means.size() != chain_variances.size())
00590         throw std::invalid_argument("BCMath::RValue: chain means and chain variances are not aligned!");
00591 
00592     const double n = chain_length;
00593     const double m = chain_means.size();
00594 
00595     if (m <= 1)
00596         throw std::invalid_argument("BCMath:RValue: Need at least two chains to compute R-value!");
00597 
00598     // init
00599     double variance_of_means = 0.0;
00600     double mean_of_means = 0.0;
00601     double mean_of_variances = 0.0;
00602     double variance_of_variances = 0.0;
00603 
00604     // use Welford's method here as well with temporary variables
00605     double previous_mean_of_means = 0;
00606     double previous_mean_of_variances = 0;
00607     for (unsigned i = 0 ; i < m ; ++i)
00608     {
00609         if (0 == i)
00610         {
00611             mean_of_means = chain_means.front();
00612             variance_of_means = 0;
00613             mean_of_variances = chain_variances.front();
00614             variance_of_variances = 0;
00615 
00616             continue;
00617         }
00618 
00619         // temporarily store previous mean of step (i-1)
00620         previous_mean_of_means = mean_of_means;
00621         previous_mean_of_variances = mean_of_variances;
00622 
00623         // update step
00624         mean_of_means += (chain_means[i] - previous_mean_of_means) / (i + 1.0);
00625         variance_of_means += (chain_means[i] - previous_mean_of_means) * (chain_means[i] - mean_of_means);
00626         mean_of_variances += (chain_variances[i] - previous_mean_of_variances) / (i + 1.0);
00627         variance_of_variances += (chain_variances[i] - previous_mean_of_variances) * (chain_variances[i] - mean_of_variances);
00628     }
00629 
00630     variance_of_means /= m - 1.0;
00631     variance_of_variances /= m - 1.0;
00632 
00633     //use Gelman/Rubin notation
00634     double B = variance_of_means * n;
00635     double W = mean_of_variances;
00636     double sigma_squared = (n - 1.0) / n * W + B / n;
00637 
00638     // avoid NaN due to divide by zero
00639     if (0.0 == W)
00640     {
00641         BCLog::OutDebug("BCMath::Rvalue: W = 0. Avoiding R = NaN.");
00642         return std::numeric_limits<double>::max();
00643     }
00644 
00645     // the lax implementation stops here
00646     if (!strict)
00647         return sqrt(sigma_squared / W);
00648 
00649     //estimated scale reduction
00650     double R = 0.0;
00651 
00652     // compute covariances using the means from above
00653     double covariance_22 = 0.0; // cov(s_i^2, \bar{x_i}^2
00654     double covariance_21 = 0.0; // cov(s_i^2, \bar{x_i}
00655 
00656     for (unsigned i = 0 ; i < m ; ++i)
00657     {
00658         covariance_21 += (chain_variances[i] - mean_of_variances) * (chain_means.at(i) - mean_of_means);
00659         covariance_22 += (chain_variances[i] - mean_of_variances) * (chain_means[i] * chain_means[i] - mean_of_means * mean_of_means);
00660     }
00661 
00662     covariance_21 /= m - 1.0;
00663     covariance_22 /= m - 1.0;
00664 
00665     // scale of t-distribution
00666     double V = sigma_squared + B / (m * n);
00667 
00668     // estimation of scale variance
00669     double a = (n - 1.0) * (n - 1.0) / (n * n * m) * variance_of_variances;
00670     double b = (m + 1) * (m + 1) / (m * n * m * n) * 2.0 / (m - 1) * B * B;
00671     double c = 2.0 * (m + 1.0) * (n - 1.0) / (m * n * n) * n / m * (covariance_22 - 2.0 * mean_of_means * covariance_21);
00672     double variance_of_V = a + b + c;
00673 
00674     // degrees of freedom of t-distribution
00675     double df = 2.0 * V * V / variance_of_V;
00676 
00677     if (df <= 2)
00678     {
00679         BCLog::OutDebug(Form("BCMath::Rvalue: DoF (%g) below 2. Avoiding R = NaN.", df));
00680         return std::numeric_limits<double>::max();;
00681     }
00682 
00683     // sqrt of estimated scale reduction if sampling were continued
00684     R = sqrt(V / W * df / (df - 2.0));
00685 
00686     // R smaller, but close to 1 is OK.
00687     if (R < 0.99 && n > 100)
00688         throw std::domain_error(Form("BCMath::Rvalue: %g < 0.99. Check for a bug in the implementation!", R));
00689 
00690     return R;
00691 }
00692 } // end of namespace BCMath