// @(#)root/physics:$Id$
// Author: Anna Kreshuk 08/10/2004

/*************************************************************************
 * Copyright (C) 1995-2004, Rene Brun and Fons Rademakers.               *
 * All rights reserved.                                                  *
 *                                                                       *
 * For the licensing terms see $ROOTSYS/LICENSE.                         *
 * For the list of contributors see $ROOTSYS/README/CREDITS.             *
 *************************************************************************/

//////////////////////////////////////////////////////////////////////////////
//
// TRobustEstimator
//
// Minimum Covariance Determinant Estimator - a Fast Algorithm
// invented by Peter J. Rousseeuw and Katrien Van Driessen
// "A Fast Algorithm for the Minimum Covariance Determinant Estimator"
// Technometrics, August 1999, Vol.41, No.3
//
// What are robust estimators?
// "An important property of an estimator is its robustness. An estimator
// is called robust if it is insensitive to measurements that deviate
// from the expected behaviour. There are 2 ways to treat such deviating
// measurements: one may either try to recognize them and then remove
// them from the data sample; or one may leave them in the sample, taking
// care that they do not influence the estimate unduly. In both cases robust
// estimators are needed...Robust procedures compensate for systematic errors
// as much as possible, and indicate any situation in which a danger of not
// being able to operate reliably is detected."
// R. Fruhwirth, M. Regler, R.K. Bock, H. Grote, D. Notz
// "Data Analysis Techniques for High-Energy Physics", 2nd edition
//
// What does this algorithm do?
// It computes a highly robust estimator of multivariate location and scatter.
// Then, it takes those estimates to compute robust distances of all the
// data vectors. Those with large robust distances are considered outliers.
// Robust distances can then be plotted for better visualization of the data.
//
// How does this algorithm do it?
// The MCD objective is to find h observations (out of n) whose classical
// covariance matrix has the lowest determinant. The MCD estimate of location
// is then the average of those h points, and the MCD estimate of scatter
// is their covariance matrix. The minimum (and default) h = (n+nvariables+1)/2,
// so the algorithm is effective when up to (n-nvariables-1)/2 of the
// observations are outliers.
// The algorithm also allows for exact fit situations - that is, when h or more
// observations lie on a hyperplane. Then the algorithm still yields the MCD
// location T and scatter matrix S, the latter being singular as it should be.
// From (T,S) the program then computes the equation of the hyperplane.
//
// How can this algorithm be used?
// In any case when contamination of the data is suspected, since such
// contamination might influence the classical estimates.
// Robust estimation of location and scatter is also a tool to robustify
// other multivariate techniques such as, for example, principal-component
// analysis and discriminant analysis.
//
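// A usage sketch for the multivariate case (illustrative only: the sample
// sizes and the way the rows are filled below are assumptions of this
// example, not requirements of the class):
//
//    const Int_t nvectors = 1000;           // number of observations (assumed)
//    const Int_t nvariables = 3;            // number of variables (assumed)
//    TRobustEstimator re(nvectors, nvariables);
//    Double_t row[3];
//    for (Int_t i = 0; i < nvectors; i++) {
//       // fill row[0..nvariables-1] with the i-th observation here
//       re.AddRow(row);
//    }
//    re.Evaluate();                         // run the fast MCD algorithm
//    TVectorD mean(nvariables);
//    TMatrixDSym cov(nvariables);
//    re.GetMean(mean);                      // robust estimate of location
//    re.GetCovariance(cov);                 // robust estimate of scatter
//    Int_t nout = re.GetNOut();             // number of flagged outliers
//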
// Technical details of the algorithm:
// 0. The default h = (n+nvariables+1)/2, but the user may choose any integer h
//    with (n+nvariables+1)/2 <= h <= n. The program then reports the MCD's
//    breakdown value (n-h+1)/n. If you are sure that the dataset contains less
//    than 25% contamination, which is usually the case, a good compromise
//    between breakdown value and efficiency is obtained by putting h=[.75*n].
// 1. If h=n, the MCD location estimate is the average of the whole dataset,
//    and the MCD scatter estimate is its covariance matrix. Report this and stop.
// 2. If nvariables=1 (univariate data), compute the MCD estimate by the exact
//    algorithm of Rousseeuw and Leroy (1987, pp.171-172) in O(nlogn) time and stop.
// 3. From here on, h<n and nvariables>=2.
//    3a. If n is small:
//        - repeat (say) 500 times:
//          -- construct an initial h-subset, starting from a random (nvar+1)-subset
//          -- carry out 2 C-steps (described in the comments of the CStep function)
//        - for the 10 results with lowest det(S):
//          -- carry out C-steps until convergence
//        - report the solution (T, S) with the lowest det(S)
//    3b. If n is larger (say, n>600), then
//        - construct up to 5 disjoint random subsets of size nsub (say, nsub=300)
//        - inside each subset, repeat 500/5 times:
//          -- construct an initial subset of size hsub=[nsub*h/n]
//          -- carry out 2 C-steps
//          -- keep the best 10 results (Tsub, Ssub)
//        - pool the subsets, yielding the merged set (say, of size nmerged=1500)
//        - in the merged set, repeat for each of the 50 solutions (Tsub, Ssub):
//          -- carry out 2 C-steps
//          -- keep the 10 best results
//        - in the full dataset, repeat for those best results:
//          -- take several C-steps, using n and h
//          -- report the best final result (T, S)
// 4. To obtain consistency when the data comes from a multivariate normal
//    distribution, the covariance matrix is multiplied by a correction factor.
// 5. Robust distances of all elements are calculated, using the final (T, S).
//    The very final mean and covariance estimates are then calculated only for
//    the values whose robust distances are less than a cutoff value (0.975
//    quantile of the chi2 distribution with nvariables degrees of freedom).
//
//////////////////////////////////////////////////////////////////////////////

#include "TRobustEstimator.h"
#include "TRandom.h"
#include "TMath.h"
#include "TH1D.h"
#include "TPaveLabel.h"
#include "TDecompChol.h"
#include "TMatrixDSymEigen.h"

ClassImp(TRobustEstimator)

const Double_t kChiMedian[50]= {
   0.454937, 1.38629, 2.36597, 3.35670, 4.35146, 5.34812, 6.34581, 7.34412,
   8.34283, 9.34182, 10.34, 11.34, 12.34, 13.34, 14.34, 15.34, 16.34, 17.34,
   18.34, 19.34, 20.34, 21.34, 22.34, 23.34, 24.34, 25.34, 26.34, 27.34,
   28.34, 29.34, 30.34, 31.34, 32.34, 33.34, 34.34, 35.34, 36.34, 37.34,
   38.34, 39.34, 40.34, 41.34, 42.34, 43.34, 44.34, 45.34, 46.34, 47.34,
   48.34, 49.33};

const Double_t kChiQuant[50]={
   5.02389, 7.3776, 9.34840, 11.1433, 12.8325, 14.4494, 16.0128, 17.5346,
   19.0228, 20.4831, 21.920, 23.337, 24.736, 26.119, 27.488, 28.845, 30.191,
   31.526, 32.852, 34.170, 35.479, 36.781, 38.076, 39.364, 40.646, 41.923,
   43.194, 44.461, 45.722, 46.979, 48.232, 49.481, 50.725, 51.966, 53.203,
   54.437, 55.668, 56.896, 58.120, 59.342, 60.561, 61.777, 62.990, 64.201,
   65.410, 66.617, 67.821, 69.022, 70.222, 71.420};

//_____________________________________________________________________________
TRobustEstimator::TRobustEstimator()
{
   //this constructor should be used in a univariate case:
   //first call this constructor, then - the EvaluateUni(..) function
}
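//Example of the univariate usage (a sketch; the sample array and the way it
//is filled are assumptions of this example):
//
//    Double_t sample[100];
//    //... fill sample with 100 measurements, possibly contaminated ...
//    TRobustEstimator re;
//    Double_t mean, sigma;
//    re.EvaluateUni(100, sample, mean, sigma);  //hh=0 gives the default h=(n+2)/2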
//______________________________________________________________________________
TRobustEstimator::TRobustEstimator(Int_t nvectors, Int_t nvariables, Int_t hh)
   :fMean(nvariables),
    fCovariance(nvariables),
    fInvcovariance(nvariables),
    fCorrelation(nvariables),
    fRd(nvectors),
    fSd(nvectors),
    fOut(1),
    fHyperplane(nvariables),
    fData(nvectors, nvariables)
{
   //constructor

   if ((nvectors<=1)||(nvariables<=0)){
      Error("TRobustEstimator","Not enough vectors or variables");
      return;
   }
   if (nvariables==1){
      Error("TRobustEstimator","For the univariate case, use the default constructor and EvaluateUni() function");
      return;
   }
   fN=nvectors;
   fNvar=nvariables;
   if (hh<(fN+fNvar+1)/2){
      if (hh>0)
         Warning("TRobustEstimator","chosen h is too small, default h is taken instead");
      fH=(fN+fNvar+1)/2;
   } else
      fH=hh;

   fVarTemp=0;
   fVecTemp=0;
   fExact=0;
}

//_____________________________________________________________________________
void TRobustEstimator::AddColumn(Double_t *col)
{
   //adds a column to the data matrix
   //it is assumed that the column has size fN
   //variable fVarTemp keeps the number of columns already added

   if (fVarTemp==fNvar) {
      fNvar++;
      fCovariance.ResizeTo(fNvar, fNvar);
      fInvcovariance.ResizeTo(fNvar, fNvar);
      fCorrelation.ResizeTo(fNvar, fNvar);
      fMean.ResizeTo(fNvar);
      fHyperplane.ResizeTo(fNvar);
      fData.ResizeTo(fN, fNvar);
   }
   for (Int_t i=0; i<fN; i++)
      fData(i, fVarTemp)=col[i];
   fVarTemp++;
}

//_____________________________________________________________________________
void TRobustEstimator::AddRow(Double_t *row)
{
   //adds a vector to the data matrix
   //it is assumed that the vector has size fNvar
   //variable fVecTemp keeps the number of vectors already added

   if (fVecTemp==fN) {
      fN++;
      fRd.ResizeTo(fN);
      fSd.ResizeTo(fN);
      fData.ResizeTo(fN, fNvar);
   }
   for (Int_t i=0; i<fNvar; i++)
      fData(fVecTemp, i)=row[i];
   fVecTemp++;
}

//_____________________________________________________________________________
void TRobustEstimator::Evaluate()
{
   //finds the multivariate estimates of location and scatter,
   //following the fast MCD algorithm described in the class comments

   Double_t kEps=1e-14;

   if (fH==fN){
      Warning("Evaluate","Chosen h = #observations, so classic estimates of location and scatter will be calculated");
      Classic();
      return;
   }

   Int_t i, j, k;
   Int_t ii, jj;
   Int_t nmini = 300;
   Int_t k1=500;
   Int_t nbest=10;
   TMatrixD sscp(fNvar+1, fNvar+1);

   Int_t *index=new Int_t[fN];
   Double_t *ndist=new Double_t[fN];
   Double_t det;
   Double_t *deti=new Double_t[nbest];
   for (i=0; i<nbest; i++)
      deti[i]=1e16;

   if (fN<nmini*2) {
      //n is small - work with the whole dataset:
      //construct k1 initial h-subsets, carry out 2 C-steps on each and
      //remember the nbest results with the lowest determinants
      TMatrixD mstock(nbest, fNvar);
      TMatrixD cstock(fNvar, fNvar*nbest);

      for (k=0; k<k1; k++) {
         CreateSubset(fN, fH, fNvar, index, fData, sscp, ndist);
         det=CStep(fN, fH, index, fData, sscp, ndist);
         if (det>kEps)
            det=CStep(fN, fH, index, fData, sscp, ndist);
         if (det<kEps) {
            //an exact fit situation was detected
            fExact=Exact(ndist);
            delete [] index;
            delete [] ndist;
            delete [] deti;
            return;
         }
         Int_t maxind=TMath::LocMax(nbest, deti);
         if (det<deti[maxind]) {
            deti[maxind]=det;
            for (ii=0; ii<fNvar; ii++) {
               mstock(maxind, ii)=fMean(ii);
               for (jj=0; jj<fNvar; jj++)
                  cstock(ii, maxind*fNvar+jj)=fCovariance(ii, jj);
            }
         }
      }

      //for the nbest best results, carry out C-steps until convergence
      for (i=0; i<nbest; i++) {
         for (ii=0; ii<fNvar; ii++) {
            fMean(ii)=mstock(i, ii);
            for (jj=0; jj<fNvar; jj++)
               fCovariance(ii, jj)=cstock(ii, i*fNvar+jj);
         }
         det=1;
         while (det>kEps) {
            det=CStep(fN, fH, index, fData, sscp, ndist);
            if (TMath::Abs(det-deti[i])<kEps)
               break;
            else
               deti[i]=det;
         }
         for (ii=0; ii<fNvar; ii++) {
            mstock(i, ii)=fMean(ii);
            for (jj=0; jj<fNvar; jj++)
               cstock(ii, i*fNvar+jj)=fCovariance(ii, jj);
         }
      }

      //choose the solution with the lowest determinant
      Int_t detind=TMath::LocMin(nbest, deti);
      for (ii=0; ii<fNvar; ii++) {
         fMean(ii)=mstock(detind, ii);
         for (jj=0; jj<fNvar; jj++)
            fCovariance(ii, jj)=cstock(ii, detind*fNvar+jj);
      }

      if (deti[detind]!=0) {
         //calculate the robust distances and flag the outliers
         Int_t nout=RDist(sscp);
         Double_t cutoff=kChiQuant[fNvar-1];
         fOut.Set(nout);
         j=0;
         for (i=0; i<fN; i++) {
            if (fRd[i]>cutoff) {
               fOut[j]=i;
               j++;
            }
         }
      } else {
         fExact=Exact(ndist);
      }
      delete [] index;
      delete [] ndist;
      delete [] deti;
      return;
   }

   /////////////////////////////////////////////////
   //if n>nmini, the dataset should be partitioned
   //partitioning
   ////////////////////////////////////////////////
   Int_t indsubdat[5];
   Int_t nsub;
   for (ii=0; ii<5; ii++)
      indsubdat[ii]=0;

   nsub = Partition(nmini, indsubdat);

   Int_t sum=0;
   for (ii=0; ii<5; ii++)
      sum+=indsubdat[ii];
   Int_t *subdat=new Int_t[sum];
   RDraw(subdat, nsub, indsubdat);
   //now the indexes of selected cases are in the array subdat
   //matrices to store the best means and covariances
   Int_t nbestsub=nbest*nsub;
   TMatrixD mstockbig(nbestsub, fNvar);
   TMatrixD cstockbig(fNvar, fNvar*nbestsub);
   TMatrixD hyperplane(nbestsub, fNvar);
   for (i=0; i<nbestsub; i++) {
      for (j=0; j<fNvar; j++)
         hyperplane(i, j)=0;
   }
   Double_t *detibig=new Double_t[nbestsub];
   Int_t maxind=TMath::LocMax(5, indsubdat);
   TMatrixD dattemp(indsubdat[maxind], fNvar);
   Int_t k2=Int_t(k1/nsub);

   //construct initial h-subsets and carry out 2 C-steps within each
   //subgroup, keeping the nbest best results of every subgroup
   for (Int_t kgroup=0; kgroup<nsub; kgroup++) {

      Int_t ntemp=indsubdat[kgroup];
      Int_t temp=0;
      for (i=0; i<kgroup; i++)
         temp+=indsubdat[i];
      for (i=0; i<ntemp; i++) {
         for (j=0; j<fNvar; j++)
            dattemp(i, j)=fData[subdat[temp+i]][j];
      }
      Int_t htemp=Int_t(fH*ntemp/fN);

      for (i=0; i<nbest; i++)
         deti[i]=1e16;

      for (k=0; k<k2; k++) {
         CreateSubset(ntemp, htemp, fNvar, index, dattemp, sscp, ndist);
         det=CStep(ntemp, htemp, index, dattemp, sscp, ndist);
         if (det>kEps)
            det=CStep(ntemp, htemp, index, dattemp, sscp, ndist);
         Int_t maxid=TMath::LocMax(nbest, deti);
         if (det<deti[maxid]) {
            deti[maxid]=det;
            for (ii=0; ii<fNvar; ii++) {
               mstockbig(nbest*kgroup+maxid, ii)=fMean(ii);
               for (jj=0; jj<fNvar; jj++)
                  cstockbig(ii, (nbest*kgroup+maxid)*fNvar+jj)=fCovariance(ii, jj);
            }
            if (det<kEps) {
               //an exact fit within the subgroup - remember the hyperplane
               //of this solution together with (T, S)
               TMatrixDSymEigen eig(fCovariance);
               TMatrixD eigenMatrix=eig.GetEigenVectors();
               for (ii=0; ii<fNvar; ii++)
                  hyperplane(nbest*kgroup+maxid, ii)=eigenMatrix(ii, fNvar-1);
            }
         }
      }
      for (i=0; i<nbest; i++)
         detibig[kgroup*nbest+i]=deti[i];
   }

   //pool the subgroups into the merged dataset and improve every stored
   //solution by 2 C-steps in the merged set
   TMatrixD datmerged(sum, fNvar);
   for (i=0; i<sum; i++) {
      for (j=0; j<fNvar; j++)
         datmerged(i, j)=fData[subdat[i]][j];
   }
   Int_t hmerged=Int_t(sum*fH/fN);

   Int_t nh;
   for (k=0; k<nbestsub; k++) {
      //restore the k-th stored solution
      for (ii=0; ii<fNvar; ii++) {
         fMean(ii)=mstockbig(k, ii);
         for (jj=0; jj<fNvar; jj++)
            fCovariance(ii, jj)=cstockbig(ii, k*fNvar+jj);
      }
      if (detibig[k]<kEps) {
         //this solution was an exact fit in its subgroup - start from
         //the points closest to its hyperplane
         for (i=0; i<fNvar; i++)
            fHyperplane(i)=hyperplane(k, i);
         CreateOrtSubset(datmerged, index, hmerged, sum, sscp, ndist);
      }
      det=CStep(sum, hmerged, index, datmerged, sscp, ndist);
      if (det<kEps) {
         nh=Exact(ndist);
         if (nh>=fH) {
            fExact = nh;
            delete [] detibig;
            delete [] deti;
            delete [] subdat;
            delete [] ndist;
            delete [] index;
            return;
         } else {
            CreateOrtSubset(datmerged, index, hmerged, sum, sscp, ndist);
         }
      }
      det=CStep(sum, hmerged, index, datmerged, sscp, ndist);
      if (det<kEps) {
         nh=Exact(ndist);
         if (nh>=fH) {
            fExact = nh;
            delete [] detibig;
            delete [] deti;
            delete [] subdat;
            delete [] ndist;
            delete [] index;
            return;
         }
      }
      detibig[k]=det;
      for (i=0; i<fNvar; i++) {
         mstockbig(k, i)=fMean(i);
         for (j=0; j<fNvar; j++)
            cstockbig(i, k*fNvar+j)=fCovariance(i, j);
      }
   }

   //in the full dataset, carry out C-steps for the best solution
   //until convergence
   Int_t minind=TMath::LocMin(nbestsub, detibig);
   for (i=0; i<fNvar; i++) {
      fMean(i)=mstockbig(minind, i);
      for (j=0; j<fNvar; j++)
         fCovariance(i, j)=cstockbig(i, minind*fNvar+j);
   }
   det=1;
   while (det>kEps) {
      det=CStep(fN, fH, index, fData, sscp, ndist);
      if (TMath::Abs(det-detibig[minind])<kEps)
         break;
      else
         detibig[minind]=det;
   }

   //calculate the robust distances and flag the outliers
   Int_t nout=RDist(sscp);
   Double_t cutoff=kChiQuant[fNvar-1];
   fOut.Set(nout);
   j=0;
   for (i=0; i<fN; i++) {
      if (fRd[i]>cutoff) {
         fOut[j]=i;
         j++;
      }
   }

   delete [] detibig;
   delete [] deti;
   delete [] subdat;
   delete [] ndist;
   delete [] index;
   return;
}
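//A sketch of inspecting the results after Evaluate() (the names "re",
//"nvectors" and "nvariables" are assumptions of this example):
//
//    re.Evaluate();
//    Int_t nout = re.GetNOut();          //number of flagged outliers
//    TVectorD rd(nvectors);
//    re.GetRDistances(rd);               //robust distances of all points
//    for (Int_t i = 0; i < nvectors; i++) {
//       if (rd(i) > re.GetChiQuant(nvariables-1)) {
//          //point i is an outlier: its robust distance exceeds the
//          //0.975 chi2 quantile with nvariables degrees of freedom
//       }
//    }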
//_____________________________________________________________________________
void TRobustEstimator::EvaluateUni(Int_t nvectors, Double_t *data, Double_t &mean, Double_t &sigma, Int_t hh)
{
   //for the univariate case
   //estimates of location and scatter are returned in the mean and sigma parameters
   //the algorithm works on the same principle as in the multivariate case -
   //it finds a subset of size hh with the smallest sigma, and then returns the
   //mean and sigma of this subset

   if (hh==0)
      hh=(nvectors+2)/2;
   Double_t faclts[]={2.6477, 2.5092, 2.3826, 2.2662, 2.1587, 2.0589, 1.9660, 1.879, 1.7973, 1.7203, 1.6473};
   Int_t *index=new Int_t[nvectors];
   TMath::Sort(nvectors, data, index, kFALSE);

   Int_t nquant;
   nquant=TMath::Min(Int_t(Double_t(((hh*1./nvectors)-0.5)*40))+1, 11);
   Double_t factor=faclts[nquant-1];

   Double_t *aw=new Double_t[nvectors];
   Double_t *aw2=new Double_t[nvectors];
   Double_t sq=0;
   Double_t sqmin=0;
   Int_t ndup=0;
   Int_t len=nvectors-hh;
   Double_t *slutn=new Double_t[len];
   for (Int_t i=0; i<len; i++)
      slutn[i]=0;

   //on a sorted sample, the h-subset with the smallest variance is
   //contiguous; slide a window of size hh over the sorted data,
   //updating the sums incrementally
   for (Int_t jint=0; jint<len; jint++) {
      aw[jint]=0;
      for (Int_t j=0; j<hh; j++) {
         aw[jint]+=data[index[j+jint]];
         if (jint==0)
            sq+=data[index[j]]*data[index[j]];
      }
      aw2[jint]=aw[jint]*aw[jint]/hh;
      if (jint==0) {
         sq=sq-aw2[jint];
         sqmin=sq;
         slutn[ndup]=aw[jint];
      } else {
         sq=sq-data[index[jint-1]]*data[index[jint-1]]
             +data[index[jint+hh-1]]*data[index[jint+hh-1]]
             -aw2[jint]+aw2[jint-1];
         if (sq<sqmin) {
            //a new minimum of the sum of squares
            ndup=0;
            sqmin=sq;
            slutn[ndup]=aw[jint];
         } else if (sq==sqmin) {
            //a tie - remember this solution too
            ndup++;
            slutn[ndup]=aw[jint];
         }
      }
   }
   //in case of ties, take the middle of the tied solutions
   mean=slutn[Int_t(ndup/2)]/hh;
   sigma=factor*TMath::Sqrt(sqmin/hh);

   delete [] aw;
   delete [] aw2;
   delete [] slutn;
   delete [] index;
}

//_______________________________________________________________________
Double_t TRobustEstimator::GetChiQuant(Int_t i) const
{
   //returns the 0.975 quantile of the chi2 distribution
   //with i+1 degrees of freedom
   if (i < 0 || i >= 50)
      return 0;
   return kChiQuant[i];
}

//_______________________________________________________________________
void TRobustEstimator::GetCovariance(TMatrixDSym &matr)
{
   //returns the covariance matrix
   if (matr.GetNrows()!=fNvar || matr.GetNcols()!=fNvar){
      Warning("GetCovariance","provided matrix is of the wrong size, it will be resized");
      matr.ResizeTo(fNvar, fNvar);
   }
   matr=fCovariance;
}

//_______________________________________________________________________
void TRobustEstimator::GetCorrelation(TMatrixDSym &matr)
{
   //returns the correlation matrix
   if (matr.GetNrows()!=fNvar || matr.GetNcols()!=fNvar) {
      Warning("GetCorrelation","provided matrix is of the wrong size, it will be resized");
      matr.ResizeTo(fNvar, fNvar);
   }
   matr=fCorrelation;
}

//____________________________________________________________________
const TVectorD* TRobustEstimator::GetHyperplane() const
{
   //if the points are on a hyperplane, returns this hyperplane
   if (fExact==0) {
      Error("GetHyperplane","the data doesn't lie on a hyperplane!\n");
      return 0;
   } else {
      return &fHyperplane;
   }
}

//______________________________________________________________________
void TRobustEstimator::GetHyperplane(TVectorD &vec)
{
   //if the points are on a hyperplane, returns this hyperplane
   if (fExact==0){
      Error("GetHyperplane","the data doesn't lie on a hyperplane!\n");
      return;
   }
   if (vec.GetNoElements()!=fNvar) {
      Warning("GetHyperplane","provided vector is of the wrong size, it will be resized");
      vec.ResizeTo(fNvar);
   }
   vec=fHyperplane;
}

//________________________________________________________________________
void TRobustEstimator::GetMean(TVectorD &means)
{
   //return the estimate of the mean
   if (means.GetNoElements()!=fNvar) {
      Warning("GetMean","provided vector is of the wrong size, it will be resized");
      means.ResizeTo(fNvar);
   }
   means=fMean;
}

//_________________________________________________________________________
void TRobustEstimator::GetRDistances(TVectorD &rdist)
{
   //returns the robust distances (helps to find outliers)
   if (rdist.GetNoElements()!=fN) {
      Warning("GetRDistances","provided vector is of the wrong size, it will be resized");
      rdist.ResizeTo(fN);
   }
   rdist=fRd;
}

//__________________________________________________________________________
Int_t TRobustEstimator::GetNOut()
{
   //returns the number of outliers
   return fOut.GetSize();
}

//_________________________________________________________________________
void TRobustEstimator::AddToSscp(TMatrixD &sscp, TVectorD &vec)
{
   //update the sscp matrix with vector vec:
   //row and column 0 accumulate the sums of the variables,
   //the rest of the matrix accumulates the cross products
   Int_t i, j;
   for (j=1; j<fNvar+1; j++) {
      sscp(0, j)+=vec(j-1);
      sscp(j, 0)=sscp(0, j);
      for (i=1; i<=j; i++) {
         sscp(i, j)+=vec(i-1)*vec(j-1);
         sscp(j, i)=sscp(i, j);
      }
   }
}

//_________________________________________________________________________
void TRobustEstimator::ClearSscp(TMatrixD &sscp)
{
   //clear the sscp matrix before the next accumulation
   for (Int_t i=0; i<fNvar+1; i++) {
      for (Int_t j=0; j<fNvar+1; j++)
         sscp(i, j)=0;
   }
}

//_________________________________________________________________________
void TRobustEstimator::Classic()
{
   //called when h=n: calculates the classical mean and
   //covariance matrix of the whole dataset
   TMatrixD sscp(fNvar+1, fNvar+1);
   TVectorD temp(fNvar);
   TVectorD sd(fNvar);
   ClearSscp(sscp);
   for (Int_t i=0; i<fN; i++) {
      for (Int_t j=0; j<fNvar; j++)
         temp(j)=fData(i, j);
      AddToSscp(sscp, temp);
   }
   Covar(sscp, fMean, fCovariance, sd, fN);
   Correl();
}

//_________________________________________________________________________
void TRobustEstimator::Covar(TMatrixD &sscp, TVectorD &m, TMatrixDSym &cov, TVectorD &sd, Int_t nvec)
{
   //calculates the mean vector, the covariance matrix and the standard
   //deviations from the accumulated sscp matrix

   Int_t i, j;
   Double_t f;
   for (i=0; i<fNvar; i++) {
      m(i)=sscp(0, i+1);
      sd[i]=sscp(i+1, i+1);
      f=(sd[i]-m(i)*m(i)/nvec)/(nvec-1);
      if (f>1e-14)
         sd[i]=TMath::Sqrt(f);
      else
         sd[i]=0;
      m(i)/=nvec;
   }
   for (i=0; i<fNvar; i++) {
      for (j=0; j<fNvar; j++)
         cov(i, j)=(sscp(i+1, j+1)-nvec*m(i)*m(j))/(nvec-1);
   }
}

//_________________________________________________________________________
void TRobustEstimator::Correl()
{
   //transforms the covariance matrix into the correlation matrix

   Int_t i, j;
   Double_t *sd=new Double_t[fNvar];
   for (j=0; j<fNvar; j++)
      sd[j]=1./TMath::Sqrt(fCovariance(j, j));
   for (i=0; i<fNvar; i++) {
      for (j=0; j<fNvar; j++) {
         if (i==j)
            fCorrelation(i, j)=1.;
         else
            fCorrelation(i, j)=fCovariance(i, j)*sd[i]*sd[j];
      }
   }
   delete [] sd;
}

//____________________________________________________________________________
void TRobustEstimator::CreateSubset(Int_t ntotal, Int_t htotal, Int_t p, Int_t *index, TMatrixD &data, TMatrixD &sscp, Double_t *ndist)
{
   //creates an initial h-subset: first, p+1 elements are drawn at random
   //(without repetitions); if their covariance matrix is singular, more
   //elements are added one by one, until the covariance matrix becomes
   //regular or all ntotal elements are taken. Then the distances of all
   //ntotal elements are calculated, using this covariance matrix, and
   //the indices of the htotal points with the smallest distances are
   //returned in index

   Double_t kEps=1e-14;
   Bool_t repeat=kFALSE;
   Int_t nindex=0;
   Int_t num;
   Int_t i, j;

   for (i=0; i<p+1; i++) {
      num=Int_t(gRandom->Uniform(0, 1)*(ntotal-1));
      if (i>0){
         for (j=0; j<=i-1; j++) {
            if (index[j]==num)
               repeat=kTRUE;
         }
      }
      if (repeat==kTRUE) {
         i--;
         repeat=kFALSE;
      } else {
         index[i]=num;
         nindex++;
      }
   }

   ClearSscp(sscp);
   TVectorD vec(fNvar);
   Double_t det;
   for (i=0; i<nindex; i++) {
      for (j=0; j<fNvar; j++)
         vec(j)=data[index[i]][j];
      AddToSscp(sscp, vec);
   }
   TVectorD sd(fNvar);
   Covar(sscp, fMean, fCovariance, sd, nindex);
   det=fCovariance.Determinant();

   while ((det<kEps)&&(nindex<ntotal)) {
      //the covariance matrix is singular: add another randomly drawn
      //element, not yet in the subset, and try again
      do {
         num=Int_t(gRandom->Uniform(0, 1)*(ntotal-1));
         repeat=kFALSE;
         for (i=0; i<nindex; i++) {
            if (index[i]==num) {
               repeat=kTRUE;
               break;
            }
         }
      } while (repeat==kTRUE);
      index[nindex]=num;
      nindex++;
      for (j=0; j<fNvar; j++)
         vec(j)=data[num][j];
      AddToSscp(sscp, vec);
      Covar(sscp, fMean, fCovariance, sd, nindex);
      det=fCovariance.Determinant();
   }

   //now calculate the distances of all ntotal points and keep
   //the indices of the htotal closest ones in index
   TDecompChol chol(fCovariance);
   fInvcovariance = chol.Invert();
   TVectorD temp(fNvar);
   for (j=0; j<ntotal; j++) {
      ndist[j]=0;
      for (i=0; i<fNvar; i++)
         temp[i]=data[j][i]-fMean(i);
      for (i=0; i<fNvar; i++) {
         ndist[j]+=fInvcovariance(i, i)*temp[i]*temp[i];
         for (Int_t k=0; k<i; k++)
            ndist[j]+=2*fInvcovariance(i, k)*temp[i]*temp[k];
      }
   }
   KOrdStat(ntotal, ndist, htotal-1, index);
}

//____________________________________________________________________________
void TRobustEstimator::CreateOrtSubset(TMatrixD &dat, Int_t *index, Int_t hmerged, Int_t nmerged, TMatrixD &sscp, Double_t *ndist)
{
   //creates a subset of hmerged observations with the smallest orthogonal
   //distances to the hyperplane stored in fHyperplane, and recomputes the
   //mean and the covariance matrix from this subset

   Int_t i, j;
   TVectorD vec(fNvar);
   for (i=0; i<nmerged; i++) {
      ndist[i]=0;
      for (j=0; j<fNvar; j++)
         ndist[i]+=fHyperplane(j)*(dat(i, j)-fMean(j));
      ndist[i]=TMath::Abs(ndist[i]);
   }
   KOrdStat(nmerged, ndist, hmerged-1, index);
   //after the call, index[0..hmerged-1] hold the numbers of the
   //hmerged closest observations
   ClearSscp(sscp);
   for (i=0; i<hmerged; i++) {
      for (j=0; j<fNvar; j++)
         vec(j)=dat[index[i]][j];
      AddToSscp(sscp, vec);
   }
   TVectorD sd(fNvar);
   Covar(sscp, fMean, fCovariance, sd, hmerged);
}

//____________________________________________________________________________
Double_t TRobustEstimator::CStep(Int_t ntotal, Int_t htotal, Int_t *index, TMatrixD &data, TMatrixD &sscp, Double_t *ndist)
{
   //a C-step of Rousseeuw and Van Driessen:
   //from the current estimates (T, S) of the mean and the covariance matrix,
   //calculate the relative distances of all ntotal points, take the htotal
   //points with the smallest distances, and recompute (T, S) from them.
   //By the theorem of Rousseeuw and Van Driessen, the determinant of the
   //new covariance matrix is not larger than the determinant of the old
   //one. The new determinant is returned.

   Int_t i, j, k;

   TVectorD vec(fNvar);
   TVectorD temp(fNvar);

   TDecompChol chol(fCovariance);
   fInvcovariance = chol.Invert();

   for (j=0; j<ntotal; j++) {
      ndist[j]=0;
      for (i=0; i<fNvar; i++)
         temp[i]=data[j][i]-fMean(i);
      for (i=0; i<fNvar; i++) {
         ndist[j]+=fInvcovariance(i, i)*temp[i]*temp[i];
         for (k=0; k<i; k++)
            ndist[j]+=2*fInvcovariance(i, k)*temp[i]*temp[k];
      }
   }

   //take the htotal points with the smallest distances
   KOrdStat(ntotal, ndist, htotal-1, index);

   //recompute the estimates from the new h-subset
   ClearSscp(sscp);
   for (i=0; i<htotal; i++) {
      for (j=0; j<fNvar; j++)
         vec(j)=data[index[i]][j];
      AddToSscp(sscp, vec);
   }
   TVectorD sd(fNvar);
   Covar(sscp, fMean, fCovariance, sd, htotal);

   return fCovariance.Determinant();
}

//____________________________________________________________________________
Int_t TRobustEstimator::Exact(Double_t *ndist)
{
   //called in an exact fit situation, when h or more points lie on a
   //hyperplane: computes the equation of this hyperplane (the coefficients
   //are stored in fHyperplane) and returns the number of points lying on it

   Int_t i, j;

   //the hyperplane is orthogonal to the eigenvector of the singular
   //covariance matrix with the smallest eigenvalue
   TMatrixDSymEigen eigen(fCovariance);
   TMatrixD eigenMatrix=eigen.GetEigenVectors();
   for (j=0; j<fNvar; j++)
      fHyperplane(j)=eigenMatrix(j, fNvar-1);

   //count the points lying on the hyperplane
   Int_t nh=0;
   for (i=0; i<fN; i++) {
      ndist[i]=0;
      for (j=0; j<fNvar; j++)
         ndist[i]+=fHyperplane(j)*(fData(i, j)-fMean(j));
      if (TMath::Abs(ndist[i])<1e-14)
         nh++;
   }
   return nh;
}

//____________________________________________________________________________
Int_t TRobustEstimator::Partition(Int_t nmini, Int_t *indsubdat)
{
   //divides the dataset into at most 5 subgroups of approximately nmini
   //elements each; the number of elements of each subgroup is stored in
   //indsubdat, the number of subgroups is returned

   Int_t nsub;
   if ((fN>=2*nmini) && (fN<=(3*nmini-1))) {
      if (fN%2==1){
         indsubdat[0]=Int_t(fN*0.5);
         indsubdat[1]=Int_t(fN*0.5)+1;
      } else
         indsubdat[0]=indsubdat[1]=Int_t(fN/2);
      nsub=2;
   } else {
      if ((fN>=3*nmini) && (fN<(4*nmini-1))) {
         if (fN%3==0){
            indsubdat[0]=indsubdat[1]=indsubdat[2]=Int_t(fN/3);
         } else {
            indsubdat[0]=Int_t(fN/3);
            indsubdat[1]=Int_t(fN/3)+1;
            if (fN%3==1)
               indsubdat[2]=Int_t(fN/3);
            else
               indsubdat[2]=Int_t(fN/3)+1;
         }
         nsub=3;
      } else {
         if ((fN>=4*nmini)&&(fN<=(5*nmini-1))) {
            if (fN%4==0)
               indsubdat[0]=indsubdat[1]=indsubdat[2]=indsubdat[3]=Int_t(fN/4);
            else {
               indsubdat[0]=Int_t(fN/4);
               indsubdat[1]=Int_t(fN/4)+1;
               if (fN%4==1)
                  indsubdat[2]=indsubdat[3]=Int_t(fN/4);
               if (fN%4==2) {
                  indsubdat[2]=Int_t(fN/4)+1;
                  indsubdat[3]=Int_t(fN/4);
               }
               if (fN%4==3)
                  indsubdat[2]=indsubdat[3]=Int_t(fN/4)+1;
            }
            nsub=4;
         } else {
            for (Int_t i=0; i<5; i++)
               indsubdat[i]=nmini;
            nsub=5;
         }
      }
   }
   return nsub;
}

//___________________________________________________________________________
Int_t TRobustEstimator::RDist(TMatrixD &sscp)
{
   //Calculates robust distances. Then the samples with robust distances
   //greater than a cutoff value (0.975 quantile of the chi2 distribution with
   //fNvar degrees of freedom, multiplied by a correction factor) are given
   //weight=0, and new, reweighted estimates of location and scatter are
   //calculated. The function returns the number of outliers.

   Int_t i, j;
   Int_t nout=0;

   TVectorD temp(fNvar);
   TDecompChol chol(fCovariance);
   fInvcovariance = chol.Invert();

   for (i=0; i<fN; i++) {
      fRd[i]=0;
      for (j=0; j<fNvar; j++)
         temp[j]=fData(i, j)-fMean(j);
      for (j=0; j<fNvar; j++) {
         fRd[i]+=fInvcovariance(j, j)*temp[j]*temp[j];
         for (Int_t k=0; k<j; k++)
            fRd[i]+=2*fInvcovariance(j, k)*temp[j]*temp[k];
      }
   }

   //the consistency correction: scale the distances so that their median
   //equals the median of the chi2 distribution with fNvar degrees of freedom
   Double_t med=TMath::Median(fN, fRd.GetMatrixArray());
   med/=kChiMedian[fNvar-1];
   fCovariance*=med;
   for (i=0; i<fN; i++)
      fRd[i]/=med;

   //reweighting: samples with robust distances above the cutoff get
   //weight=0, location and scatter are recalculated from the rest
   Double_t cutoff=kChiQuant[fNvar-1];
   ClearSscp(sscp);
   TVectorD vec(fNvar);
   Int_t nvec=0;
   for (i=0; i<fN; i++) {
      if (fRd[i]<=cutoff) {
         for (j=0; j<fNvar; j++)
            vec(j)=fData(i, j);
         AddToSscp(sscp, vec);
         nvec++;
      } else {
         nout++;
      }
   }
   TVectorD sd(fNvar);
   Covar(sscp, fMean, fCovariance, sd, nvec);
   Correl();

   return nout;
}

//____________________________________________________________________________
void TRobustEstimator::RDraw(Int_t *subdat, Int_t ngroup, Int_t *indsubdat)
{
   //draws ngroup nonoverlapping subdatasets out of the dataset of size fN,
   //such that the selected case numbers are uniformly distributed from 1 to fN

   Int_t jndex = 0;
   Int_t nrand;
   Int_t i, k, m, j;
   for (k=1; k<=ngroup; k++) {
      for (m=1; m<=indsubdat[k-1]; m++) {
         nrand = Int_t(gRandom->Uniform(0, 1) * (fN-jndex))+1;
         jndex++;
         if (jndex==1) {
            subdat[0]=nrand;
         } else {
            subdat[jndex-1]=nrand+jndex-2;
            for (i=1; i<=jndex-1; i++) {
               if (subdat[i-1] > nrand+i-2) {
                  for (j=jndex; j>=i+1; j--) {
                     subdat[j-1]=subdat[j-2];
                  }
                  subdat[i-1]=nrand+i-2;
                  break;  //breaking the loop for(i=1...
               }
            }
         }
      }
   }
}

//_____________________________________________________________________________
Double_t TRobustEstimator::KOrdStat(Int_t ntotal, Double_t *a, Int_t k, Int_t *work)
{
   //returns the k-th order statistic of the array a of size ntotal.
   //This is a copy of TMath::KOrdStat, modified to use an Int_t work array:
   //on return, work[0..k] hold the indices of the k+1 smallest elements

   Bool_t isAllocated = kFALSE;
   const Int_t kWorkMax=100;
   Int_t i, ir, j, l, mid;
   Int_t arr;
   Int_t *ind;
   Int_t workLocal[kWorkMax];
   Int_t temp;

   if (work) {
      ind = work;
   } else {
      ind = workLocal;
      if (ntotal > kWorkMax) {
         isAllocated = kTRUE;
         ind = new Int_t[ntotal];
      }
   }

   for (Int_t ii=0; ii<ntotal; ii++)
      ind[ii]=ii;
   Int_t rk = k;
   l=0;
   ir = ntotal-1;
   for (;;) {
      if (ir<=l+1) { //active partition contains 1 or 2 elements
         if (ir == l+1 && a[ind[ir]]<a[ind[l]])
            {temp = ind[l]; ind[l]=ind[ir]; ind[ir]=temp;}
         Double_t tmp = a[ind[rk]];
         if (isAllocated)
            delete [] ind;
         return tmp;
      } else {
         mid = (l+ir) >> 1;     //choose median of left, center and right
         {temp = ind[mid]; ind[mid]=ind[l+1]; ind[l+1]=temp;}//elements as partitioning element arr.
         if (a[ind[l]]>a[ind[ir]])  //also rearrange so that a[l]<=a[l+1]
            {temp = ind[l]; ind[l]=ind[ir]; ind[ir]=temp;}
         if (a[ind[l+1]]>a[ind[ir]])
            {temp=ind[l+1]; ind[l+1]=ind[ir]; ind[ir]=temp;}
         if (a[ind[l]]>a[ind[l+1]])
            {temp = ind[l]; ind[l]=ind[l+1]; ind[l+1]=temp;}
         i=l+1;        //initialize pointers for partitioning
         j=ir;
         arr = ind[l+1];
         for (;;) {
            do i++; while (a[ind[i]]<a[arr]);
            do j--; while (a[ind[j]]>a[arr]);
            if (j<i) break;  //pointers crossed, partitioning complete
            {temp=ind[i]; ind[i]=ind[j]; ind[j]=temp;}
         }
         ind[l+1]=ind[j];
         ind[j]=arr;
         if (j>=rk) ir = j-1;  //keep active the partition that
         if (j<=rk) l=i;       //contains the k_th element
      }
   }
}