// @(#)root/splot:$Id$
// Author: Muriel Pivk, Anna Kreshuk 10/2005
/**********************************************************************
* *
* Copyright (c) 2005 ROOT Foundation, CERN/PH-SFT *
* *
**********************************************************************/
#include "TSPlot.h"
#include "TVirtualFitter.h"
#include "TH1.h"
#include "TTreePlayer.h"
#include "TTreeFormula.h"
#include "TTreeFormulaManager.h"
#include "TSelectorDraw.h"
#include "TBrowser.h"
#include "TClass.h"
#include "TMath.h"
extern void Yields(Int_t &, Double_t *, Double_t &f, Double_t *x, Int_t iflag);
ClassImp(TSPlot)
//____________________________________________________________________
//Begin_Html
Overview
A common method used in High Energy Physics to perform measurements is
the maximum Likelihood method, exploiting discriminating variables to
disentangle signal from background. The crucial point for such an
analysis to be reliable is to use an exhaustive list of sources of
events combined with an accurate description of all the Probability
Density Functions (PDF).
To assess the validity of the fit, a convincing quality check
is to explore further the data sample by examining the distributions of
control variables. A control variable can be obtained for instance by
removing one of the discriminating variables before performing again
the maximum Likelihood fit: this removed variable is a control
variable. The expected distribution of this control variable, for
signal, is to be compared to the one extracted, for signal, from the
data sample. In order to be able to do so, one must be able to unfold
from the distribution of the whole data sample.
The TSPlot method allows to reconstruct the distributions for
the control variable, independently for each of the various sources of
events, without making use of any a priori knowledge on this
variable. The aim is thus to use the knowledge available for the
discriminating variables to infer the behaviour of the individual
sources of events with respect to the control variable.
TSPlot is optimal if the control variable is uncorrelated with the discriminating variables.
A detail description of the formalism itself, called
, is given in [1].
The method
The
technique is developped in the above context of a maximum Likelihood method making use of discriminating variables.
One considers a data sample in which are merged several species
of events. These species represent various signal components and
background components which all together account for the data sample.
The different terms of the log-Likelihood are:
- : the total number of events in the data sample,
-
: the number of species of events populating the data sample,
- : the number of events expected on the average for the species,
-
: the value of the PDFs of the discriminating variables for the species and for event ,
- : the set of control variables which, by definition, do not appear in the expression of the Likelihood function .
The extended log-Likelihood reads:
|
(1) |
From this expression, after maximization of with respect to the parameters, a weight can be computed for every event and each species, in order to obtain later the true distribution
of variable . If is one of the
species present in the data sample, the weight for this species is defined by:
|
(2) |
where
is the covariance matrix resulting from the Likelihood maximization.
This matrix can be used directly from the fit, but this is numerically
less accurate than the direct computation:
|
(3) |
The distribution of the control variable obtained by histogramming the weighted events reproduces, on average, the true distribution
.
The class TSPlot allows to reconstruct the true distribution
of a control variable for each of the
species from the sole knowledge of the PDFs of the discriminating variables . The plots obtained thanks to the TSPlot class are called
.
Some properties and checks
Beside reproducing the true distribution,
bear remarkable properties:
Different steps followed by TSPlot
- A maximum Likelihood fit is performed to obtain the yields of the various species.
The fit relies on discriminating variables uncorrelated with a control variable :
the later is therefore totally absent from the fit.
- The weights are calculated using Eq. (2) where the covariance matrix is taken from Minuit.
- Histograms of are filled by weighting the events with .
- Error bars per bin are given by Eq. (6).
The
reproduce the true distributions of the species in the control variable , within the above defined statistical uncertainties.
Illustrations
To illustrate the technique, one considers an example derived from the analysis where
have been first used (charmless B decays). One is dealing with a data
sample in which two species are present: the first is termed signal and
the second background. A maximum Likelihood fit is performed to obtain
the two yields and . The fit relies on two discriminating variables collectively denoted which are chosen within three possible variables denoted , and .
The variable which is not incorporated in is used as the control variable . The six distributions of the three variables are assumed to be the ones depicted in Fig. 1.
Figure 1:
Distributions of the three discriminating variables available to perform the Likelihood fit:
, , .
Among the three variables, two are used to perform the fit while one is
kept out of the fit to serve the purpose of a control variable. The
three distributions on the top (resp. bottom) of the figure correspond
to the signal (resp. background). The unit of the vertical axis is
chosen such that it indicates the number of entries per bin, if one
slices the histograms in 25 bins.
|
A data sample being built through a Monte Carlo simulation based on the distributions shown in Fig. 1, one obtains the three distributions of Fig. 2. Whereas the distribution of clearly indicates the presence of the signal, the distribution of and are less obviously populated by signal.
Figure 2:
Distributions of the three discriminating variables for signal plus
background. The three distributions are the ones obtained from a data
sample obtained through a Monte Carlo simulation based on the
distributions shown in Fig. 1. The data sample consists of 500 signal events and 5000 background events.
|
Chosing and as discriminating variables to determine and through a maximum Likelihood fit, one builds, for the control variable which is unknown to the fit, the two
for signal and background shown in Fig. 3. One observes that the
for signal reproduces correctly the PDF even where the latter vanishes,
although the error bars remain sizeable. This results from the almost
complete cancellation between positive and negative weights: the sum of
weights is close to zero while the sum of weights squared is not. The
occurence of negative weights occurs through the appearance of the
covariance matrix, and its negative components, in the definition of
Eq. (2).
A word of caution is in order with respect to the error bars. Whereas
their sum in quadrature is identical to the statistical uncertainties
of the yields determined by the fit, and if, in addition, they are
asymptotically correct, the error bars should be handled with care for
low statistics and/or for too fine binning. This is because the error
bars do not incorporate two known properties of the PDFs: PDFs are
positive definite and can be non-zero in a given x-bin, even if in the
particular data sample at hand, no event is observed in this bin. The
latter limitation is not specific to
,
rather it is always present when one is willing to infer the PDF at the
origin of an histogram, when, for some bins, the number of entries does
not guaranty the applicability of the Gaussian regime. In such
situations, a satisfactory practice is to attach allowed ranges to the
histogram to indicate the upper and lower limits of the PDF value which
are consistent with the actual observation, at a given confidence
level.
Figure 3:
The
(signal on the left, background on the right) obtained for are represented as dots with error bars. They are obtained from a fit using only information from and .
|
Chosing and as discriminating variables to determine and through a maximum Likelihood fit, one builds, for the control variable which is unknown to the fit, the two
for signal and background shown in Fig. 4. In the
for signal one observes that error bars are the largest in the regions where the background is the largest.
Figure 4:
The
(signal on the left, background on the right) obtained for are represented as dots with error bars. They are obtained from a fit using only information from and .
|
The results above can be obtained by running the tutorial TestSPlot.C
End_Html
//____________________________________________________________________
TSPlot::TSPlot() :
fTree(0),
fTreename(0),
fVarexp(0),
fSelection(0)
{
// default constructor (used by I/O only)
fNx = 0;
fNy=0;
fNevents = 0;
fNSpecies=0;
fNumbersOfEvents=0;
}
//____________________________________________________________________
TSPlot::TSPlot(Int_t nx, Int_t ny, Int_t ne, Int_t ns, TTree *tree) :
fTreename(0),
fVarexp(0),
fSelection(0)
{
//normal TSPlot constructor
// nx : number of control variables
// ny : number of discriminating variables
// ne : total number of events
// ns : number of species
// tree: input data
fNx = nx;
fNy=ny;
fNevents = ne;
fNSpecies=ns;
fXvar.ResizeTo(fNevents, fNx);
fYvar.ResizeTo(fNevents, fNy);
fYpdf.ResizeTo(fNevents, fNSpecies*fNy);
fSWeights.ResizeTo(fNevents, fNSpecies*(fNy+1));
fTree = tree;
fNumbersOfEvents = 0;
}
//____________________________________________________________________
TSPlot::~TSPlot()
{
// destructor
if (fNumbersOfEvents)
delete [] fNumbersOfEvents;
if (!fXvarHists.IsEmpty())
fXvarHists.Delete();
if (!fYvarHists.IsEmpty())
fYvarHists.Delete();
if (!fYpdfHists.IsEmpty())
fYpdfHists.Delete();
}
//____________________________________________________________________
void TSPlot::Browse(TBrowser *b)
{
//To browse the histograms
if (!fSWeightsHists.IsEmpty()) {
TIter next(&fSWeightsHists);
TH1D* h = 0;
while ((h = (TH1D*)next()))
b->Add(h,h->GetName());
}
if (!fYpdfHists.IsEmpty()) {
TIter next(&fYpdfHists);
TH1D* h = 0;
while ((h = (TH1D*)next()))
b->Add(h,h->GetName());
}
if (!fYvarHists.IsEmpty()) {
TIter next(&fYvarHists);
TH1D* h = 0;
while ((h = (TH1D*)next()))
b->Add(h,h->GetName());
}
if (!fXvarHists.IsEmpty()) {
TIter next(&fXvarHists);
TH1D* h = 0;
while ((h = (TH1D*)next()))
b->Add(h,h->GetName());
}
b->Add(&fSWeights, "sWeights");
}
//____________________________________________________________________
void TSPlot::SetInitialNumbersOfSpecies(Int_t *numbers)
{
//Set the initial number of events of each species - used
//as initial estimates in minuit
if (!fNumbersOfEvents)
fNumbersOfEvents = new Double_t[fNSpecies];
for (Int_t i=0; iIsA()->GetName(), s);
if (strdiff!=0)
delete TVirtualFitter::GetFitter();
}
TVirtualFitter *minuit = TVirtualFitter::Fitter(0, 2);
fPdfTot.ResizeTo(fNevents, fNSpecies);
//now let's do it, excluding different yvars
//for iplot = -1 none is excluded
for (Int_t iplot=-1; iplotClear();
minuit->SetFCN(Yields);
Double_t arglist[10];
//set the print level
if (opt.Contains("Q")||opt.Contains("V")){
arglist[0]=-1;
}
if (opt.Contains("W"))
arglist[0]=0;
minuit->ExecuteCommand("SET PRINT", arglist, 1);
minuit->SetObjectFit(&fPdfTot); //a tricky way to get fPdfTot matrix to fcn
for (ispecies=0; ispeciesSetParameter(ispecies, "", fNumbersOfEvents[ispecies], 1, 0, 0);
minuit->ExecuteCommand("MIGRAD", arglist, 0);
for (ispecies=0; ispeciesGetParameter(ispecies);
if (!opt.Contains("Q"))
printf("estimated #of events in species %d = %f\n", ispecies, fNumbersOfEvents[ispecies]);
}
if (!opt.Contains("Q"))
printf("\n");
Double_t *covmat = minuit->GetCovarianceMatrix();
SPlots(covmat, iplot);
if (opt.Contains("W")){
Double_t *sumweight = new Double_t[fNSpecies];
for (i=0; iGetNbinsX()!=nbins)
fXvarHists.Delete();
else
return;
}
//make the histograms
char name[10];
for (i=0; iFill(fXvar(j, i));
fXvarHists.Add(h);
}
}
//____________________________________________________________________
TObjArray* TSPlot::GetXvarHists()
{
//Returns the array of histograms of x variables (not weighted)
//If histograms have not already
//been filled, they are filled with default binning 100.
Int_t nbins = 100;
if (fXvarHists.IsEmpty())
FillXvarHists(nbins);
else if (((TH1D*)fXvarHists.First())->GetNbinsX()!=nbins)
FillXvarHists(nbins);
return &fXvarHists;
}
//____________________________________________________________________
TH1D *TSPlot::GetXvarHist(Int_t ixvar)
{
//Returns the histogram of variable #ixvar
//If histograms have not already
//been filled, they are filled with default binning 100.
Int_t nbins = 100;
if (fXvarHists.IsEmpty())
FillXvarHists(nbins);
else if (((TH1D*)fXvarHists.First())->GetNbinsX()!=nbins)
FillXvarHists(nbins);
return (TH1D*)fXvarHists.UncheckedAt(ixvar);
}
//____________________________________________________________________
void TSPlot::FillYvarHists(Int_t nbins)
{
//Fill the histograms of y variables
Int_t i, j;
if (!fYvarHists.IsEmpty()){
if (((TH1D*)fYvarHists.First())->GetNbinsX()!=nbins)
fYvarHists.Delete();
else
return;
}
//make the histograms
char name[10];
for (i=0; iFill(fYvar(j, i));
fYvarHists.Add(h);
}
}
//____________________________________________________________________
TObjArray* TSPlot::GetYvarHists()
{
//Returns the array of histograms of y variables. If histograms have not already
//been filled, they are filled with default binning 100.
Int_t nbins = 100;
if (fYvarHists.IsEmpty())
FillYvarHists(nbins);
else if (((TH1D*)fYvarHists.First())->GetNbinsX()!=nbins)
FillYvarHists(nbins);
return &fYvarHists;
}
//____________________________________________________________________
TH1D *TSPlot::GetYvarHist(Int_t iyvar)
{
//Returns the histogram of variable iyvar.If histograms have not already
//been filled, they are filled with default binning 100.
Int_t nbins = 100;
if (fYvarHists.IsEmpty())
FillYvarHists(nbins);
else if (((TH1D*)fYvarHists.First())->GetNbinsX()!=nbins)
FillYvarHists(nbins);
return (TH1D*)fYvarHists.UncheckedAt(iyvar);
}
//____________________________________________________________________
void TSPlot::FillYpdfHists(Int_t nbins)
{
//Fills the histograms of pdf-s of y variables with binning nbins
Int_t i, j, ispecies;
if (!fYpdfHists.IsEmpty()){
if (((TH1D*)fYpdfHists.First())->GetNbinsX()!=nbins)
fYpdfHists.Delete();
else
return;
}
char name[30];
for (ispecies=0; ispeciesFill(fYpdf(j, ispecies*fNy+i));
fYpdfHists.Add(h);
}
}
}
//____________________________________________________________________
TObjArray* TSPlot::GetYpdfHists()
{
//Returns the array of histograms of pdf's of y variables with binning nbins
//If histograms have not already
//been filled, they are filled with default binning 100.
Int_t nbins = 100;
if (fYpdfHists.IsEmpty())
FillYpdfHists(nbins);
return &fYpdfHists;
}
//____________________________________________________________________
TH1D *TSPlot::GetYpdfHist(Int_t iyvar, Int_t ispecies)
{
//Returns the histogram of the pdf of variable #iyvar for species #ispecies, binning nbins
//If histograms have not already
//been filled, they are filled with default binning 100.
Int_t nbins = 100;
if (fYpdfHists.IsEmpty())
FillYpdfHists(nbins);
return (TH1D*)fYpdfHists.UncheckedAt(fNy*ispecies+iyvar);
}
//____________________________________________________________________
void TSPlot::FillSWeightsHists(Int_t nbins)
{
//The order of histograms in the array:
//x0_species0, x0_species1,..., x1_species0, x1_species1,..., y0_species0, y0_species1,...
//If the histograms have already been filled with a different binning, they are refilled
//and all histograms are deleted
if (fSWeights.GetNoElements()==0){
Error("GetSWeightsHists", "SWeights were not computed");
return;
}
if (!fSWeightsHists.IsEmpty()){
if (((TH1D*)fSWeightsHists.First())->GetNbinsX()!=nbins)
fSWeightsHists.Delete();
else
return;
}
char name[30];
//Fill histograms of x-variables weighted with sWeights
for (Int_t ivar=0; ivarSumw2();
for (Int_t ievent=0; ieventFill(fXvar(ievent, ivar), fSWeights(ievent, ispecies));
fSWeightsHists.AddLast(h);
}
}
//Fill histograms of y-variables (exluded from the fit), weighted with sWeights
for (Int_t iexcl=0; iexclSumw2();
for (Int_t ievent=0; ieventFill(fYvar(ievent, iexcl), fSWeights(ievent, fNSpecies*(iexcl+1)+ispecies));
fSWeightsHists.AddLast(h);
}
}
}
//____________________________________________________________________
TObjArray *TSPlot::GetSWeightsHists()
{
//Returns an array of all histograms of variables, weighted with sWeights
//If histograms have not been already filled, they are filled with default binning 50
//The order of histograms in the array:
//x0_species0, x0_species1,..., x1_species0, x1_species1,..., y0_species0, y0_species1,...
Int_t nbins = 50; //default binning
if (fSWeightsHists.IsEmpty())
FillSWeightsHists(nbins);
return &fSWeightsHists;
}
//____________________________________________________________________
void TSPlot::RefillHist(Int_t type, Int_t nvar, Int_t nbins, Double_t min, Double_t max, Int_t nspecies)
{
//The Fill...Hist() methods fill the histograms with the real limits on the variables
//This method allows to refill the specified histogram with user-set boundaries min and max
//Parameters:
//type = 1 - histogram of x variable #nvar
// = 2 - histogram of y variable #nvar
// = 3 - histogram of y_pdf for y #nvar and species #nspecies
// = 4 - histogram of x variable #nvar, species #nspecies, WITH sWeights
// = 5 - histogram of y variable #nvar, species #nspecies, WITH sWeights
if (type<1 || type>5){
Error("RefillHist", "type must lie between 1 and 5");
return;
}
char name[20];
Int_t j;
TH1D *hremove;
if (type==1){
hremove = (TH1D*)fXvarHists.RemoveAt(nvar);
delete hremove;
snprintf(name,20,"x%d",nvar);
TH1D *h = new TH1D(name, name, nbins, min, max);
for (j=0; jFill(fXvar(j, nvar));
fXvarHists.AddAt(h, nvar);
}
if (type==2){
hremove = (TH1D*)fYvarHists.RemoveAt(nvar);
delete hremove;
snprintf(name,20, "y%d", nvar);
TH1D *h = new TH1D(name, name, nbins, min, max);
for (j=0; jFill(fYvar(j, nvar));
fXvarHists.AddAt(h, nvar);
}
if (type==3){
hremove = (TH1D*)fYpdfHists.RemoveAt(nspecies*fNy+nvar);
delete hremove;
snprintf(name,20, "pdf_species%d_y%d", nspecies, nvar);
TH1D *h=new TH1D(name, name, nbins, min, max);
for (j=0; jFill(fYpdf(j, nspecies*fNy+nvar));
fYpdfHists.AddAt(h, nspecies*fNy+nvar);
}
if (type==4){
hremove = (TH1D*)fSWeightsHists.RemoveAt(fNSpecies*nvar+nspecies);
delete hremove;
snprintf(name,20, "x%d_species%d", nvar, nspecies);
TH1D *h = new TH1D(name, name, nbins, min, max);
h->Sumw2();
for (Int_t ievent=0; ieventFill(fXvar(ievent, nvar), fSWeights(ievent, nspecies));
fSWeightsHists.AddAt(h, fNSpecies*nvar+nspecies);
}
if (type==5){
hremove = (TH1D*)fSWeightsHists.RemoveAt(fNx*fNSpecies + fNSpecies*nvar+nspecies);
delete hremove;
snprintf(name,20, "y%d_species%d", nvar, nspecies);
TH1D *h = new TH1D(name, name, nbins, min, max);
h->Sumw2();
for (Int_t ievent=0; ieventFill(fYvar(ievent, nvar), fSWeights(ievent, nspecies));
fSWeightsHists.AddAt(h, fNx*fNSpecies + fNSpecies*nvar+nspecies);
}
}
//____________________________________________________________________
TH1D *TSPlot::GetSWeightsHist(Int_t ixvar, Int_t ispecies,Int_t iyexcl)
{
//Returns the histogram of a variable, weithed with sWeights
//If histograms have not been already filled, they are filled with default binning 50
//If parameter ixvar!=-1, the histogram of x-variable #ixvar is returned for species ispecies
//If parameter ixvar==-1, the histogram of y-variable #iyexcl is returned for species ispecies
//If the histogram has already been filled and the binning is different from the parameter nbins
//all histograms with old binning will be deleted and refilled.
Int_t nbins = 50; //default binning
if (fSWeightsHists.IsEmpty())
FillSWeightsHists(nbins);
if (ixvar==-1)
return (TH1D*)fSWeightsHists.UncheckedAt(fNx*fNSpecies + fNSpecies*iyexcl+ispecies);
else
return (TH1D*)fSWeightsHists.UncheckedAt(fNSpecies*ixvar + ispecies);
}
//____________________________________________________________________
void TSPlot::SetTree(TTree *tree)
{
// Set the input Tree
fTree = tree;
}
//____________________________________________________________________
void TSPlot::SetTreeSelection(const char* varexp, const char *selection, Long64_t firstentry)
{
//Specifies the variables from the tree to be used for splot
//
//Variables fNx, fNy, fNSpecies and fNEvents should already be set!
//
//In the 1st parameter it is assumed that first fNx variables are x(control variables),
//then fNy y variables (discriminating variables),
//then fNy*fNSpecies ypdf variables (probability distribution functions of dicriminating
//variables for different species). The order of pdfs should be: species0_y0, species0_y1,...
//species1_y0, species1_y1,...species[fNSpecies-1]_y0...
//The 2nd parameter allows to make a cut
//TTree::Draw method description contains more details on specifying expression and selection
TTreeFormula **var;
std::vector cnames;
TList *formulaList = new TList();
TSelectorDraw *selector = (TSelectorDraw*)(((TTreePlayer*)fTree->GetPlayer())->GetSelector());
Long64_t entry, entryNumber;
Int_t i,nch;
Int_t ncols;
TObjArray *leaves = fTree->GetListOfLeaves();
fTreename= new TString(fTree->GetName());
if (varexp)
fVarexp = new TString(varexp);
if (selection)
fSelection= new TString(selection);
nch = varexp ? strlen(varexp) : 0;
//*-*- Compile selection expression if there is one
TTreeFormula *select = 0;
if (selection && strlen(selection)) {
select = new TTreeFormula("Selection",selection,fTree);
if (!select) return;
if (!select->GetNdim()) { delete select; return; }
formulaList->Add(select);
}
//*-*- if varexp is empty, take first nx + ny + ny*nspecies columns by default
if (nch == 0) {
ncols = fNx + fNy + fNy*fNSpecies;
for (i=0;iAt(i)->GetName() );
}
//*-*- otherwise select only the specified columns
} else {
ncols = selector->SplitNames(varexp,cnames);
}
var = new TTreeFormula* [ncols];
Double_t *xvars = new Double_t[ncols];
fMinmax.ResizeTo(2, ncols);
for (i=0; iAdd(var[i]);
}
//*-*- Create a TreeFormulaManager to coordinate the formulas
TTreeFormulaManager *manager=0;
if (formulaList->LastIndex()>=0) {
manager = new TTreeFormulaManager;
for(i=0;i<=formulaList->LastIndex();i++) {
manager->Add((TTreeFormula*)formulaList->At(i));
}
manager->Sync();
}
//*-*- loop on all selected entries
// fSelectedRows = 0;
Int_t tnumber = -1;
Long64_t selectedrows=0;
for (entry=firstentry;entryGetEntryNumber(entry);
if (entryNumber < 0) break;
Long64_t localEntry = fTree->LoadTree(entryNumber);
if (localEntry < 0) break;
if (tnumber != fTree->GetTreeNumber()) {
tnumber = fTree->GetTreeNumber();
if (manager) manager->UpdateFormulaLeaves();
}
Int_t ndata = 1;
if (manager && manager->GetMultiplicity()) {
ndata = manager->GetNdata();
}
for(Int_t inst=0;instEvalInstance(inst) == 0) {
continue;
}
}
if (inst==0) loaded = kTRUE;
else if (!loaded) {
// EvalInstance(0) always needs to be called so that
// the proper branches are loaded.
for (i=0;iEvalInstance(0);
}
loaded = kTRUE;
}
for (i=0;iEvalInstance(inst);
}
// curentry = entry-firstentry;
//printf("event#%d\n", curentry);
//for (i=0; i fMinmax(1, i))
fMinmax(1, i)=fXvar(selectedrows, i);
}
for (i=0; i fMinmax(1, i+fNx))
fMinmax(1, i+fNx) = fYvar(selectedrows, i);
for (Int_t j=0; j fMinmax(1, j*fNy+i+fNx+fNy))
fMinmax(1, j*fNy+i+fNx+fNy) = fYpdf(selectedrows, j*fNy+i);
}
}
selectedrows++;
}
}
fNevents=selectedrows;
// for (i=0; iGetObjectFit();
Int_t nev = pdftot->GetNrows();
Int_t nes = pdftot->GetNcols();
f=0;
for (i=0; i