REST-for-Physics  v2.3
Rare Event Searches ToolKit for Physics
TRestComponentDataSet.cxx
1 /*************************************************************************
2  * This file is part of the REST software framework. *
3  * *
4  * Copyright (C) 2016 GIFNA/TREX (University of Zaragoza) *
5  * For more information see https://gifna.unizar.es/trex *
6  * *
7  * REST is free software: you can redistribute it and/or modify *
8  * it under the terms of the GNU General Public License as published by *
9  * the Free Software Foundation, either version 3 of the License, or *
10  * (at your option) any later version. *
11  * *
12  * REST is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU General Public License for more details. *
16  * *
17  * You should have a copy of the GNU General Public License along with *
18  * REST in $REST_PATH/LICENSE. *
19  * If not, see https://www.gnu.org/licenses/. *
20  * For the list of contributors see $REST_PATH/CREDITS. *
21  *************************************************************************/
22 
85 #include "TRestComponentDataSet.h"
86 
87 #include <TKey.h>
88 
89 #include <numeric>
90 
91 ClassImp(TRestComponentDataSet);
92 
97 
102 
117 TRestComponentDataSet::TRestComponentDataSet(const char* cfgFileName, const std::string& name)
118  : TRestComponent(cfgFileName) {
120 
122 }
123 
130 
131  SetSectionName(this->ClassName());
132 
133  LoadDataSets();
134 }
135 
141 
142  if (!fDataSetFileNames.empty()) {
143  RESTMetadata << " " << RESTendl;
144  RESTMetadata << " == Dataset filenames ==" << RESTendl;
145 
146  for (const auto& x : fDataSetFileNames) RESTMetadata << "- " << x << RESTendl;
147 
148  RESTMetadata << " " << RESTendl;
149  }
150 
151  if (fDFRange.X() != 0 || fDFRange.Y() != 0) {
152  RESTMetadata << " DataFrame range: ( " << fDFRange.X() << ", " << fDFRange.Y() << ")" << RESTendl;
153  RESTMetadata << " " << RESTendl;
154  }
155 
156  if (!fParameter.empty() && fParameterizationNodes.empty()) {
157  RESTMetadata << "This component has no nodes!" << RESTendl;
158  RESTMetadata << " Use: LoadDataSets() to initialize the nodes" << RESTendl;
159  }
160 
161  if (!fWeights.empty()) {
162  RESTMetadata << " " << RESTendl;
163  RESTMetadata << " == Weights ==" << RESTendl;
164 
165  for (const auto& x : fWeights) RESTMetadata << "- " << x << RESTendl;
166 
167  RESTMetadata << " " << RESTendl;
168  }
169 
170  RESTMetadata << " Use : PrintStatistics() to check node statistics" << RESTendl;
171  RESTMetadata << "----" << RESTendl;
172 }
173 
178  if (fNSimPerNode.empty() && IsDataSetLoaded()) fNSimPerNode = ExtractNodeStatistics();
179 
180  if (!HasNodes() && !IsDataSetLoaded()) {
181  RESTWarning << "TRestComponentDataSet::PrintStatistics. Empty nodes and no dataset loaded!"
182  << RESTendl;
183  RESTWarning << "Invoking TRestComponentDataSet::Initialize() might solve the problem" << RESTendl;
184  return;
185  }
186 
187  auto result = std::accumulate(fNSimPerNode.begin(), fNSimPerNode.end(), 0);
188  RESTInfo << "Total counts : " << result << RESTendl;
189  std::cout << std::endl;
190 
191  RESTInfo << " Parameter node statistics (" << fParameter << ")" << RESTendl;
192  int n = 0;
193  for (const auto& p : fParameterizationNodes) {
194  RESTInfo << " - Value : " << p << " Counts: " << fNSimPerNode[n] << RESTendl;
195  n++;
196  }
197 }
198 
204 
205  auto ele = GetElement("dataset");
206  while (ele != nullptr) {
207  fDataSetFileNames.push_back(GetParameter("filename", ele, ""));
208  ele = GetNextElement(ele);
209  }
210 
211  if (!fDataSetFileNames.empty()) Initialize();
212 }
213 
221  if (!fNodeDensity.empty()) return;
222 
223  if (fNbins.size() == 0) {
224  RESTError
225  << "TRestComponentDataSet::FillHistograms. Trying to fill histograms but no variables found!"
226  << RESTendl;
227  return;
228  }
229 
231 
232  if (!IsDataSetLoaded()) {
233  RESTError << "TRestComponentDataSet::FillHistograms. Dataset has not been initialized!" << RESTendl;
234  return;
235  }
236 
237  if (fParameterizationNodes.empty()) {
238  RESTWarning << "Nodes have not been defined" << RESTendl;
239  RESTWarning << "The full dataset will be used to generate the density distribution" << RESTendl;
240  fParameterizationNodes.push_back(-137);
241  }
242 
243  RESTInfo << "Generating N-dim histograms" << RESTendl;
244  int nIndex = 0;
245  for (const auto& node : fParameterizationNodes) {
246  Int_t from = 0;
247  Int_t to = 0;
248  if (fSamples > 0 && fTotalSamples[nIndex] - fSamples > 0) {
249  from = fRandom->Integer(fTotalSamples[nIndex] - fSamples);
250  to = from + fSamples;
251  fNSimPerNode[nIndex] = fSamples;
252  }
253 
254  ROOT::RDF::RNode df = ROOT::RDataFrame(0);
257  if (fParameterizationNodes.size() == 1 && node == -137) {
258  RESTInfo << "Creating component with no parameters (full dataset used)" << RESTendl;
259  df = fDataSet.GetDataFrame().Range(from, to);
260  fParameterizationNodes.clear();
261  } else {
262  RESTInfo << "Creating THnD for parameter " << fParameter << ": " << DoubleToString(node)
263  << RESTendl;
264  Double_t pUp = node * (1 + fPrecision / 2);
265  Double_t pDown = node * (1 - fPrecision / 2);
266  std::string filter = fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " +
267  DoubleToString(pDown);
268  df = fDataSet.GetDataFrame().Filter(filter).Range(from, to);
269  }
270 
271  Int_t* bins = new Int_t[fNbins.size()];
272  Double_t* xmin = new Double_t[fNbins.size()];
273  Double_t* xmax = new Double_t[fNbins.size()];
274 
275  for (size_t n = 0; n < fNbins.size(); n++) {
276  bins[n] = fNbins[n];
277  xmin[n] = fRanges[n].X();
278  xmax[n] = fRanges[n].Y();
279  }
280 
281  TString hName = fParameter + "_" + DoubleToString(node);
282  if (fParameterizationNodes.empty()) hName = "full";
283 
284  std::vector<std::string> varsAndWeight = fVariables;
285 
286  if (!fWeights.empty()) {
287  std::string weightsStr = "";
288  for (size_t n = 0; n < fWeights.size(); n++) {
289  if (n > 0) weightsStr += "*";
290 
291  weightsStr += fWeights[n];
292  }
293  df = df.Define("componentWeight", weightsStr);
294  varsAndWeight.push_back("componentWeight");
295  }
296 
297  auto hn = df.HistoND({hName, hName, (int)fNbins.size(), bins, xmin, xmax}, varsAndWeight);
298  THnD* hNd = new THnD(*hn);
299  hNd->Scale(1. / fNSimPerNode[nIndex]);
300 
301  fNodeDensity.push_back(hNd);
302  fActiveNode = nIndex;
303  nIndex++;
304  }
305 }
306 
314  if (fActiveNode >= 0 && fNodeDensity[fActiveNode]) {
315  delete fNodeDensity[fActiveNode];
316  } else {
317  RESTError << "TRestComponentDataSet::RegenerateActiveNode. Active node undefined!" << RESTendl;
318  return;
319  }
320 
321  Int_t from = 0;
322  Int_t to = 0;
323  if (fSamples > 0 && fTotalSamples[fActiveNode] - fSamples > 0) {
324  from = fRandom->Integer(fTotalSamples[fActiveNode] - fSamples);
325  to = from + fSamples;
327  }
328 
329  Double_t node = GetActiveNodeValue();
330  RESTInfo << "Creating THnD for parameter " << fParameter << ": " << DoubleToString(node) << RESTendl;
331 
332  ROOT::RDF::RNode df = ROOT::RDataFrame(0);
333  Double_t pUp = node * (1 + fPrecision / 2);
334  Double_t pDown = node * (1 - fPrecision / 2);
335  std::string filter =
336  fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " + DoubleToString(pDown);
337  df = fDataSet.GetDataFrame().Filter(filter).Range(from, to);
338 
339  Int_t* bins = new Int_t[fNbins.size()];
340  Double_t* xmin = new Double_t[fNbins.size()];
341  Double_t* xmax = new Double_t[fNbins.size()];
342 
343  for (size_t n = 0; n < fNbins.size(); n++) {
344  bins[n] = fNbins[n];
345  xmin[n] = fRanges[n].X();
346  xmax[n] = fRanges[n].Y();
347  }
348 
349  TString hName = fParameter + "_" + DoubleToString(node);
350  if (fParameterizationNodes.empty()) hName = "full";
351 
352  std::vector<std::string> varsAndWeight = fVariables;
353 
354  if (!fWeights.empty()) {
355  std::string weightsStr = "";
356  for (size_t n = 0; n < fWeights.size(); n++) {
357  if (n > 0) weightsStr += "*";
358 
359  weightsStr += fWeights[n];
360  }
361  df = df.Define("componentWeight", weightsStr);
362  varsAndWeight.push_back("componentWeight");
363  }
364 
365  auto hn = df.HistoND({hName, hName, (int)fNbins.size(), bins, xmin, xmax}, varsAndWeight);
366  THnD* hNd = new THnD(*hn);
367  hNd->Scale(1. / fNSimPerNode[fActiveNode]);
368 
369  fNodeDensity[fActiveNode] = hNd;
370 }
371 
380  if (!fParameterizationNodes.empty()) return fParameterizationNodes;
381 
382  RESTInfo << "Extracting parameterization nodes" << RESTendl;
383 
384  std::vector<double> vs;
385  if (!IsDataSetLoaded()) {
386  RESTError << "TRestComponentDataSet::ExtractParameterizationNodes. Dataset has not been initialized!"
387  << RESTendl;
388  return vs;
389  }
390 
391  auto GetUniqueElements = [](const std::vector<double>& vec) {
392  std::set<double> uniqueSet(vec.begin(), vec.end());
393  return std::vector<double>(uniqueSet.begin(), uniqueSet.end());
394  };
395 
396  for (size_t n = 0; n < 1 + fDataSet.GetEntries() / fSplitEntries; n++) {
397  auto nEn = fDataSet.Range(n * fSplitEntries, (n + 1) * fSplitEntries).Count();
398  auto parValues = fDataSet.Range(n * fSplitEntries, (n + 1) * fSplitEntries).Take<double>(fParameter);
399  std::vector<double> uniqueVec = GetUniqueElements(*parValues);
400  vs.insert(vs.end(), uniqueVec.begin(), uniqueVec.end());
401  }
402 
403  return vs;
404 }
405 
417  if (!fNSimPerNode.empty()) return fNSimPerNode;
418 
419  fTotalSamples.clear();
420 
421  std::vector<Int_t> stats;
422  if (!IsDataSetLoaded()) {
423  RESTError << "TRestComponentDataSet::ExtractNodeStatistics. Dataset has not been initialized!"
424  << RESTendl;
425  return stats;
426  }
427 
428  RESTInfo << "Counting statistics for each node ..." << RESTendl;
429  RESTInfo << "Number of nodes : " << fParameterizationNodes.size() << RESTendl;
430  for (const auto& p : fParameterizationNodes) {
431  Double_t pUp = p * (1 + fPrecision / 2);
432  Double_t pDown = p * (1 - fPrecision / 2);
433  std::string filter =
434  fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " + DoubleToString(pDown);
435  RESTInfo << "Counting stats for : " << fParameter << " = " << p << RESTendl;
436  auto nEv = fDataSet.GetDataFrame().Filter(filter).Count();
437  fTotalSamples.push_back(*nEv);
438  RESTInfo << "Total entries for " << fParameter << ":" << p << " = " << *nEv << RESTendl;
439  if (fSamples != 0) {
440  nEv = fDataSet.GetDataFrame().Filter(filter).Range(fSamples).Count();
441  }
442 
443  if ((Int_t)*nEv < fSamples) {
444  RESTWarning << "The number of requested samples (" << fSamples
445  << ") is higher than the number of dataset entries (" << *nEv << ")" << RESTendl;
446  }
447  RESTInfo << "Samples to be used for " << fParameter << ":" << p << " = " << *nEv << RESTendl;
448  stats.push_back(*nEv);
449  }
450  return stats;
451 }
452 
459  if (fDataSetFileNames.empty()) {
460  fDataSetLoaded = false;
461  return fDataSetLoaded;
462  }
463 
464  RESTInfo << "Loading datasets" << RESTendl;
465 
466  std::vector<std::string> fullFileNames;
467  for (const auto& name : fDataSetFileNames) {
468  // TODO we get here a list of files. However, we will need to weight each dataset with a factor
469  // to consider the contribution of each background component.
470  // Of course, we could previously take a factor into account already in the dataset, through the
471  // definition of a new column. But being this way would allow us to play around with the
472  // background model without having to regenerate the dataset.
473  std::string fileName = SearchFile(name);
474  if (fileName.empty()) {
475  RESTError << "TRestComponentDataSet::LoadDataSet. Error loading file : " << name << RESTendl;
476  RESTError << "Does the file exist?" << RESTendl;
477  RESTError << "You may use `<globals> <searchPath ...` to indicate the path location" << RESTendl;
478  return false;
479  }
480  fullFileNames.push_back(fileName);
481  }
482 
483  fDataSet.Import(fullFileNames);
484  fDataSetLoaded = true;
485 
486  if (fDFRange.X() != 0 || fDFRange.Y() != 0)
487  fDataSet.ApplyRange((size_t)fDFRange.X(), (size_t)fDFRange.Y());
488 
489  if (fDataSet.GetTree() == nullptr) {
490  RESTError << "Problem loading dataset from file list :" << RESTendl;
491  for (const auto& f : fDataSetFileNames) RESTError << " - " << f << RESTendl;
492  return false;
493  }
494 
496 
497  if (VariablesOk() && WeightsOk()) {
499  RESTInfo << "Filling histograms" << RESTendl;
500  FillHistograms();
501  return fDataSetLoaded;
502  }
503 
504  return fDataSetLoaded;
505 }
506 
511  Bool_t ok = true;
512  std::vector cNames = fDataSet.GetDataFrame().GetColumnNames();
513 
514  for (const auto& var : fVariables)
515  if (std::count(cNames.begin(), cNames.end(), var) == 0) {
516  RESTError << "Variable ---> " << var << " <--- NOT found on dataset" << RESTendl;
517  ok = false;
518  }
519  return ok;
520 }
521 
526  Bool_t ok = true;
527  std::vector cNames = fDataSet.GetDataFrame().GetColumnNames();
528 
529  for (const auto& var : fWeights) {
530  if (!isANumber(var) && std::count(cNames.begin(), cNames.end(), var) == 0) {
531  RESTError << "Weight ---> " << var << " <--- NOT found on dataset" << RESTendl;
532  ok = false;
533  }
534  }
535  return ok;
536 }
537 
543  if (!IsDataSetLoaded()) {
544  RESTWarning << "TRestComponentDataSet::ValidDataSet. Dataset has not been loaded" << RESTendl;
545  RESTWarning << "Try calling TRestComponentDataSet::Initialize()" << RESTendl;
546 
547  RESTInfo << "Trying to load datasets" << RESTendl;
548  LoadDataSets();
549  if (IsDataSetLoaded()) {
550  RESTInfo << "Sucess!" << RESTendl;
551  } else {
552  RESTError << "Failed loading datasets" << RESTendl;
553  return false;
554  }
555  }
556 
557  if (HasNodes() && fActiveNode == -1) {
558  RESTError << "TRestComponentDataSet::ValidDataSet. Active node has not been defined" << RESTendl;
559  return false;
560  }
561  return true;
562 }
It defines a background/signal model distribution in a given parameter space (tipically x,...
long long unsigned int fSplitEntries
It helps to split large datasets when extracting the parameterization nodes.
Bool_t ValidDataSet()
Takes care of initializing datasets if have not been initialized. On sucess it returns true.
TRestDataSet fDataSet
The dataset used to initialize the distribution.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestAxionSolarFlux.
Bool_t fDataSetLoaded
It is true of the dataset was loaded without issues.
TVector2 fDFRange
It creates a sample subset using a range definition.
void FillHistograms() override
It will produce a histogram with the distribution defined using the variables and the weights for eac...
std::vector< Int_t > fNSimPerNode
std::vector< std::string > fWeights
A list with the dataset columns used to weight the distribution density and define rate.
Bool_t LoadDataSets()
A method responsible to import a list of TRestDataSet into fDataSet and check that the variables and ...
Bool_t VariablesOk()
It returns true if all variables have been found inside TRestDataSet.
void PrintStatistics()
It prints out the statistics available for each parametric node.
TRestComponentDataSet()
Default constructor.
~TRestComponentDataSet()
Default destructor.
void RegenerateActiveNodeDensity() override
It will regenerate the density histogram for the active node. It is practical in the case when the nu...
std::vector< Int_t > fTotalSamples
It defines the total number of entries for each parameterization node (Initialized by the dataset)
std::vector< Double_t > ExtractParameterizationNodes()
It returns a vector with all the different values found on the dataset column for the user given para...
void Initialize() override
It will initialize the data frame with the filelist and column names (or observables) that have been ...
Bool_t WeightsOk()
It returns true if all weights have been found inside TRestDataSet.
std::vector< std::string > fDataSetFileNames
The filename of the dataset used.
std::vector< Int_t > ExtractNodeStatistics()
It returns a vector with the number of entries found for each parameterization node.
void InitFromConfigFile() override
It customizes the retrieval of XML data values of this class.
It defines a background/signal model distribution in a given parameter space (tipically x,...
void InitFromConfigFile() override
It customizes the retrieval of XML data values of this class.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestAxionSolarFlux.
Int_t fActiveNode
It is used to define the node that will be accessed for rate retrieval.
Int_t fSamples
It introduces a fixed number of samples (if 0 it will take all available samples)
std::string fParameter
It is used to parameterize a set of distribution densities (e.g. WIMP or axion mass)
Float_t fPrecision
A precision used to select the node value with a given range defined as a fraction of the value.
std::vector< Int_t > fNbins
The number of bins in which we should divide each variable.
void Initialize() override
It initializes the random number. We avoid to define the section name here since we will never define...
std::vector< TVector2 > fRanges
The range of each of the variables used to create the PDF distribution.
std::vector< Double_t > fParameterizationNodes
It defines the nodes of the parameterization (Initialized by the dataset)
Bool_t HasNodes()
It returns true if any nodes have been defined.
TRandom3 * fRandom
Internal process random generator.
std::vector< std::string > fVariables
A list with the branches that will be used to create the distribution space.
std::vector< THnD * > fNodeDensity
The generated N-dimensional variable space density for a given node.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
void Import(const std::string &fileName)
This function imports metadata from a root file it import metadata info from the previous dataSet whi...
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
ROOT::RDF::RNode GetDataFrame() const
Gives access to the RDataFrame.
Definition: TRestDataSet.h:129
TTree * GetTree() const
Gives access to the tree.
Definition: TRestDataSet.h:138
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
endl_t RESTendl
Termination flag object for TRestStringOutput.
TiXmlElement * GetElement(std::string eleDeclare, TiXmlElement *e=nullptr)
Get an xml element from a given parent element, according to its declaration.
Int_t LoadConfigFromFile(const std::string &configFilename, const std::string &sectionName="")
Give the file name, find out the corresponding section. Then call the main starter.
TRestStringOutput::REST_Verbose_Level GetVerboseLevel()
returns the verboselevel in type of REST_Verbose_Level enumerator
void SetSectionName(std::string sName)
set the section name, clear the section content
std::string SearchFile(std::string filename)
Search files in current directory and directories specified in "searchPath" section.
std::string fConfigFileName
Full name of the rml file.
TiXmlElement * GetNextElement(TiXmlElement *e)
Get the next sibling xml element of this element, with same eleDeclare.
std::string GetParameter(std::string parName, TiXmlElement *e, TString defaultValue=PARAMETER_NOT_FOUND_STR)
Returns the value for the parameter named parName in the given section.
@ REST_Info
+show most of the information for each steps
std::string DoubleToString(Double_t d, std::string format="%8.6e")
Gets a string from a double.
Int_t isANumber(std::string in)
Returns 1 only if a valid number is found in the string in. If not it returns 0.