REST-for-Physics  v2.3
Rare Event Searches ToolKit for Physics
TRestDataSet.h
1 /*************************************************************************
2  * This file is part of the REST software framework. *
3  * *
4  * Copyright (C) 2016 GIFNA/TREX (University of Zaragoza) *
5  * For more information see https://gifna.unizar.es/trex *
6  * *
7  * REST is free software: you can redistribute it and/or modify *
8  * it under the terms of the GNU General Public License as published by *
9  * the Free Software Foundation, either version 3 of the License, or *
10  * (at your option) any later version. *
11  * *
12  * REST is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU General Public License for more details. *
16  * *
17  * You should have a copy of the GNU General Public License along with *
18  * REST in $REST_PATH/LICENSE. *
19  * If not, see https://www.gnu.org/licenses/. *
20  * For the list of contributors see $REST_PATH/CREDITS. *
21  *************************************************************************/
22 
23 #ifndef REST_TRestDataSet
24 #define REST_TRestDataSet
25 
26 #include <TTimeStamp.h>
27 
28 #include <ROOT/RDataFrame.hxx>
29 
30 #include "TRestCut.h"
31 #include "TRestMetadata.h"
32 
/// It allows to group a number of runs that satisfy given metadata conditions.
///
/// The class keeps the file-selection filters, the list of observables, the
/// resulting ROOT::RDF::RNode (fDataFrame) and, when available, the generated
/// tree (fTree).
34 class TRestDataSet : public TRestMetadata {
35  public:
    /// NOTE(review): the members below are closed by `};` and fQuantity uses the
    /// type `RelevantQuantity`, so this looks like the body of a nested
    /// `struct RelevantQuantity` whose opening line is not visible in this
    /// listing -- confirm against the original header.

    /// The associated metadata member used to register the relevant quantity.
38  std::string metadata;
39 
    /// It determines how to produce the relevant quantity (accumulate/unique/last/max/min).
41  std::string strategy;
42 
    /// A user given description that can be used to define the relevant quantity.
44  std::string description;
45 
    /// The quantity value.
47  std::string value;
48  };
49 
50  private:
    /// All the selected runs will have a starting date after this value.
52  std::string fFilterStartTime = "2000/01/01"; //<
53 
    /// All the selected runs will have an ending date before this value.
55  std::string fFilterEndTime = "3000/12/31"; //<
56 
    /// A glob file pattern that must be satisfied by all files.
58  std::string fFilePattern = ""; //<
59 
    /// It contains a list of the observables that will be added to the final tree or exported file.
61  std::vector<std::string> fObservablesList; //<
62 
    /// It contains a list of the process where all observables should be added.
64  std::vector<std::string> fProcessObservablesList; //<
65 
    /// A list of metadata members where filters will be applied.
67  std::vector<std::string> fFilterMetadata; //<
68 
    /// If not empty it will check if the metadata member contains the string.
70  std::vector<std::string> fFilterContains; //<
71 
    /// If the corresponding element is not empty it will check if the metadata member is greater.
73  std::vector<Double_t> fFilterGreaterThan; //<
74 
    /// If the corresponding element is not empty it will check if the metadata member is lower.
76  std::vector<Double_t> fFilterLowerThan; //<
77 
    /// If the corresponding element is not empty it will check if the metadata member is equal.
79  std::vector<Double_t> fFilterEqualsTo; //<
80 
    /// The properties of a relevant quantity that we want to store together with the dataset.
82  std::map<std::string, RelevantQuantity> fQuantity; //<
83 
    /// Parameter cuts over the selected dataset.
85  TRestCut* fCut = nullptr; //<
86 
    /// The total integrated run time of selected files.
88  Double_t fTotalDuration = 0; //<
89 
    /// A list populated by the FileSelection method using the conditions of the dataset.
91  std::vector<std::string> fFileSelection; //<
92 
    /// NOTE(review): GetStartTime()/GetEndTime() further down return fStartTime and
    /// fEndTime, but their declarations are not visible in this listing (the
    /// numbering jumps over the lines around here) -- confirm against the
    /// original header.
95 
98 
    /// It keeps track if the generated dataset is a pure dataset or a merged one.
100  Bool_t fMergedDataset = false; //<
101 
    /// The list of dataset files imported.
103  std::vector<std::string> fImportedFiles; //<
104 
    /// A list of new columns together with their corresponding expressions added to the dataset.
106  std::vector<std::pair<std::string, std::string>> fColumnNameExpressions; //<
107 
    /// A flag to enable Multithreading during dataframe generation.
109  Bool_t fMT = false; //<
110 
111  // If the dataframe was defined externally it will be true
112  Bool_t fExternal = false; //<
113 
    /// The resulting RDF::RNode object after initialization.
115  ROOT::RDF::RNode fDataFrame = ROOT::RDataFrame(0);
116 
    /// A pointer to the generated tree.
118  TChain* fTree = nullptr;
119 
    /// Initialization of specific TRestDataSet members through an RML file.
120  void InitFromConfigFile() override;
121 
122  protected:
    /// Function to determine the filenames that satisfy the dataset conditions.
123  virtual std::vector<std::string> FileSelection();
124 
    /// It regenerates the tree so that it is an exact copy of the present DataFrame.
125  void RegenerateTree(std::vector<std::string> finalList = {});
126 
127  public:
    /// Gives access to the RDataFrame (returned by value). A warning is printed
    /// when the dataframe was not set externally and no tree has been generated.
129  ROOT::RDF::RNode GetDataFrame() const {
130  if (!fExternal && fTree == nullptr)
131  RESTWarning << "DataFrame has not been yet initialized" << RESTendl;
132  return fDataFrame;
133  }
134 
    /// Sets the fMT flag (multithreading during dataframe generation).
135  void EnableMultiThreading(Bool_t enable = true) { fMT = enable; }
136 
    /// Gives access to the tree. The returned pointer may be nullptr: for an
    /// externally defined dataframe only an informative message is printed,
    /// otherwise an error is printed when the tree has not been generated yet.
138  TTree* GetTree() const {
139  if (fTree == nullptr && fExternal) {
140  RESTInfo << "The tree is not accessible. Only GetDataFrame can be used in an externally "
141  "generated dataset"
142  << RESTendl;
143  RESTInfo << "You may write a tree using GetDataFrame()->Snapshot(\"MyTree\", \"output.root\");"
144  << RESTendl;
    // fTree is nullptr on this path; the caller receives a null pointer.
145  return fTree;
146  }
147 
148  if (fTree == nullptr) {
149  RESTError << "Tree has not been yet initialized" << RESTendl;
150  RESTError << "You should invoke TRestDataSet::GenerateDataSet() or " << RESTendl;
151  RESTError << "TRestDataSet::Import( fname ) before trying to access the tree" << RESTendl;
152  }
153  return fTree;
154  }
155 
    /// Number of variables (or observables): the number of column names in fDataFrame.
157  size_t GetNumberOfColumns() { return fDataFrame.GetColumnNames().size(); }
158 
    /// NOTE(review): the listing numbering skips a line here; a
    /// GetNumberOfBranches() accessor may be declared at this point in the
    /// original header -- confirm.
161 
    /// It returns a list of the files that have been finally selected.
163  std::vector<std::string> GetFileSelection() { return fFileSelection; }
164 
    /// It returns the accumulated run time in seconds.
166  Double_t GetTotalTimeInSeconds() const { return fTotalDuration; }
167 
    /// Read-only accessors. Each one returns the corresponding member by value
    /// (GetCut returns the fCut pointer itself).
168  inline auto GetFilterStartTime() const { return fFilterStartTime; }
169  inline auto GetFilterEndTime() const { return fFilterEndTime; }
170  inline auto GetStartTime() const { return fStartTime; }
171  inline auto GetEndTime() const { return fEndTime; }
172  inline auto GetFilePattern() const { return fFilePattern; }
173  inline auto GetObservablesList() const { return fObservablesList; }
    /// Const overload of the non-const GetFileSelection() declared above.
174  inline auto GetFileSelection() const { return fFileSelection; }
175  inline auto GetProcessObservablesList() const { return fProcessObservablesList; }
176  inline auto GetFilterMetadata() const { return fFilterMetadata; }
177  inline auto GetFilterContains() const { return fFilterContains; }
178  inline auto GetFilterGreaterThan() const { return fFilterGreaterThan; }
179  inline auto GetFilterLowerThan() const { return fFilterLowerThan; }
180  inline auto GetFilterEqualsTo() const { return fFilterEqualsTo; }
181  inline auto GetQuantity() const { return fQuantity; }
182  inline auto GetAddedColumns() const { return fColumnNameExpressions; }
183  inline auto GetCut() const { return fCut; }
184  inline auto IsMergedDataSet() const { return fMergedDataset; }
185 
    /// Setters: each one overwrites the corresponding member with the given value.
186  inline void SetObservablesList(const std::vector<std::string>& obsList) { fObservablesList = obsList; }
187  inline void SetFilePattern(const std::string& pattern) { fFilePattern = pattern; }
188  inline void SetQuantity(const std::map<std::string, RelevantQuantity>& quantity) { fQuantity = quantity; }
189 
190  void SetTotalTimeInSeconds(Double_t seconds) { fTotalDuration = seconds; }
    /// Replaces fDataFrame with the given node and marks the dataset as
    /// externally defined (fExternal = true).
191  void SetDataFrame(const ROOT::RDF::RNode& dS) {
192  fDataFrame = dS;
193  fExternal = true;
194  }
195 
    /// This function merges different TRestDataSet metadata in the current dataSet.
197  Bool_t Merge(const TRestDataSet& dS);
    /// This function imports metadata from a root file.
198  void Import(const std::string& fileName);
    /// Overload of Import taking a list of file names.
199  void Import(std::vector<std::string> fileNames);
    /// It will generate an output file with the dataset compilation.
200  void Export(const std::string& filename, std::vector<std::string> excludeColumns = {});
201 
    /// This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts.
202  ROOT::RDF::RNode MakeCut(const TRestCut* cut);
    /// This method reduces the number of samples inside the dataset by selecting a range.
203  ROOT::RDF::RNode ApplyRange(size_t from, size_t to);
    /// This method returns a RDataFrame node obtained by selecting a range of samples.
204  ROOT::RDF::RNode Range(size_t from, size_t to);
    /// This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define.
205  ROOT::RDF::RNode DefineColumn(const std::string& columnName, const std::string& formula);
206 
    /// It returns the number of entries found inside fDataFrame.
207  size_t GetEntries();
208 
    /// Prints on screen the information about the metadata members of TRestDataSet.
209  void PrintMetadata() override;
    /// This function initializes different parameters from the TRestDataSet.
210  void Initialize() override;
211 
    /// This function generates the data frame with the filelist and column names (or observables).
212  void GenerateDataSet();
213 
    /// Default constructor.
214  TRestDataSet();
    /// Constructor taking a configuration file name and an optional name.
215  TRestDataSet(const char* cfgFileName, const std::string& name = "");
    /// Default destructor.
216  ~TRestDataSet();
217 
218  ClassDefOverride(TRestDataSet, 8);
219 };
220 #endif
A class to help on cuts definitions. To be used with TRestAnalysisTree.
Definition: TRestCut.h:31
It allows to group a number of runs that satisfy given metadata conditions.
Definition: TRestDataSet.h:34
std::vector< std::string > fFilterContains
If not empty it will check if the metadata member contains the string.
Definition: TRestDataSet.h:70
virtual std::vector< std::string > FileSelection()
Function to determine the filenames that satisfy the dataset conditions.
std::vector< Double_t > fFilterLowerThan
If the corresponding element is not empty it will check if the metadata member is lower.
Definition: TRestDataSet.h:76
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
TChain * fTree
A pointer to the generated tree.
Definition: TRestDataSet.h:118
std::vector< std::string > fProcessObservablesList
It contains a list of the processes whose observables should all be added.
Definition: TRestDataSet.h:64
void Import(const std::string &fileName)
This function imports metadata from a ROOT file; it imports metadata info from the previous dataSet whi...
std::map< std::string, RelevantQuantity > fQuantity
The properties of a relevant quantity that we want to store together with the dataset.
Definition: TRestDataSet.h:82
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
std::vector< std::pair< std::string, std::string > > fColumnNameExpressions
A list of new columns together with their corresponding expressions added to the dataset.
Definition: TRestDataSet.h:106
ROOT::RDF::RNode DefineColumn(const std::string &columnName, const std::string &formula)
This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define ...
Double_t fEndTime
TimeStamp for the end time of the last file.
Definition: TRestDataSet.h:97
ROOT::RDF::RNode fDataFrame
The resulting RDF::RNode object after initialization.
Definition: TRestDataSet.h:115
size_t GetNumberOfBranches()
Number of variables (or observables)
Definition: TRestDataSet.h:160
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
TRestDataSet()
Default constructor.
ROOT::RDF::RNode GetDataFrame() const
Gives access to the RDataFrame.
Definition: TRestDataSet.h:129
Double_t GetTotalTimeInSeconds() const
It returns the accumulated run time in seconds.
Definition: TRestDataSet.h:166
ROOT::RDF::RNode MakeCut(const TRestCut *cut)
This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts....
void GenerateDataSet()
This function generates the data frame with the filelist and column names (or observables) that have ...
Bool_t fMT
A flag to enable Multithreading during dataframe generation.
Definition: TRestDataSet.h:109
TRestCut * fCut
Parameter cuts over the selected dataset.
Definition: TRestDataSet.h:85
void Export(const std::string &filename, std::vector< std::string > excludeColumns={})
It will generate an output file with the dataset compilation. Only the selected branches and the file...
std::string fFilterStartTime
All the selected runs will have a starting date after fStartTime.
Definition: TRestDataSet.h:52
Bool_t Merge(const TRestDataSet &dS)
This function merges different TRestDataSet metadata into the current dataSet.
std::vector< std::string > GetFileSelection()
It returns a list of the files that have been finally selected.
Definition: TRestDataSet.h:163
std::vector< std::string > fFilterMetadata
A list of metadata members where filters will be applied.
Definition: TRestDataSet.h:67
std::vector< std::string > fFileSelection
A list populated by the FileSelection method using the conditions of the dataset.
Definition: TRestDataSet.h:91
std::string fFilterEndTime
All the selected runs will have an ending date before fEndTime.
Definition: TRestDataSet.h:55
Double_t fStartTime
TimeStamp for the start time of the first file.
Definition: TRestDataSet.h:94
std::vector< std::string > fObservablesList
It contains a list of the observables that will be added to the final tree or exported file.
Definition: TRestDataSet.h:61
Bool_t fMergedDataset
It keeps track if the generated dataset is a pure dataset or a merged one.
Definition: TRestDataSet.h:100
void Initialize() override
This function initializes different parameters from the TRestDataSet.
void RegenerateTree(std::vector< std::string > finalList={})
It regenerates the tree so that it is an exact copy of the present DataFrame.
std::vector< std::string > fImportedFiles
The list of dataset files imported.
Definition: TRestDataSet.h:103
Double_t fTotalDuration
The total integrated run time of selected files.
Definition: TRestDataSet.h:88
std::string fFilePattern
A glob file pattern that must be satisfied by all files.
Definition: TRestDataSet.h:58
TTree * GetTree() const
Gives access to the tree.
Definition: TRestDataSet.h:138
size_t GetNumberOfColumns()
Number of variables (or observables)
Definition: TRestDataSet.h:157
std::vector< Double_t > fFilterGreaterThan
If the corresponding element is not empty it will check if the metadata member is greater.
Definition: TRestDataSet.h:73
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
std::vector< Double_t > fFilterEqualsTo
If the corresponding element is not empty it will check if the metadata member is equal.
Definition: TRestDataSet.h:79
void InitFromConfigFile() override
Initialization of specific TRestDataSet members through an RML file.
TRestDataSet & operator=(TRestDataSet &dS)
Operator to copy TRestDataSet metadata.
~TRestDataSet()
Default destructor.
A base class for any REST metadata class.
Definition: TRestMetadata.h:74
endl_t RESTendl
Termination flag object for TRestStringOutput.
time_t StringToTimeStamp(std::string time)
A method to convert a date/time formatted string to a timestamp.
std::string metadata
The associated metadata member used to register the relevant quantity.
Definition: TRestDataSet.h:38
std::string description
A user given description that can be used to define the relevant quantity.
Definition: TRestDataSet.h:44
std::string strategy
It determines how to produce the relevant quantity (accumulate/unique/last/max/min)
Definition: TRestDataSet.h:41
std::string value
The quantity value.
Definition: TRestDataSet.h:47