293#include "TRestDataSet.h"
296#include "TRestTools.h"
339 if (
fTree !=
nullptr) {
340 RESTWarning <<
"Tree has already been loaded. Skipping TRestDataSet::GenerateDataSet ... "
349 RESTError <<
"File selection is empty " <<
RESTendl;
355 std::vector<std::string> finalList;
356 finalList.push_back(
"runOrigin");
357 finalList.push_back(
"eventID");
358 finalList.push_back(
"timeStamp");
362 if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
363 finalList.push_back(obs);
365 RESTWarning <<
" Observable " << obs <<
" not found in observable list, skipping..." <<
RESTendl;
369 for (
const auto& name : obsNames) {
371 if (name.find(pcs) == 0) finalList.push_back(name);
376 std::sort(finalList.begin(), finalList.end());
377 finalList.erase(std::unique(finalList.begin(), finalList.end()), finalList.end());
380 ROOT::EnableImplicitMT();
382 ROOT::DisableImplicitMT();
384 RESTInfo <<
"Initializing dataset" <<
RESTendl;
387 RESTInfo <<
"Making cuts" <<
RESTendl;
392 RESTInfo <<
"Adding column to dataset: " << cName <<
RESTendl;
393 finalList.emplace_back(cName);
399 RESTInfo <<
" - Dataset generated!" <<
RESTendl;
406 RESTInfo <<
"Generating snapshot." <<
RESTendl;
407 std::string user = getenv(
"USER");
408 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
409 if (!finalList.empty())
410 fDataFrame.Snapshot(
"AnalysisTree", fOutName, finalList);
412 fDataFrame.Snapshot(
"AnalysisTree", fOutName);
414 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
415 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fOutName);
417 TFile* f = TFile::Open(fOutName.c_str());
418 fTree = (TChain*)f->Get(
"AnalysisTree");
430 if (!time_stamp_end || !time_stamp_start) {
431 RESTError <<
"TRestDataSet::FileSelect. Start or end dates not properly formed. Please, check "
432 "REST_StringHelper::StringToTimeStamp documentation for valid formats"
439 RESTInfo <<
"TRestDataSet::FileSelection. Starting file selection." <<
RESTendl;
440 RESTInfo <<
"Total files : " << fileNames.size() <<
RESTendl;
441 RESTInfo <<
"This process may take long computation time in case there are many files." <<
RESTendl;
444 std::cout <<
"Processing file selection.";
446 for (
const auto& file : fileNames) {
447 if (cnt % 100 == 0) {
448 std::cout << std::endl;
449 std::cout <<
"Files processed: " << cnt <<
" ." << std::flush;
453 std::cout <<
"." << std::flush;
454 double runStart = run.GetStartTimestamp();
455 double runEnd = run.GetEndTimestamp();
457 if (runStart < time_stamp_start || runEnd > time_stamp_end) {
458 RESTInfo <<
"Rejecting file out of date range: " << file <<
RESTendl;
465 std::string mdValue = run.GetMetadataMember(md);
468 if (mdValue.find(
fFilterContains[n]) == std::string::npos) accept =
false;
483 if (!accept)
continue;
486 for (
auto& [name, properties] :
fQuantity) {
490 if (properties.strategy ==
"accumulate") {
492 properties.value = StringWithPrecision(val, 2);
495 if (properties.strategy ==
"max")
497 properties.value = value;
499 if (properties.strategy ==
"min")
501 properties.value = value;
503 if (properties.strategy ==
"unique") {
504 if (properties.value.empty())
505 properties.value = value;
506 else if (properties.value != value) {
507 RESTWarning <<
"TRestDataSet::FileSelection. Relevant quantity retrieval." <<
RESTendl;
508 RESTWarning <<
"A unique metadata member used for the `" << name
509 <<
"` quantity is not unique!!" <<
RESTendl;
510 RESTWarning <<
"Pre-registered value : " << properties.value <<
" New value : " << value
515 if (properties.strategy ==
"last") properties.value = value;
522 fTotalDuration += run.GetEndTimestamp() - run.GetStartTimestamp();
525 std::cout << std::endl;
557 if (cut ==
nullptr)
return df;
559 auto paramCut = cut->GetParamCut();
560 auto obsList = df.GetColumnNames();
561 for (
const auto& [param, condition] : paramCut) {
562 if (std::find(obsList.begin(), obsList.end(), param) != obsList.end()) {
563 std::string pCut = param + condition;
564 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
565 df = df.Filter(pCut);
567 RESTWarning <<
" Cut observable " << param <<
" not found in observable list, skipping..."
572 auto cutString = cut->GetCutStrings();
573 for (
const auto& pCut : cutString) {
575 for (
const auto& obs : obsList) {
576 if (pCut.find(obs) != std::string::npos) {
577 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
578 df = df.Filter(pCut);
585 RESTWarning <<
" Cut string " << pCut <<
" not found in observable list, skipping..." <<
RESTendl;
599 if (*nEntries == (
long long unsigned int)
GetTree()->
GetEntries())
return *nEntries;
600 RESTWarning <<
"TRestDataSet::GetEntries. Number of tree entries is not the same as RDataFrame entries."
602 RESTWarning <<
"Returning RDataFrame entries" <<
RESTendl;
621 std::string evalFormula = formula;
622 for (
auto const& [name, properties] :
fQuantity)
625 df = df.Define(columnName, evalFormula);
648 RESTMetadata <<
" Single observables added:" <<
RESTendl;
649 RESTMetadata <<
" -------------------------" <<
RESTendl;
656 RESTMetadata <<
" Process observables added: " <<
RESTendl;
657 RESTMetadata <<
" -------------------------- " <<
RESTendl;
664 RESTMetadata <<
" Metadata filters: " <<
RESTendl;
665 RESTMetadata <<
" ----------------- " <<
RESTendl;
670 RESTMetadata <<
" - " << mdFilter <<
".";
685 RESTMetadata <<
" Relevant quantities: " <<
RESTendl;
686 RESTMetadata <<
" -------------------- " <<
RESTendl;
688 for (
auto const& [name, properties] :
fQuantity) {
689 RESTMetadata <<
" - Name : " << name <<
". Value : " << properties.value
690 <<
". Strategy: " << properties.strategy <<
RESTendl;
691 RESTMetadata <<
" - Metadata: " << properties.metadata <<
RESTendl;
692 RESTMetadata <<
" - Description: " << properties.description <<
RESTendl;
698 RESTMetadata <<
" New columns added to generated dataframe: " <<
RESTendl;
699 RESTMetadata <<
" ---------------------------------------- " <<
RESTendl;
701 RESTMetadata <<
" - Name : " << cName <<
RESTendl;
702 RESTMetadata <<
" - Expression: " << cExpression <<
RESTendl;
709 RESTMetadata <<
"This is a combined dataset." <<
RESTendl;
710 RESTMetadata <<
" -------------------- " <<
RESTendl;
711 RESTMetadata <<
" - Relevant quantities have been removed!" <<
RESTendl;
712 RESTMetadata <<
" - Dataset metadata properties correspond to the first file in the list."
715 RESTMetadata <<
"List of imported files: " <<
RESTendl;
716 RESTMetadata <<
" -------------------- " <<
RESTendl;
722 RESTMetadata <<
" - Multithreading was enabled" <<
RESTendl;
724 RESTMetadata <<
" - Multithreading was NOT enabled" <<
RESTendl;
736 TiXmlElement* filterDefinition =
GetElement(
"filter");
737 while (filterDefinition !=
nullptr) {
738 std::string metadata =
GetFieldValue(
"metadata", filterDefinition);
739 if (metadata.empty() || metadata ==
"Not defined") {
740 RESTError <<
"Filter key defined without metadata member!" <<
RESTendl;
746 std::string contains =
GetFieldValue(
"contains", filterDefinition);
747 if (contains ==
"Not defined") contains =
"";
761 TiXmlElement* observablesDefinition =
GetElement(
"observables");
762 while (observablesDefinition !=
nullptr) {
763 std::string observables =
GetFieldValue(
"list", observablesDefinition);
764 if (observables.empty() || observables ==
"Not defined") {
765 RESTError <<
"<observables key does not contain a list!" <<
RESTendl;
777 TiXmlElement* obsProcessDefinition =
GetElement(
"processObservables");
778 while (obsProcessDefinition !=
nullptr) {
779 std::string observables =
GetFieldValue(
"list", obsProcessDefinition);
780 if (observables.empty() || observables ==
"Not defined") {
781 RESTError <<
"<processObservables key does not contain a list!" <<
RESTendl;
793 TiXmlElement* quantityDefinition =
GetElement(
"quantity");
794 while (quantityDefinition !=
nullptr) {
795 std::string name =
GetFieldValue(
"name", quantityDefinition);
796 if (name.empty() || name ==
"Not defined") {
797 RESTError <<
"<quantity key does not contain a name!" <<
RESTendl;
801 std::string metadata =
GetFieldValue(
"metadata", quantityDefinition);
802 if (metadata.empty() || metadata ==
"Not defined") {
803 RESTError <<
"<quantity key does not contain a metadata value!" <<
RESTendl;
807 std::string strategy =
GetFieldValue(
"strategy", quantityDefinition);
808 if (strategy.empty() || strategy ==
"Not defined") {
812 std::string description =
GetFieldValue(
"description", quantityDefinition);
826 TiXmlElement* columnDefinition =
GetElement(
"addColumn");
827 while (columnDefinition !=
nullptr) {
829 if (name.empty() || name ==
"Not defined") {
830 RESTError <<
"<define key does not contain a name name!" <<
RESTendl;
834 std::string expression =
GetFieldValue(
"expression", columnDefinition);
835 if (expression.empty() || expression ==
"Not defined") {
836 RESTError <<
"<addColumn key does not contain a expression value!" <<
RESTendl;
862 RESTInfo <<
"Exporting dataset" <<
RESTendl;
864 std::vector<std::string> columns =
fDataFrame.GetColumnNames();
865 if (!excludeColumns.empty()) {
866 columns.erase(std::remove_if(columns.begin(), columns.end(),
867 [&excludeColumns](std::string elem) {
868 return std::find(excludeColumns.begin(), excludeColumns.end(),
869 elem) != excludeColumns.end();
873 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
874 std::string user = getenv(
"USER");
875 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
876 fDataFrame.Snapshot(
"AnalysisTree", fOutName, columns);
878 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
879 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fOutName);
881 TFile* f = TFile::Open(fOutName.c_str());
882 fTree = (TChain*)f->Get(
"AnalysisTree");
887 if (excludeColumns.empty()) {
888 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
889 std::string user = getenv(
"USER");
890 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
891 fDataFrame.Snapshot(
"AnalysisTree", fOutName);
893 TFile* f = TFile::Open(fOutName.c_str());
894 fTree = (TChain*)f->Get(
"AnalysisTree");
897 std::vector<std::string> dataTypes;
898 for (
int n = 0; n <
fTree->GetListOfBranches()->GetEntries(); n++) {
899 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
900 std::string type =
fTree->GetLeaf((TString)bName)->GetTypeName();
901 dataTypes.push_back(type);
902 if (type !=
"Double_t" && type !=
"Int_t") {
903 RESTError <<
"Branch name : " << bName <<
" is type : " << type <<
RESTendl;
904 RESTError <<
"Only Int_t and Double_t types are allowed for "
905 "exporting to ASCII table"
907 RESTError <<
"File will not be generated" <<
RESTendl;
912 FILE* f = fopen(filename.c_str(),
"wt");
914 fprintf(f,
"### TRestDataSet generated file\n");
915 fprintf(f,
"### \n");
919 fprintf(f,
"### Accumulated run time (seconds) : %lf\n",
fTotalDuration);
920 fprintf(f,
"### Accumulated run time (hours) : %lf\n",
fTotalDuration / 3600.);
921 fprintf(f,
"### Accumulated run time (days) : %lf\n",
fTotalDuration / 3600. / 24.);
927 fprintf(f,
"### Metadata filters : \n");
930 fprintf(f,
"### - %s.", md.c_str());
940 fprintf(f,
"### Relevant quantities: \n");
941 for (
auto& [name, properties] :
fQuantity) {
942 fprintf(f,
"### - %s : %s - %s\n", name.c_str(), properties.value.c_str(),
943 properties.description.c_str());
946 fprintf(f,
"### Observables list: ");
948 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
949 fprintf(f,
" %s", bName.c_str());
953 fprintf(f,
"### Data starts here\n");
956 std::string obsListStr =
"";
957 for (
const auto& l : obsNames) {
958 if (!obsListStr.empty()) obsListStr +=
":";
963 fTree->Draw((TString)obsListStr,
"",
"goff");
965 for (
unsigned int n = 0; n <
fTree->GetEntries(); n++) {
967 std::string bName =
fTree->GetListOfBranches()->At(m)->GetName();
968 if (m > 0) fprintf(f,
"\t");
969 if (dataTypes[m] ==
"Double_t")
970 if (bName ==
"timeStamp")
971 fprintf(f,
"%010.0lf",
fTree->GetVal(m)[n]);
973 fprintf(f,
"%05.3e",
fTree->GetVal(m)[n]);
975 fprintf(f,
"%06d", (Int_t)
fTree->GetVal(m)[n]);
983 fDataFrame.Snapshot(
"AnalysisTree", filename);
985 TFile* f = TFile::Open(filename.c_str(),
"UPDATE");
986 std::string name = this->GetName();
987 if (name.empty()) name =
"mock";
988 this->
Write(name.c_str());
994 RESTInfo <<
"Dataset generated: " << filename <<
RESTendl;
1001 SetName(dS.GetName());
1028 auto obsNames = GetObservablesList();
1030 if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
1031 RESTError <<
"Cannot merge dataSets with different observable list " <<
RESTendl;
1054 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1059 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1060 if (file !=
nullptr) {
1061 TIter nextkey(file->GetListOfKeys());
1063 while ((key = (TKey*)nextkey())) {
1064 std::string kName = key->GetClassName();
1073 if (dS ==
nullptr) {
1074 RESTError << fileName <<
" is not a valid dataSet" <<
RESTendl;
1079 ROOT::EnableImplicitMT();
1081 ROOT::DisableImplicitMT();
1083 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fileName);
1085 fTree = (TChain*)file->Get(
"AnalysisTree");
1098 for (
const auto& fN : fileNames)
1100 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1105 auto it = fileNames.begin();
1106 while (it != fileNames.end()) {
1107 std::string fileName = *it;
1108 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1109 bool isValid =
false;
1110 if (file !=
nullptr) {
1111 TIter nextkey(file->GetListOfKeys());
1113 while ((key = (TKey*)nextkey())) {
1114 std::string kName = key->GetClassName();
1125 isValid =
Merge(*dS);
1128 if (isValid) count++;
1132 RESTError <<
"Cannot open " << fileName <<
RESTendl;
1136 RESTError << fileName <<
" is not a valid dataSet skipping..." <<
RESTendl;
1137 it = fileNames.erase(it);
1143 if (fileNames.empty()) {
1144 RESTError <<
"File selection is empty, dataSet will not be imported " <<
RESTendl;
1148 RESTInfo <<
"Opening list of files. First file: " << fileNames[0] <<
RESTendl;
1149 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fileNames);
1151 if (
fTree !=
nullptr) {
1155 fTree =
new TChain(
"AnalysisTree");
1157 for (
const auto& fN : fileNames)
fTree->Add((TString)fN);
std::vector< std::string > GetObservableNames()
It returns a vector with strings containing all the observables that exist in the analysis tree.
A class to help on cuts definitions. To be used with TRestAnalysisTree.
It allows to group a number of runs that satisfy given metadata conditions.
std::vector< std::string > fFilterContains
If not empty it will check if the metadata member contains the string.
virtual std::vector< std::string > FileSelection()
Function to determine the filenames that satisfy the dataset conditions.
std::vector< Double_t > fFilterLowerThan
If the corresponding element is not empty it will check if the metadata member is lower.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
TChain * fTree
A pointer to the generated tree.
std::vector< std::string > fProcessObservablesList
It contains a list of the processes for which all observables should be added.
void Import(const std::string &fileName)
This function imports metadata from a root file; it imports metadata info from the previous dataSet whi...
std::map< std::string, RelevantQuantity > fQuantity
The properties of a relevant quantity that we want to store together with the dataset.
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
std::vector< std::pair< std::string, std::string > > fColumnNameExpressions
A list of new columns, together with their corresponding expressions, added to the dataset.
ROOT::RDF::RNode DefineColumn(const std::string &columnName, const std::string &formula)
This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define ...
Double_t fEndTime
TimeStamp for the end time of the last file.
ROOT::RDF::RNode fDataFrame
The resulting RDF::RNode object after initialization.
size_t GetNumberOfBranches()
Number of variables (or observables)
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
TRestDataSet()
Default constructor.
Double_t GetTotalTimeInSeconds() const
It returns the accumulated run time in seconds.
ROOT::RDF::RNode MakeCut(const TRestCut *cut)
This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts....
void GenerateDataSet()
This function generates the data frame with the filelist and column names (or observables) that have ...
Bool_t fMT
A flag to enable Multithreading during dataframe generation.
TRestCut * fCut
Parameter cuts over the selected dataset.
void Export(const std::string &filename, std::vector< std::string > excludeColumns={})
It will generate an output file with the dataset compilation. Only the selected branches and the file...
std::string fFilterStartTime
All the selected runs will have a starting date after fStartTime.
Bool_t Merge(const TRestDataSet &dS)
This function merges different TRestDataSet metadata into the current dataSet.
std::vector< std::string > fFilterMetadata
A list of metadata members where filters will be applied.
std::vector< std::string > fFileSelection
A list populated by the FileSelection method using the conditions of the dataset.
std::vector< std::string > GetFileSelection()
It returns a list of the files that have been finally selected.
std::string fFilterEndTime
All the selected runs will have an ending date before fEndTime.
Double_t fStartTime
TimeStamp for the start time of the first file.
std::vector< std::string > fObservablesList
It contains a list of the observables that will be added to the final tree or exported file.
TTree * GetTree() const
Gives access to the tree.
Bool_t fMergedDataset
It keeps track if the generated dataset is a pure dataset or a merged one.
void Initialize() override
This function initializes different parameters from the TRestDataSet.
void RegenerateTree(std::vector< std::string > finalList={})
It regenerates the tree so that it is an exact copy of the present DataFrame.
std::vector< std::string > fImportedFiles
The list of dataset files imported.
Double_t fTotalDuration
The total integrated run time of selected files.
std::string fFilePattern
A glob file pattern that must be satisfied by all files.
std::vector< Double_t > fFilterGreaterThan
If the corresponding element is not empty it will check if the metadata member is greater.
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
std::vector< Double_t > fFilterEqualsTo
If the corresponding element is not empty it will check if the metadata member is equal.
void InitFromConfigFile() override
Initialization of specific TRestDataSet members through an RML file.
TRestDataSet & operator=(TRestDataSet &dS)
Operator to copy TRestDataSet metadata.
~TRestDataSet()
Default destructor.
Data provider and manager in REST.
std::string ReplaceMetadataMembers(const std::string &instr, Int_t precision=8)
It will replace the data members contained inside the string given as input. The data members in the ...
@ REST_Info
+show most of the information for each step
TClass * GetClassQuick()
Get the type of a "class" object, returning the wrapped type identifier "TClass".
time_t StringToTimeStamp(std::string time)
A method to convert a date/time formatted string to a timestamp.
std::vector< std::string > Split(std::string in, std::string separator, bool allowBlankString=false, bool removeWhiteSpaces=false, int startPos=-1)
Split the input string according to the given separator. Returning a vector of fragments.
Double_t StringToDouble(std::string in)
Gets a double from a string.
std::string ToDateTimeString(time_t time)
Format time_t into string.
std::string Replace(std::string in, std::string thisString, std::string byThisString, size_t fromPosition=0, Int_t N=0)
Replace any occurrences of thisString by byThisString inside string in.
std::string metadata
The associated metadata member used to register the relevant quantity.
std::string description
A user given description that can be used to define the relevant quantity.
std::string strategy
It determines how to produce the relevant quantity (accumulate/unique/last/max/min)
std::string value
The quantity value.