300#include "TRestDataSet.h"
305#include "TRestTools.h"
348 if (
fTree !=
nullptr) {
349 RESTWarning <<
"Tree has already been loaded. Skipping TRestDataSet::GenerateDataSet ... "
358 RESTError <<
"File selection is empty " <<
RESTendl;
364 std::set<std::string> finalList;
365 finalList.insert(
"runOrigin");
366 finalList.insert(
"eventID");
367 finalList.insert(
"timeStamp");
371 finalList.insert(obsFromList.begin(), obsFromList.end());
374 ROOT::EnableImplicitMT();
376 ROOT::DisableImplicitMT();
378 RESTInfo <<
"Initializing dataset" <<
RESTendl;
381 RESTInfo <<
"Making cuts" <<
RESTendl;
386 RESTInfo <<
"Adding column to dataset: " << cName <<
RESTendl;
387 finalList.emplace(cName);
391 RegenerateTree(std::vector<std::string>(finalList.begin(), finalList.end()));
393 RESTInfo <<
" - Dataset generated!" <<
RESTendl;
400 RESTInfo <<
"Generating snapshot." <<
RESTendl;
401 std::string user = getenv(
"USER");
402 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
403 if (!finalList.empty())
404 fDataFrame.Snapshot(
"AnalysisTree", fOutName, finalList);
406 fDataFrame.Snapshot(
"AnalysisTree", fOutName);
408 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
409 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fOutName);
411 TFile* f = TFile::Open(fOutName.c_str());
412 fTree = (TChain*)f->Get(
"AnalysisTree");
424 if (!time_stamp_end || !time_stamp_start) {
425 RESTError <<
"TRestDataSet::FileSelect. Start or end dates not properly formed. Please, check "
426 "REST_StringHelper::StringToTimeStamp documentation for valid formats"
433 RESTInfo <<
"TRestDataSet::FileSelection. Starting file selection." <<
RESTendl;
434 RESTInfo <<
"Total files : " << fileNames.size() <<
RESTendl;
435 RESTInfo <<
"This process may take long computation time in case there are many files." <<
RESTendl;
438 std::cout <<
"Processing file selection.";
440 for (
const auto& file : fileNames) {
441 if (cnt % 100 == 0) {
442 std::cout << std::endl;
443 std::cout <<
"Files processed: " << cnt <<
" ." << std::flush;
447 std::cout <<
"." << std::flush;
448 double runStart = run.GetStartTimestamp();
449 double runEnd = run.GetEndTimestamp();
451 if (runStart < time_stamp_start || runEnd > time_stamp_end) {
452 RESTInfo <<
"Rejecting file out of date range: " << file <<
RESTendl;
459 std::string mdValue = run.GetMetadataMember(md);
462 if (mdValue.find(
fFilterContains[n]) == std::string::npos) accept =
false;
477 if (!accept)
continue;
480 for (
auto& [name, properties] :
fQuantity) {
484 if (properties.strategy ==
"accumulate") {
486 properties.value = StringWithPrecision(val, 2);
489 if (properties.strategy ==
"max")
491 properties.value = value;
493 if (properties.strategy ==
"min")
495 properties.value = value;
497 if (properties.strategy ==
"unique") {
498 if (properties.value.empty())
499 properties.value = value;
500 else if (properties.value != value) {
501 RESTWarning <<
"TRestDataSet::FileSelection. Relevant quantity retrieval." <<
RESTendl;
502 RESTWarning <<
"A unique metadata member used for the `" << name
503 <<
"` quantity is not unique!!" <<
RESTendl;
504 RESTWarning <<
"Pre-registered value : " << properties.value <<
" New value : " << value
509 if (properties.strategy ==
"last") properties.value = value;
511 if (properties.strategy.find(
"append") != std::string::npos) {
513 std::string appender = properties.strategy.substr(properties.strategy.find(
"append") + 6);
514 if (properties.value.empty())
515 properties.value = value;
518 if (properties.value.find(value) == std::string::npos)
519 properties.value += appender + value;
522 if (properties.strategy.find(
"extend") != std::string::npos) {
524 std::string appender = properties.strategy.substr(properties.strategy.find(
"extend") + 6);
525 if (properties.value.empty())
526 properties.value = value;
528 properties.value += appender + value;
536 fTotalDuration += run.GetEndTimestamp() - run.GetStartTimestamp();
539 std::cout << std::endl;
571 if (cut ==
nullptr)
return df;
573 auto paramCut = cut->GetParamCut();
574 auto obsList = df.GetColumnNames();
575 for (
const auto& [param, condition] : paramCut) {
576 if (std::find(obsList.begin(), obsList.end(), param) != obsList.end()) {
577 std::string pCut = param + condition;
578 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
579 df = df.Filter(pCut);
581 RESTWarning <<
" Cut observable " << param <<
" not found in observable list, skipping..."
586 auto cutString = cut->GetCutStrings();
587 for (
const auto& pCut : cutString) {
589 for (
const auto& obs : obsList) {
590 if (pCut.find(obs) != std::string::npos) {
591 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
592 df = df.Filter(pCut);
599 RESTWarning <<
" Cut string " << pCut <<
" not found in observable list, skipping..." <<
RESTendl;
613 if (*nEntries == (
long long unsigned int)
GetTree()->
GetEntries())
return *nEntries;
614 RESTWarning <<
"TRestDataSet::GetEntries. Number of tree entries is not the same as RDataFrame entries."
616 RESTWarning <<
"Returning RDataFrame entries" <<
RESTendl;
635 std::string evalFormula = formula;
636 for (
auto const& [name, properties] :
fQuantity)
639 df = df.Define(columnName, evalFormula);
662 RESTMetadata <<
" Observables added:" <<
RESTendl;
663 RESTMetadata <<
" -------------------------" <<
RESTendl;
670 RESTMetadata <<
" Metadata filters: " <<
RESTendl;
671 RESTMetadata <<
" ----------------- " <<
RESTendl;
676 RESTMetadata <<
" - " << mdFilter <<
".";
691 RESTMetadata <<
" Relevant quantities: " <<
RESTendl;
692 RESTMetadata <<
" -------------------- " <<
RESTendl;
694 for (
auto const& [name, properties] :
fQuantity) {
695 RESTMetadata <<
" - Name : " << name <<
". Value : " << properties.value
696 <<
". Strategy: " << properties.strategy <<
RESTendl;
697 RESTMetadata <<
" - Metadata: " << properties.metadata <<
RESTendl;
698 RESTMetadata <<
" - Description: " << properties.description <<
RESTendl;
704 RESTMetadata <<
" New columns added to generated dataframe: " <<
RESTendl;
705 RESTMetadata <<
" ---------------------------------------- " <<
RESTendl;
707 RESTMetadata <<
" - Name : " << cName <<
RESTendl;
708 RESTMetadata <<
" - Expression: " << cExpression <<
RESTendl;
715 RESTMetadata <<
"This is a combined dataset." <<
RESTendl;
716 RESTMetadata <<
" -------------------- " <<
RESTendl;
717 RESTMetadata <<
" - Relevant quantities have been removed!" <<
RESTendl;
718 RESTMetadata <<
" - Dataset metadata properties correspond to the first file in the list."
721 RESTMetadata <<
"List of imported files: " <<
RESTendl;
722 RESTMetadata <<
" -------------------- " <<
RESTendl;
728 RESTMetadata <<
" - Multithreading was enabled" <<
RESTendl;
730 RESTMetadata <<
" - Multithreading was NOT enabled" <<
RESTendl;
742 TiXmlElement* filterDefinition =
GetElement(
"filter");
743 while (filterDefinition !=
nullptr) {
744 std::string metadata =
GetFieldValue(
"metadata", filterDefinition);
745 if (metadata.empty() || metadata ==
"Not defined") {
746 RESTError <<
"Filter key defined without metadata member!" <<
RESTendl;
752 std::string contains =
GetFieldValue(
"contains", filterDefinition);
753 if (contains ==
"Not defined") contains =
"";
767 TiXmlElement* observablesDefinition =
GetElement(
"observables");
768 while (observablesDefinition !=
nullptr) {
769 std::string observables =
GetFieldValue(
"list", observablesDefinition);
770 if (observables.empty() || observables ==
"Not defined") {
771 RESTError <<
"<observables key does not contain a list!" <<
RESTendl;
783 TiXmlElement* obsProcessDefinition =
GetElement(
"processObservables");
784 while (obsProcessDefinition !=
nullptr) {
785 std::string observables =
GetFieldValue(
"list", obsProcessDefinition);
786 if (observables.empty() || observables ==
"Not defined") {
787 RESTError <<
"<processObservables key does not contain a list!" <<
RESTendl;
793 for (
const auto& l : obsList) {
794 std::string processObsPattern = l +
"_*";
802 TiXmlElement* quantityDefinition =
GetElement(
"quantity");
803 while (quantityDefinition !=
nullptr) {
804 std::string name =
GetFieldValue(
"name", quantityDefinition);
805 if (name.empty() || name ==
"Not defined") {
806 RESTError <<
"<quantity key does not contain a name!" <<
RESTendl;
810 std::string metadata =
GetFieldValue(
"metadata", quantityDefinition);
811 if (metadata.empty() || metadata ==
"Not defined") {
812 RESTError <<
"<quantity key does not contain a metadata value!" <<
RESTendl;
816 std::string strategy =
GetFieldValue(
"strategy", quantityDefinition);
817 if (strategy.empty() || strategy ==
"Not defined") {
821 std::string description =
GetFieldValue(
"description", quantityDefinition);
835 TiXmlElement* columnDefinition =
GetElement(
"addColumn");
836 while (columnDefinition !=
nullptr) {
838 if (name.empty() || name ==
"Not defined") {
839 RESTError <<
"<define key does not contain a name name!" <<
RESTendl;
843 std::string expression =
GetFieldValue(
"expression", columnDefinition);
844 if (expression.empty() || expression ==
"Not defined") {
845 RESTError <<
"<addColumn key does not contain a expression value!" <<
RESTendl;
871 RESTInfo <<
"Exporting dataset" <<
RESTendl;
873 std::vector<std::string> columns =
fDataFrame.GetColumnNames();
874 if (!excludeColumns.empty()) {
875 columns.erase(std::remove_if(columns.begin(), columns.end(),
876 [&excludeColumns](std::string elem) {
877 return std::find(excludeColumns.begin(), excludeColumns.end(),
878 elem) != excludeColumns.end();
882 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
883 std::string user = getenv(
"USER");
884 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
885 fDataFrame.Snapshot(
"AnalysisTree", fOutName, columns);
887 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
888 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fOutName);
890 TFile* f = TFile::Open(fOutName.c_str());
891 fTree = (TChain*)f->Get(
"AnalysisTree");
896 if (excludeColumns.empty()) {
897 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
898 std::string user = getenv(
"USER");
899 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
900 fDataFrame.Snapshot(
"AnalysisTree", fOutName);
902 TFile* f = TFile::Open(fOutName.c_str());
903 fTree = (TChain*)f->Get(
"AnalysisTree");
906 std::vector<std::string> dataTypes;
907 for (
int n = 0; n <
fTree->GetListOfBranches()->GetEntries(); n++) {
908 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
909 std::string type =
fTree->GetLeaf((TString)bName)->GetTypeName();
910 dataTypes.push_back(type);
911 if (type !=
"Double_t" && type !=
"Int_t") {
912 RESTError <<
"Branch name : " << bName <<
" is type : " << type <<
RESTendl;
913 RESTError <<
"Only Int_t and Double_t types are allowed for "
914 "exporting to ASCII table"
916 RESTError <<
"File will not be generated" <<
RESTendl;
921 FILE* f = fopen(filename.c_str(),
"wt");
923 fprintf(f,
"### TRestDataSet generated file\n");
924 fprintf(f,
"### \n");
928 fprintf(f,
"### Accumulated run time (seconds) : %lf\n",
fTotalDuration);
929 fprintf(f,
"### Accumulated run time (hours) : %lf\n",
fTotalDuration / 3600.);
930 fprintf(f,
"### Accumulated run time (days) : %lf\n",
fTotalDuration / 3600. / 24.);
936 fprintf(f,
"### Metadata filters : \n");
939 fprintf(f,
"### - %s.", md.c_str());
949 fprintf(f,
"### Relevant quantities: \n");
950 for (
auto& [name, properties] :
fQuantity) {
951 fprintf(f,
"### - %s : %s - %s\n", name.c_str(), properties.value.c_str(),
952 properties.description.c_str());
955 fprintf(f,
"### Observables list: ");
957 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
958 fprintf(f,
" %s", bName.c_str());
962 fprintf(f,
"### Data starts here\n");
965 std::string obsListStr =
"";
966 for (
const auto& l : obsNames) {
967 if (!obsListStr.empty()) obsListStr +=
":";
972 fTree->Draw((TString)obsListStr,
"",
"goff");
974 for (
unsigned int n = 0; n <
fTree->GetEntries(); n++) {
976 std::string bName =
fTree->GetListOfBranches()->At(m)->GetName();
977 if (m > 0) fprintf(f,
"\t");
978 if (dataTypes[m] ==
"Double_t")
979 if (bName ==
"timeStamp")
980 fprintf(f,
"%010.0lf",
fTree->GetVal(m)[n]);
982 fprintf(f,
"%05.3e",
fTree->GetVal(m)[n]);
984 fprintf(f,
"%06d", (Int_t)
fTree->GetVal(m)[n]);
992 fDataFrame.Snapshot(
"AnalysisTree", filename);
994 TFile* f = TFile::Open(filename.c_str(),
"UPDATE");
995 std::string name = this->GetName();
996 if (name.empty()) name =
"mock";
997 this->
Write(name.c_str());
1003 RESTInfo <<
"Dataset generated: " << filename <<
RESTendl;
1010 SetName(dS.GetName());
1036 auto obsNames = GetObservablesList();
1038 if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
1039 RESTError <<
"Cannot merge dataSets with different observable list " <<
RESTendl;
1062 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1067 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1068 if (file !=
nullptr) {
1069 TIter nextkey(file->GetListOfKeys());
1071 while ((key = (TKey*)nextkey())) {
1072 std::string kName = key->GetClassName();
1081 if (dS ==
nullptr) {
1082 RESTError << fileName <<
" is not a valid dataSet" <<
RESTendl;
1087 ROOT::EnableImplicitMT();
1089 ROOT::DisableImplicitMT();
1091 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fileName);
1093 fTree = (TChain*)file->Get(
"AnalysisTree");
1106 for (
const auto& fN : fileNames)
1108 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1113 auto it = fileNames.begin();
1114 while (it != fileNames.end()) {
1115 std::string fileName = *it;
1116 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1117 bool isValid =
false;
1118 if (file !=
nullptr) {
1119 TIter nextkey(file->GetListOfKeys());
1121 while ((key = (TKey*)nextkey())) {
1122 std::string kName = key->GetClassName();
1133 isValid =
Merge(*dS);
1136 if (isValid) count++;
1140 RESTError <<
"Cannot open " << fileName <<
RESTendl;
1144 RESTError << fileName <<
" is not a valid dataSet skipping..." <<
RESTendl;
1145 it = fileNames.erase(it);
1151 if (fileNames.empty()) {
1152 RESTError <<
"File selection is empty, dataSet will not be imported " <<
RESTendl;
1156 RESTInfo <<
"Opening list of files. First file: " << fileNames[0] <<
RESTendl;
1157 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fileNames);
1159 if (
fTree !=
nullptr) {
1163 fTree =
new TChain(
"AnalysisTree");
1165 for (
const auto& fN : fileNames)
fTree->Add((TString)fN);
std::vector< std::string > GetObservableNames()
It returns a vector with strings containing all the observables that exist in the analysis tree.
A class to help with cut definitions. To be used with TRestAnalysisTree.
It allows to group a number of runs that satisfy given metadata conditions.
std::vector< std::string > fFilterContains
If not empty it will check if the metadata member contains the string.
virtual std::vector< std::string > FileSelection()
Function to determine the filenames that satisfy the dataset conditions.
std::vector< Double_t > fFilterLowerThan
If the corresponding element is not empty it will check if the metadata member is lower.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
TChain * fTree
A pointer to the generated tree.
void Import(const std::string &fileName)
This function imports metadata from a root file it import metadata info from the previous dataSet whi...
std::map< std::string, RelevantQuantity > fQuantity
The properties of a relevant quantity that we want to store together with the dataset.
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
std::vector< std::pair< std::string, std::string > > fColumnNameExpressions
A list of new columns, together with their corresponding expressions, added to the dataset.
ROOT::RDF::RNode DefineColumn(const std::string &columnName, const std::string &formula)
This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define ...
Double_t fEndTime
TimeStamp for the end time of the last file.
ROOT::RDF::RNode fDataFrame
The resulting RDF::RNode object after initialization.
size_t GetNumberOfBranches()
Number of variables (or observables)
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
TRestDataSet()
Default constructor.
Double_t GetTotalTimeInSeconds() const
It returns the accumulated run time in seconds.
ROOT::RDF::RNode MakeCut(const TRestCut *cut)
This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts....
void GenerateDataSet()
This function generates the data frame with the filelist and column names (or observables) that have ...
Bool_t fMT
A flag to enable Multithreading during dataframe generation.
TRestCut * fCut
Parameter cuts over the selected dataset.
void Export(const std::string &filename, std::vector< std::string > excludeColumns={})
It will generate an output file with the dataset compilation. Only the selected branches and the file...
std::string fFilterStartTime
All the selected runs will have a starting date after fFilterStartTime.
Bool_t Merge(const TRestDataSet &dS)
This function merges the metadata of a different TRestDataSet into the current dataSet.
std::vector< std::string > fFilterMetadata
A list of metadata members where filters will be applied.
std::vector< std::string > fFileSelection
A list populated by the FileSelection method using the conditions of the dataset.
std::vector< std::string > GetFileSelection()
It returns a list of the files that have been finally selected.
std::string fFilterEndTime
All the selected runs will have an ending date before fFilterEndTime.
Double_t fStartTime
TimeStamp for the start time of the first file.
std::vector< std::string > fObservablesList
It contains a list of the observables that will be added to the final tree or exported file.
TTree * GetTree() const
Gives access to the tree.
Bool_t fMergedDataset
It keeps track if the generated dataset is a pure dataset or a merged one.
void Initialize() override
This function initializes different parameters of the TRestDataSet.
void RegenerateTree(std::vector< std::string > finalList={})
It regenerates the tree so that it is an exact copy of the present DataFrame.
std::vector< std::string > fImportedFiles
The list of dataset files imported.
Double_t fTotalDuration
The total integrated run time of selected files.
std::string fFilePattern
A glob file pattern that must be satisfied by all files.
std::vector< Double_t > fFilterGreaterThan
If the corresponding element is not empty it will check if the metadata member is greater.
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
std::vector< Double_t > fFilterEqualsTo
If the corresponding element is not empty it will check if the metadata member is equal.
void InitFromConfigFile() override
Initialization of specific TRestDataSet members through an RML file.
TRestDataSet & operator=(TRestDataSet &dS)
Operator to copy TRestDataSet metadata.
~TRestDataSet()
Default destructor.
Data provider and manager in REST.
std::string ReplaceMetadataMembers(const std::string &instr, Int_t precision=8)
It will replace the data members contained inside the string given as input. The data members in the ...
@ REST_Info
+show most of the information for each step
TClass * GetClassQuick()
Get the type of a "class" object, returning the wrapped type identifier "TClass".
time_t StringToTimeStamp(std::string time)
A method to convert a date/time formatted string to a timestamp.
std::vector< std::string > Split(std::string in, std::string separator, bool allowBlankString=false, bool removeWhiteSpaces=false, int startPos=-1)
Split the input string according to the given separator. Returning a vector of fragments.
Double_t StringToDouble(std::string in)
Gets a double from a string.
std::string ToDateTimeString(time_t time)
Format time_t into string.
std::string Replace(std::string in, std::string thisString, std::string byThisString, size_t fromPosition=0, Int_t N=0)
Replace any occurrences of thisString by byThisString inside string in.
std::string metadata
The associated metadata member used to register the relevant quantity.
std::string description
A user given description that can be used to define the relevant quantity.
std::string strategy
It determines how to produce the relevant quantity (accumulate/unique/last/max/min/append/extend)
std::string value
The quantity value.