300#include "TRestDataSet.h"
303#include "TRestTools.h"
346 if (
fTree !=
nullptr) {
347 RESTWarning <<
"Tree has already been loaded. Skipping TRestDataSet::GenerateDataSet ... "
356 RESTError <<
"File selection is empty " <<
RESTendl;
362 std::set<std::string> finalList;
363 finalList.insert(
"runOrigin");
364 finalList.insert(
"eventID");
365 finalList.insert(
"timeStamp");
369 finalList.insert(obsFromList.begin(), obsFromList.end());
372 ROOT::EnableImplicitMT();
374 ROOT::DisableImplicitMT();
376 RESTInfo <<
"Initializing dataset" <<
RESTendl;
379 RESTInfo <<
"Making cuts" <<
RESTendl;
384 RESTInfo <<
"Adding column to dataset: " << cName <<
RESTendl;
385 finalList.emplace(cName);
389 RegenerateTree(std::vector<std::string>(finalList.begin(), finalList.end()));
391 RESTInfo <<
" - Dataset generated!" <<
RESTendl;
398 RESTInfo <<
"Generating snapshot." <<
RESTendl;
399 std::string user = getenv(
"USER");
400 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
401 if (!finalList.empty())
402 fDataFrame.Snapshot(
"AnalysisTree", fOutName, finalList);
404 fDataFrame.Snapshot(
"AnalysisTree", fOutName);
406 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
407 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fOutName);
409 TFile* f = TFile::Open(fOutName.c_str());
410 fTree = (TChain*)f->Get(
"AnalysisTree");
422 if (!time_stamp_end || !time_stamp_start) {
423 RESTError <<
"TRestDataSet::FileSelect. Start or end dates not properly formed. Please, check "
424 "REST_StringHelper::StringToTimeStamp documentation for valid formats"
431 RESTInfo <<
"TRestDataSet::FileSelection. Starting file selection." <<
RESTendl;
432 RESTInfo <<
"Total files : " << fileNames.size() <<
RESTendl;
433 RESTInfo <<
"This process may take long computation time in case there are many files." <<
RESTendl;
436 std::cout <<
"Processing file selection.";
438 for (
const auto& file : fileNames) {
439 if (cnt % 100 == 0) {
440 std::cout << std::endl;
441 std::cout <<
"Files processed: " << cnt <<
" ." << std::flush;
445 std::cout <<
"." << std::flush;
446 double runStart = run.GetStartTimestamp();
447 double runEnd = run.GetEndTimestamp();
449 if (runStart < time_stamp_start || runEnd > time_stamp_end) {
450 RESTInfo <<
"Rejecting file out of date range: " << file <<
RESTendl;
457 std::string mdValue = run.GetMetadataMember(md);
460 if (mdValue.find(
fFilterContains[n]) == std::string::npos) accept =
false;
475 if (!accept)
continue;
478 for (
auto& [name, properties] :
fQuantity) {
482 if (properties.strategy ==
"accumulate") {
484 properties.value = StringWithPrecision(val, 2);
487 if (properties.strategy ==
"max")
489 properties.value = value;
491 if (properties.strategy ==
"min")
493 properties.value = value;
495 if (properties.strategy ==
"unique") {
496 if (properties.value.empty())
497 properties.value = value;
498 else if (properties.value != value) {
499 RESTWarning <<
"TRestDataSet::FileSelection. Relevant quantity retrieval." <<
RESTendl;
500 RESTWarning <<
"A unique metadata member used for the `" << name
501 <<
"` quantity is not unique!!" <<
RESTendl;
502 RESTWarning <<
"Pre-registered value : " << properties.value <<
" New value : " << value
507 if (properties.strategy ==
"last") properties.value = value;
509 if (properties.strategy.find(
"append") != std::string::npos) {
511 std::string appender = properties.strategy.substr(properties.strategy.find(
"append") + 6);
512 if (properties.value.empty())
513 properties.value = value;
516 if (properties.value.find(value) == std::string::npos)
517 properties.value += appender + value;
520 if (properties.strategy.find(
"extend") != std::string::npos) {
522 std::string appender = properties.strategy.substr(properties.strategy.find(
"extend") + 6);
523 if (properties.value.empty())
524 properties.value = value;
526 properties.value += appender + value;
534 fTotalDuration += run.GetEndTimestamp() - run.GetStartTimestamp();
537 std::cout << std::endl;
569 if (cut ==
nullptr)
return df;
571 auto paramCut = cut->GetParamCut();
572 auto obsList = df.GetColumnNames();
573 for (
const auto& [param, condition] : paramCut) {
574 if (std::find(obsList.begin(), obsList.end(), param) != obsList.end()) {
575 std::string pCut = param + condition;
576 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
577 df = df.Filter(pCut);
579 RESTWarning <<
" Cut observable " << param <<
" not found in observable list, skipping..."
584 auto cutString = cut->GetCutStrings();
585 for (
const auto& pCut : cutString) {
587 for (
const auto& obs : obsList) {
588 if (pCut.find(obs) != std::string::npos) {
589 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
590 df = df.Filter(pCut);
597 RESTWarning <<
" Cut string " << pCut <<
" not found in observable list, skipping..." <<
RESTendl;
611 if (*nEntries == (
long long unsigned int)
GetTree()->
GetEntries())
return *nEntries;
612 RESTWarning <<
"TRestDataSet::GetEntries. Number of tree entries is not the same as RDataFrame entries."
614 RESTWarning <<
"Returning RDataFrame entries" <<
RESTendl;
633 std::string evalFormula = formula;
634 for (
auto const& [name, properties] :
fQuantity)
637 df = df.Define(columnName, evalFormula);
660 RESTMetadata <<
" Observables added:" <<
RESTendl;
661 RESTMetadata <<
" -------------------------" <<
RESTendl;
668 RESTMetadata <<
" Metadata filters: " <<
RESTendl;
669 RESTMetadata <<
" ----------------- " <<
RESTendl;
674 RESTMetadata <<
" - " << mdFilter <<
".";
689 RESTMetadata <<
" Relevant quantities: " <<
RESTendl;
690 RESTMetadata <<
" -------------------- " <<
RESTendl;
692 for (
auto const& [name, properties] :
fQuantity) {
693 RESTMetadata <<
" - Name : " << name <<
". Value : " << properties.value
694 <<
". Strategy: " << properties.strategy <<
RESTendl;
695 RESTMetadata <<
" - Metadata: " << properties.metadata <<
RESTendl;
696 RESTMetadata <<
" - Description: " << properties.description <<
RESTendl;
702 RESTMetadata <<
" New columns added to generated dataframe: " <<
RESTendl;
703 RESTMetadata <<
" ---------------------------------------- " <<
RESTendl;
705 RESTMetadata <<
" - Name : " << cName <<
RESTendl;
706 RESTMetadata <<
" - Expression: " << cExpression <<
RESTendl;
713 RESTMetadata <<
"This is a combined dataset." <<
RESTendl;
714 RESTMetadata <<
" -------------------- " <<
RESTendl;
715 RESTMetadata <<
" - Relevant quantities have been removed!" <<
RESTendl;
716 RESTMetadata <<
" - Dataset metadata properties correspond to the first file in the list."
719 RESTMetadata <<
"List of imported files: " <<
RESTendl;
720 RESTMetadata <<
" -------------------- " <<
RESTendl;
726 RESTMetadata <<
" - Multithreading was enabled" <<
RESTendl;
728 RESTMetadata <<
" - Multithreading was NOT enabled" <<
RESTendl;
740 TiXmlElement* filterDefinition =
GetElement(
"filter");
741 while (filterDefinition !=
nullptr) {
742 std::string metadata =
GetFieldValue(
"metadata", filterDefinition);
743 if (metadata.empty() || metadata ==
"Not defined") {
744 RESTError <<
"Filter key defined without metadata member!" <<
RESTendl;
750 std::string contains =
GetFieldValue(
"contains", filterDefinition);
751 if (contains ==
"Not defined") contains =
"";
765 TiXmlElement* observablesDefinition =
GetElement(
"observables");
766 while (observablesDefinition !=
nullptr) {
767 std::string observables =
GetFieldValue(
"list", observablesDefinition);
768 if (observables.empty() || observables ==
"Not defined") {
769 RESTError <<
"<observables key does not contain a list!" <<
RESTendl;
781 TiXmlElement* obsProcessDefinition =
GetElement(
"processObservables");
782 while (obsProcessDefinition !=
nullptr) {
783 std::string observables =
GetFieldValue(
"list", obsProcessDefinition);
784 if (observables.empty() || observables ==
"Not defined") {
785 RESTError <<
"<processObservables key does not contain a list!" <<
RESTendl;
791 for (
const auto& l : obsList) {
792 std::string processObsPattern = l +
"_*";
800 TiXmlElement* quantityDefinition =
GetElement(
"quantity");
801 while (quantityDefinition !=
nullptr) {
802 std::string name =
GetFieldValue(
"name", quantityDefinition);
803 if (name.empty() || name ==
"Not defined") {
804 RESTError <<
"<quantity key does not contain a name!" <<
RESTendl;
808 std::string metadata =
GetFieldValue(
"metadata", quantityDefinition);
809 if (metadata.empty() || metadata ==
"Not defined") {
810 RESTError <<
"<quantity key does not contain a metadata value!" <<
RESTendl;
814 std::string strategy =
GetFieldValue(
"strategy", quantityDefinition);
815 if (strategy.empty() || strategy ==
"Not defined") {
819 std::string description =
GetFieldValue(
"description", quantityDefinition);
833 TiXmlElement* columnDefinition =
GetElement(
"addColumn");
834 while (columnDefinition !=
nullptr) {
836 if (name.empty() || name ==
"Not defined") {
837 RESTError <<
"<define key does not contain a name name!" <<
RESTendl;
841 std::string expression =
GetFieldValue(
"expression", columnDefinition);
842 if (expression.empty() || expression ==
"Not defined") {
843 RESTError <<
"<addColumn key does not contain a expression value!" <<
RESTendl;
869 RESTInfo <<
"Exporting dataset" <<
RESTendl;
871 std::vector<std::string> columns =
fDataFrame.GetColumnNames();
872 if (!excludeColumns.empty()) {
873 columns.erase(std::remove_if(columns.begin(), columns.end(),
874 [&excludeColumns](std::string elem) {
875 return std::find(excludeColumns.begin(), excludeColumns.end(),
876 elem) != excludeColumns.end();
880 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
881 std::string user = getenv(
"USER");
882 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
883 fDataFrame.Snapshot(
"AnalysisTree", fOutName, columns);
885 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
886 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fOutName);
888 TFile* f = TFile::Open(fOutName.c_str());
889 fTree = (TChain*)f->Get(
"AnalysisTree");
894 if (excludeColumns.empty()) {
895 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
896 std::string user = getenv(
"USER");
897 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
898 fDataFrame.Snapshot(
"AnalysisTree", fOutName);
900 TFile* f = TFile::Open(fOutName.c_str());
901 fTree = (TChain*)f->Get(
"AnalysisTree");
904 std::vector<std::string> dataTypes;
905 for (
int n = 0; n <
fTree->GetListOfBranches()->GetEntries(); n++) {
906 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
907 std::string type =
fTree->GetLeaf((TString)bName)->GetTypeName();
908 dataTypes.push_back(type);
909 if (type !=
"Double_t" && type !=
"Int_t") {
910 RESTError <<
"Branch name : " << bName <<
" is type : " << type <<
RESTendl;
911 RESTError <<
"Only Int_t and Double_t types are allowed for "
912 "exporting to ASCII table"
914 RESTError <<
"File will not be generated" <<
RESTendl;
919 FILE* f = fopen(filename.c_str(),
"wt");
921 fprintf(f,
"### TRestDataSet generated file\n");
922 fprintf(f,
"### \n");
926 fprintf(f,
"### Accumulated run time (seconds) : %lf\n",
fTotalDuration);
927 fprintf(f,
"### Accumulated run time (hours) : %lf\n",
fTotalDuration / 3600.);
928 fprintf(f,
"### Accumulated run time (days) : %lf\n",
fTotalDuration / 3600. / 24.);
934 fprintf(f,
"### Metadata filters : \n");
937 fprintf(f,
"### - %s.", md.c_str());
947 fprintf(f,
"### Relevant quantities: \n");
948 for (
auto& [name, properties] :
fQuantity) {
949 fprintf(f,
"### - %s : %s - %s\n", name.c_str(), properties.value.c_str(),
950 properties.description.c_str());
953 fprintf(f,
"### Observables list: ");
955 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
956 fprintf(f,
" %s", bName.c_str());
960 fprintf(f,
"### Data starts here\n");
963 std::string obsListStr =
"";
964 for (
const auto& l : obsNames) {
965 if (!obsListStr.empty()) obsListStr +=
":";
970 fTree->Draw((TString)obsListStr,
"",
"goff");
972 for (
unsigned int n = 0; n <
fTree->GetEntries(); n++) {
974 std::string bName =
fTree->GetListOfBranches()->At(m)->GetName();
975 if (m > 0) fprintf(f,
"\t");
976 if (dataTypes[m] ==
"Double_t")
977 if (bName ==
"timeStamp")
978 fprintf(f,
"%010.0lf",
fTree->GetVal(m)[n]);
980 fprintf(f,
"%05.3e",
fTree->GetVal(m)[n]);
982 fprintf(f,
"%06d", (Int_t)
fTree->GetVal(m)[n]);
990 fDataFrame.Snapshot(
"AnalysisTree", filename);
992 TFile* f = TFile::Open(filename.c_str(),
"UPDATE");
993 std::string name = this->GetName();
994 if (name.empty()) name =
"mock";
995 this->
Write(name.c_str());
1001 RESTInfo <<
"Dataset generated: " << filename <<
RESTendl;
1008 SetName(dS.GetName());
1034 auto obsNames = GetObservablesList();
1036 if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
1037 RESTError <<
"Cannot merge dataSets with different observable list " <<
RESTendl;
1060 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1065 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1066 if (file !=
nullptr) {
1067 TIter nextkey(file->GetListOfKeys());
1069 while ((key = (TKey*)nextkey())) {
1070 std::string kName = key->GetClassName();
1079 if (dS ==
nullptr) {
1080 RESTError << fileName <<
" is not a valid dataSet" <<
RESTendl;
1085 ROOT::EnableImplicitMT();
1087 ROOT::DisableImplicitMT();
1089 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fileName);
1091 fTree = (TChain*)file->Get(
"AnalysisTree");
1104 for (
const auto& fN : fileNames)
1106 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1111 auto it = fileNames.begin();
1112 while (it != fileNames.end()) {
1113 std::string fileName = *it;
1114 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1115 bool isValid =
false;
1116 if (file !=
nullptr) {
1117 TIter nextkey(file->GetListOfKeys());
1119 while ((key = (TKey*)nextkey())) {
1120 std::string kName = key->GetClassName();
1131 isValid =
Merge(*dS);
1134 if (isValid) count++;
1138 RESTError <<
"Cannot open " << fileName <<
RESTendl;
1142 RESTError << fileName <<
" is not a valid dataSet skipping..." <<
RESTendl;
1143 it = fileNames.erase(it);
1149 if (fileNames.empty()) {
1150 RESTError <<
"File selection is empty, dataSet will not be imported " <<
RESTendl;
1154 RESTInfo <<
"Opening list of files. First file: " << fileNames[0] <<
RESTendl;
1155 fDataFrame = ROOT::RDataFrame(
"AnalysisTree", fileNames);
1157 if (
fTree !=
nullptr) {
1161 fTree =
new TChain(
"AnalysisTree");
1163 for (
const auto& fN : fileNames)
fTree->Add((TString)fN);
std::vector< std::string > GetObservableNames()
It returns a vector with strings containing all the observables that exist in the analysis tree.
A class to help on cuts definitions. To be used with TRestAnalysisTree.
It allows to group a number of runs that satisfy given metadata conditions.
std::vector< std::string > fFilterContains
If not empty it will check if the metadata member contains the string.
virtual std::vector< std::string > FileSelection()
Function to determine the filenames that satisfy the dataset conditions.
std::vector< Double_t > fFilterLowerThan
If the corresponding element is not empty it will check if the metadata member is lower.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
TChain * fTree
A pointer to the generated tree.
void Import(const std::string &fileName)
This function imports metadata from a root file; it imports metadata info from the previous dataSet whi...
std::map< std::string, RelevantQuantity > fQuantity
The properties of a relevant quantity that we want to store together with the dataset.
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
std::vector< std::pair< std::string, std::string > > fColumnNameExpressions
A list of new columns together with their corresponding expressions added to the dataset.
ROOT::RDF::RNode DefineColumn(const std::string &columnName, const std::string &formula)
This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define ...
Double_t fEndTime
TimeStamp for the end time of the last file.
ROOT::RDF::RNode fDataFrame
The resulting RDF::RNode object after initialization.
size_t GetNumberOfBranches()
Number of variables (or observables)
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
TRestDataSet()
Default constructor.
Double_t GetTotalTimeInSeconds() const
It returns the accumulated run time in seconds.
ROOT::RDF::RNode MakeCut(const TRestCut *cut)
This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts....
void GenerateDataSet()
This function generates the data frame with the filelist and column names (or observables) that have ...
Bool_t fMT
A flag to enable Multithreading during dataframe generation.
TRestCut * fCut
Parameter cuts over the selected dataset.
void Export(const std::string &filename, std::vector< std::string > excludeColumns={})
It will generate an output file with the dataset compilation. Only the selected branches and the file...
std::string fFilterStartTime
All the selected runs will have a starting date after fStartTime.
Bool_t Merge(const TRestDataSet &dS)
This function merges different TRestDataSet metadata into the current dataSet.
std::vector< std::string > fFilterMetadata
A list of metadata members where filters will be applied.
std::vector< std::string > fFileSelection
A list populated by the FileSelection method using the conditions of the dataset.
std::vector< std::string > GetFileSelection()
It returns a list of the files that have been finally selected.
std::string fFilterEndTime
All the selected runs will have an ending date before fEndTime.
Double_t fStartTime
TimeStamp for the start time of the first file.
std::vector< std::string > fObservablesList
It contains a list of the observables that will be added to the final tree or exported file.
TTree * GetTree() const
Gives access to the tree.
Bool_t fMergedDataset
It keeps track if the generated dataset is a pure dataset or a merged one.
void Initialize() override
This function initializes different parameters from the TRestDataSet.
void RegenerateTree(std::vector< std::string > finalList={})
It regenerates the tree so that it is an exact copy of the present DataFrame.
std::vector< std::string > fImportedFiles
The list of dataset files imported.
Double_t fTotalDuration
The total integrated run time of selected files.
std::string fFilePattern
A glob file pattern that must be satisfied by all files.
std::vector< Double_t > fFilterGreaterThan
If the corresponding element is not empty it will check if the metadata member is greater.
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
std::vector< Double_t > fFilterEqualsTo
If the corresponding element is not empty it will check if the metadata member is equal.
void InitFromConfigFile() override
Initialization of specific TRestDataSet members through an RML file.
TRestDataSet & operator=(TRestDataSet &dS)
Operator to copy TRestDataSet metadata.
~TRestDataSet()
Default destructor.
Data provider and manager in REST.
std::string ReplaceMetadataMembers(const std::string &instr, Int_t precision=8)
It will replace the data members contained inside the string given as input. The data members in the ...
@ REST_Info
+show most of the information for each step
TClass * GetClassQuick()
Get the type of a "class" object, returning the wrapped type identifier "TClass".
time_t StringToTimeStamp(std::string time)
A method to convert a date/time formatted string to a timestamp.
std::vector< std::string > Split(std::string in, std::string separator, bool allowBlankString=false, bool removeWhiteSpaces=false, int startPos=-1)
Split the input string according to the given separator. Returning a vector of fragments.
Double_t StringToDouble(std::string in)
Gets a double from a string.
std::string ToDateTimeString(time_t time)
Format time_t into string.
std::string Replace(std::string in, std::string thisString, std::string byThisString, size_t fromPosition=0, Int_t N=0)
Replace any occurrences of thisString by byThisString inside string in.
std::string metadata
The associated metadata member used to register the relevant quantity.
std::string description
A user given description that can be used to define the relevant quantity.
std::string strategy
It determines how to produce the relevant quantity (accumulate/unique/last/max/min/append/extend)
std::string value
The quantity value.