REST-for-Physics  v2.3
Rare Event Searches ToolKit for Physics
TRestDataSet.cxx
1 /*************************************************************************
2  * This file is part of the REST software framework. *
3  * *
4  * Copyright (C) 2016 GIFNA/TREX (University of Zaragoza) *
5  * For more information see https://gifna.unizar.es/trex *
6  * *
7  * REST is free software: you can redistribute it and/or modify *
8  * it under the terms of the GNU General Public License as published by *
9  * the Free Software Foundation, either version 3 of the License, or *
10  * (at your option) any later version. *
11  * *
12  * REST is distributed in the hope that it will be useful, *
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15  * GNU General Public License for more details. *
16  * *
17  * You should have a copy of the GNU General Public License along with *
18  * REST in $REST_PATH/LICENSE. *
19  * If not, see https://www.gnu.org/licenses/. *
20  * For the list of contributors see $REST_PATH/CREDITS. *
21  *************************************************************************/
22 
293 #include "TRestDataSet.h"
294 
295 #include "TRestRun.h"
296 #include "TRestTools.h"
297 
298 ClassImp(TRestDataSet);
299 
304 
// Constructor receiving a configuration file name, which is forwarded to the
// TRestMetadata base-class constructor.
// NOTE(review): listing line 320 is absent between the two lines below, so the body
// shown here may be incomplete; `name` is not used in the visible lines. Confirm
// against the repository before editing.
319 TRestDataSet::TRestDataSet(const char* cfgFileName, const std::string& name) : TRestMetadata(cfgFileName) {
321 }
322 
327 
332 void TRestDataSet::Initialize() { SetSectionName(this->ClassName()); }
333 
// Body of the dataset generation routine (its signature line is not visible in this
// listing). Visible steps: return early if fTree is already set, run FileSelection()
// when no files are selected yet, build the list of columns to keep, create the
// RDataFrame over the selected files, add the user-defined columns and finally call
// RegenerateTree() with the column list.
339  if (fTree != nullptr) {
340  RESTWarning << "Tree has already been loaded. Skipping TRestDataSet::GenerateDataSet ... "
341  << RESTendl;
342  return;
343  }
344
345  if (fFileSelection.empty()) FileSelection();
346
347  // We are not ready yet
348  if (fFileSelection.empty()) {
349  RESTError << "File selection is empty " << RESTendl;
350  return;
351  }
352
// Only the first selected file is opened to learn which observables exist.
// The three columns below are always kept.
354  TRestRun run(fFileSelection.front());
355  std::vector<std::string> finalList;
356  finalList.push_back("runOrigin");
357  finalList.push_back("eventID");
358  finalList.push_back("timeStamp");
359
// Individually requested observables are kept only if the first file contains them.
360  auto obsNames = run.GetAnalysisTree()->GetObservableNames();
361  for (const auto& obs : fObservablesList) {
362  if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
363  finalList.push_back(obs);
364  } else {
365  RESTWarning << " Observable " << obs << " not found in observable list, skipping..." << RESTendl;
366  }
367  }
368
// Every observable whose name starts with an entry of fProcessObservablesList is kept
// (find(...) == 0 is a prefix match).
369  for (const auto& name : obsNames) {
370  for (const auto& pcs : fProcessObservablesList) {
371  if (name.find(pcs) == 0) finalList.push_back(name);
372  }
373  }
374
375  // Remove duplicated observables if any
// Note: the sort also reorders the columns alphabetically.
376  std::sort(finalList.begin(), finalList.end());
377  finalList.erase(std::unique(finalList.begin(), finalList.end()), finalList.end());
378
379  if (fMT)
380  ROOT::EnableImplicitMT();
381  else
382  ROOT::DisableImplicitMT();
383
384  RESTInfo << "Initializing dataset" << RESTendl;
385  fDataFrame = ROOT::RDataFrame("AnalysisTree", fFileSelection);
386
// NOTE(review): listing line 388 is absent after this log message, so no statement
// applying the cuts is visible here; confirm against the repository.
387  RESTInfo << "Making cuts" << RESTendl;
389
390  // Adding new user columns added to the dataset
// DefineColumn() returns a new node; it is assigned back so the columns chain up.
391  for (const auto& [cName, cExpression] : fColumnNameExpressions) {
392  RESTInfo << "Adding column to dataset: " << cName << RESTendl;
393  finalList.emplace_back(cName);
394  fDataFrame = DefineColumn(cName, cExpression);
395  }
396
397  RegenerateTree(finalList);
398
399  RESTInfo << " - Dataset generated!" << RESTendl;
400 }
401 
405 void TRestDataSet::RegenerateTree(std::vector<std::string> finalList) {
406  RESTInfo << "Generating snapshot." << RESTendl;
407  std::string user = getenv("USER");
408  std::string fOutName = "/tmp/rest_output_" + user + ".root";
409  if (!finalList.empty())
410  fDataFrame.Snapshot("AnalysisTree", fOutName, finalList);
411  else
412  fDataFrame.Snapshot("AnalysisTree", fOutName);
413 
414  RESTInfo << "Re-importing analysis tree." << RESTendl;
415  fDataFrame = ROOT::RDataFrame("AnalysisTree", fOutName);
416 
417  TFile* f = TFile::Open(fOutName.c_str());
418  fTree = (TChain*)f->Get("AnalysisTree");
419 }
420 
424 std::vector<std::string> TRestDataSet::FileSelection() {
425  fFileSelection.clear();
426 
427  std::time_t time_stamp_start = REST_StringHelper::StringToTimeStamp(fFilterStartTime);
428  std::time_t time_stamp_end = REST_StringHelper::StringToTimeStamp(fFilterEndTime);
429 
430  if (!time_stamp_end || !time_stamp_start) {
431  RESTError << "TRestDataSet::FileSelect. Start or end dates not properly formed. Please, check "
432  "REST_StringHelper::StringToTimeStamp documentation for valid formats"
433  << RESTendl;
434  return fFileSelection;
435  }
436 
437  std::vector<std::string> fileNames = TRestTools::GetFilesMatchingPattern(fFilePattern);
438 
439  RESTInfo << "TRestDataSet::FileSelection. Starting file selection." << RESTendl;
440  RESTInfo << "Total files : " << fileNames.size() << RESTendl;
441  RESTInfo << "This process may take long computation time in case there are many files." << RESTendl;
442 
443  fTotalDuration = 0;
444  std::cout << "Processing file selection.";
445  int cnt = 1;
446  for (const auto& file : fileNames) {
447  if (cnt % 100 == 0) {
448  std::cout << std::endl;
449  std::cout << "Files processed: " << cnt << " ." << std::flush;
450  }
451  cnt++;
452  TRestRun run(file);
453  std::cout << "." << std::flush;
454  double runStart = run.GetStartTimestamp();
455  double runEnd = run.GetEndTimestamp();
456 
457  if (runStart < time_stamp_start || runEnd > time_stamp_end) {
458  RESTInfo << "Rejecting file out of date range: " << file << RESTendl;
459  continue;
460  }
461 
462  int n = 0;
463  bool accept = true;
464  for (const auto& md : fFilterMetadata) {
465  std::string mdValue = run.GetMetadataMember(md);
466 
467  if (!fFilterContains[n].empty())
468  if (mdValue.find(fFilterContains[n]) == std::string::npos) accept = false;
469 
470  if (fFilterGreaterThan[n] != -1) {
471  if (StringToDouble(mdValue) <= fFilterGreaterThan[n]) accept = false;
472  }
473 
474  if (fFilterLowerThan[n] != -1)
475  if (StringToDouble(mdValue) >= fFilterLowerThan[n]) accept = false;
476 
477  if (fFilterEqualsTo[n] != -1)
478  if (StringToDouble(mdValue) != fFilterEqualsTo[n]) accept = false;
479 
480  n++;
481  }
482 
483  if (!accept) continue;
484 
485  Double_t acc = 0;
486  for (auto& [name, properties] : fQuantity) {
487  std::string value = run.ReplaceMetadataMembers(properties.metadata);
488  const Double_t val = REST_StringHelper::StringToDouble(value);
489 
490  if (properties.strategy == "accumulate") {
491  acc += val;
492  properties.value = StringWithPrecision(val, 2);
493  }
494 
495  if (properties.strategy == "max")
496  if (properties.value.empty() || REST_StringHelper::StringToDouble(properties.value) < val)
497  properties.value = value;
498 
499  if (properties.strategy == "min")
500  if (properties.value.empty() || REST_StringHelper::StringToDouble(properties.value) > val)
501  properties.value = value;
502 
503  if (properties.strategy == "unique") {
504  if (properties.value.empty())
505  properties.value = value;
506  else if (properties.value != value) {
507  RESTWarning << "TRestDataSet::FileSelection. Relevant quantity retrieval." << RESTendl;
508  RESTWarning << "A unique metadata member used for the `" << name
509  << "` quantity is not unique!!" << RESTendl;
510  RESTWarning << "Pre-registered value : " << properties.value << " New value : " << value
511  << RESTendl;
512  }
513  }
514 
515  if (properties.strategy == "last") properties.value = value;
516  }
517 
518  if (run.GetStartTimestamp() < fStartTime) fStartTime = run.GetStartTimestamp();
519 
520  if (run.GetEndTimestamp() > fEndTime) fEndTime = run.GetEndTimestamp();
521 
522  fTotalDuration += run.GetEndTimestamp() - run.GetStartTimestamp();
523  fFileSelection.push_back(file);
524  }
525  std::cout << std::endl;
526 
527  return fFileSelection;
528 }
529 
536 ROOT::RDF::RNode TRestDataSet::Range(size_t from, size_t to) { return fDataFrame.Range(from, to); }
537 
542 ROOT::RDF::RNode TRestDataSet::ApplyRange(size_t from, size_t to) {
543  fDataFrame = fDataFrame.Range(from, to);
544  RegenerateTree();
545  return fDataFrame;
546 }
547 
554 ROOT::RDF::RNode TRestDataSet::MakeCut(const TRestCut* cut) {
555  auto df = fDataFrame;
556 
557  if (cut == nullptr) return df;
558 
559  auto paramCut = cut->GetParamCut();
560  auto obsList = df.GetColumnNames();
561  for (const auto& [param, condition] : paramCut) {
562  if (std::find(obsList.begin(), obsList.end(), param) != obsList.end()) {
563  std::string pCut = param + condition;
564  RESTDebug << "Applying cut " << pCut << RESTendl;
565  df = df.Filter(pCut);
566  } else {
567  RESTWarning << " Cut observable " << param << " not found in observable list, skipping..."
568  << RESTendl;
569  }
570  }
571 
572  auto cutString = cut->GetCutStrings();
573  for (const auto& pCut : cutString) {
574  bool added = false;
575  for (const auto& obs : obsList) {
576  if (pCut.find(obs) != std::string::npos) {
577  RESTDebug << "Applying cut " << pCut << RESTendl;
578  df = df.Filter(pCut);
579  added = true;
580  break;
581  }
582  }
583 
584  if (!added) {
585  RESTWarning << " Cut string " << pCut << " not found in observable list, skipping..." << RESTendl;
586  }
587  }
588 
589  return df;
590 }
591 
// Tail of the entry-count routine (its signature line is not visible in this listing).
// It books a Count() on fDataFrame and compares the result with
// GetTree()->GetEntries(). A mismatch only produces warnings: the RDataFrame count is
// the value returned in both cases.
598  auto nEntries = fDataFrame.Count();
599  if (*nEntries == (long long unsigned int)GetTree()->GetEntries()) return *nEntries;
600  RESTWarning << "TRestDataSet::GetEntries. Number of tree entries is not the same as RDataFrame entries."
601  << RESTendl;
602  RESTWarning << "Returning RDataFrame entries" << RESTendl;
603  return *nEntries;
604 }
605 
618 ROOT::RDF::RNode TRestDataSet::DefineColumn(const std::string& columnName, const std::string& formula) {
619  auto df = fDataFrame;
620 
621  std::string evalFormula = formula;
622  for (auto const& [name, properties] : fQuantity)
623  evalFormula = REST_StringHelper::Replace(evalFormula, name, properties.value);
624 
625  df = df.Define(columnName, evalFormula);
626 
627  return df;
628 }
629 
635 
// Body of the metadata print-out (the signature line and listing line 635 content are
// not visible in this listing). Each section below is printed only when the
// corresponding member is not empty.

// Time span, file pattern (split into path and name) and accumulated run time.
636  RESTMetadata << " - StartTime : " << REST_StringHelper::ToDateTimeString(fStartTime) << RESTendl;
637  RESTMetadata << " - EndTime : " << REST_StringHelper::ToDateTimeString(fEndTime) << RESTendl;
638  RESTMetadata << " - Path : " << TRestTools::SeparatePathAndName(fFilePattern).first << RESTendl;
639  RESTMetadata << " - File pattern : " << TRestTools::SeparatePathAndName(fFilePattern).second << RESTendl;
640  RESTMetadata << " " << RESTendl;
641  RESTMetadata << " - Accumulated run time (seconds) : " << fTotalDuration << RESTendl;
642  RESTMetadata << " - Accumulated run time (hours) : " << fTotalDuration / 3600. << RESTendl;
643  RESTMetadata << " - Accumulated run time (days) : " << fTotalDuration / 3600. / 24. << RESTendl;
644
645  RESTMetadata << " " << RESTendl;
646
647  if (!fObservablesList.empty()) {
648  RESTMetadata << " Single observables added:" << RESTendl;
649  RESTMetadata << " -------------------------" << RESTendl;
650  for (const auto& l : fObservablesList) RESTMetadata << " - " << l << RESTendl;
651
652  RESTMetadata << " " << RESTendl;
653  }
654
655  if (!fProcessObservablesList.empty()) {
656  RESTMetadata << " Process observables added: " << RESTendl;
657  RESTMetadata << " -------------------------- " << RESTendl;
658  for (const auto& l : fProcessObservablesList) RESTMetadata << " - " << l << RESTendl;
659
660  RESTMetadata << " " << RESTendl;
661  }
662
// The filter date range is printed only inside this block, i.e. only when at least one
// metadata filter exists.
663  if (!fFilterMetadata.empty()) {
664  RESTMetadata << " Metadata filters: " << RESTendl;
665  RESTMetadata << " ----------------- " << RESTendl;
666  RESTMetadata << " - StartTime : " << fFilterStartTime << RESTendl;
667  RESTMetadata << " - EndTime : " << fFilterEndTime << RESTendl;
// n indexes the four filter vectors in parallel with fFilterMetadata; a numeric
// condition equal to -1 is treated as not set and is not printed.
668  int n = 0;
669  for (const auto& mdFilter : fFilterMetadata) {
670  RESTMetadata << " - " << mdFilter << ".";
671
672  if (!fFilterContains[n].empty()) RESTMetadata << " Contains: " << fFilterContains[n];
673  if (fFilterGreaterThan[n] != -1) RESTMetadata << " Greater than: " << fFilterGreaterThan[n];
674  if (fFilterLowerThan[n] != -1) RESTMetadata << " Lower than: " << fFilterLowerThan[n];
675  if (fFilterEqualsTo[n] != -1) RESTMetadata << " Equals to: " << fFilterEqualsTo[n];
676
677  RESTMetadata << RESTendl;
678  n++;
679  }
680
681  RESTMetadata << " " << RESTendl;
682  }
683
684  if (!fQuantity.empty()) {
685  RESTMetadata << " Relevant quantities: " << RESTendl;
686  RESTMetadata << " -------------------- " << RESTendl;
687
688  for (auto const& [name, properties] : fQuantity) {
689  RESTMetadata << " - Name : " << name << ". Value : " << properties.value
690  << ". Strategy: " << properties.strategy << RESTendl;
691  RESTMetadata << " - Metadata: " << properties.metadata << RESTendl;
692  RESTMetadata << " - Description: " << properties.description << RESTendl;
693  RESTMetadata << " " << RESTendl;
694  }
695  }
696
697  if (!fColumnNameExpressions.empty()) {
698  RESTMetadata << " New columns added to generated dataframe: " << RESTendl;
699  RESTMetadata << " ---------------------------------------- " << RESTendl;
700  for (const auto& [cName, cExpression] : fColumnNameExpressions) {
701  RESTMetadata << " - Name : " << cName << RESTendl;
702  RESTMetadata << " - Expression: " << cExpression << RESTendl;
703  RESTMetadata << " " << RESTendl;
704  }
705  }
706
// Extra section for datasets built by importing several dataset files.
707  if (fMergedDataset) {
708  RESTMetadata << " " << RESTendl;
709  RESTMetadata << "This is a combined dataset." << RESTendl;
710  RESTMetadata << " -------------------- " << RESTendl;
711  RESTMetadata << " - Relevant quantities have been removed!" << RESTendl;
712  RESTMetadata << " - Dataset metadata properties correspond to the first file in the list."
713  << RESTendl;
714  RESTMetadata << " " << RESTendl;
715  RESTMetadata << "List of imported files: " << RESTendl;
716  RESTMetadata << " -------------------- " << RESTendl;
717  for (const auto& fn : fImportedFiles) RESTMetadata << " - " << fn << RESTendl;
718  }
719
720  RESTMetadata << " " << RESTendl;
721  if (fMT)
722  RESTMetadata << " - Multithreading was enabled" << RESTendl;
723  else
724  RESTMetadata << " - Multithreading was NOT enabled" << RESTendl;
725
726  RESTMetadata << "----" << RESTendl;
727 }
728 
734 
// Body of the configuration reader (its signature line is not visible in this listing;
// several listing lines are also absent between the sections below). It walks the
// <filter>, <observables>, <processObservables>, <quantity> and <addColumn> elements
// and finally instantiates the TRestCut child metadata. A missing mandatory field
// terminates the program through exit(1).

// <filter>: one entry is appended to each of the five parallel filter vectors.
736  TiXmlElement* filterDefinition = GetElement("filter");
737  while (filterDefinition != nullptr) {
738  std::string metadata = GetFieldValue("metadata", filterDefinition);
739  if (metadata.empty() || metadata == "Not defined") {
740  RESTError << "Filter key defined without metadata member!" << RESTendl;
741  exit(1);
742  }
743
744  fFilterMetadata.push_back(metadata);
745
746  std::string contains = GetFieldValue("contains", filterDefinition);
747  if (contains == "Not defined") contains = "";
// NOTE(review): the numeric conditions are later compared against -1 to decide whether
// they are in use, so StringToDouble presumably yields -1 for an undefined field.
// TODO confirm.
748  Double_t greaterThan = StringToDouble(GetFieldValue("greaterThan", filterDefinition));
749  Double_t lowerThan = StringToDouble(GetFieldValue("lowerThan", filterDefinition));
750  Double_t equalsTo = StringToDouble(GetFieldValue("equalsTo", filterDefinition));
751
752  fFilterContains.push_back(contains);
753  fFilterGreaterThan.push_back(greaterThan);
754  fFilterLowerThan.push_back(lowerThan);
755  fFilterEqualsTo.push_back(equalsTo);
756
757  filterDefinition = GetNextElement(filterDefinition);
758  }
759
// <observables list="a,b,c">: the comma-separated names are appended to
// fObservablesList.
761  TiXmlElement* observablesDefinition = GetElement("observables");
762  while (observablesDefinition != nullptr) {
763  std::string observables = GetFieldValue("list", observablesDefinition);
764  if (observables.empty() || observables == "Not defined") {
765  RESTError << "<observables key does not contain a list!" << RESTendl;
766  exit(1);
767  }
768
769  std::vector<std::string> obsList = REST_StringHelper::Split(observables, ",");
770
771  fObservablesList.insert(fObservablesList.end(), obsList.begin(), obsList.end());
772
773  observablesDefinition = GetNextElement(observablesDefinition);
774  }
775
// <processObservables list="p1,p2">: same format, appended to fProcessObservablesList.
777  TiXmlElement* obsProcessDefinition = GetElement("processObservables");
778  while (obsProcessDefinition != nullptr) {
779  std::string observables = GetFieldValue("list", obsProcessDefinition);
780  if (observables.empty() || observables == "Not defined") {
781  RESTError << "<processObservables key does not contain a list!" << RESTendl;
782  exit(1);
783  }
784
785  std::vector<std::string> obsList = REST_StringHelper::Split(observables, ",");
786
787  for (const auto& l : obsList) fProcessObservablesList.push_back(l);
788
789  obsProcessDefinition = GetNextElement(obsProcessDefinition);
790  }
791
// <quantity>: name and metadata are mandatory; strategy falls back to "unique".
// The value starts empty. A repeated name overwrites the previous entry in fQuantity.
793  TiXmlElement* quantityDefinition = GetElement("quantity");
794  while (quantityDefinition != nullptr) {
795  std::string name = GetFieldValue("name", quantityDefinition);
796  if (name.empty() || name == "Not defined") {
797  RESTError << "<quantity key does not contain a name!" << RESTendl;
798  exit(1);
799  }
800
801  std::string metadata = GetFieldValue("metadata", quantityDefinition);
802  if (metadata.empty() || metadata == "Not defined") {
803  RESTError << "<quantity key does not contain a metadata value!" << RESTendl;
804  exit(1);
805  }
806
807  std::string strategy = GetFieldValue("strategy", quantityDefinition);
808  if (strategy.empty() || strategy == "Not defined") {
809  strategy = "unique";
810  }
811
812  std::string description = GetFieldValue("description", quantityDefinition);
813
814  RelevantQuantity quantity;
815  quantity.metadata = metadata;
816  quantity.strategy = strategy;
817  quantity.description = description;
818  quantity.value = "";
819
820  fQuantity[name] = quantity;
821
822  quantityDefinition = GetNextElement(quantityDefinition);
823  }
824
// <addColumn name="..." expression="...">: stored in declaration order.
// NOTE(review): the first error message below mentions "<define" and repeats "name",
// although the element being read is <addColumn>.
826  TiXmlElement* columnDefinition = GetElement("addColumn");
827  while (columnDefinition != nullptr) {
828  std::string name = GetFieldValue("name", columnDefinition);
829  if (name.empty() || name == "Not defined") {
830  RESTError << "<define key does not contain a name name!" << RESTendl;
831  exit(1);
832  }
833
834  std::string expression = GetFieldValue("expression", columnDefinition);
835  if (expression.empty() || expression == "Not defined") {
836  RESTError << "<addColumn key does not contain a expression value!" << RESTendl;
837  exit(1);
838  }
839
840  fColumnNameExpressions.push_back({name, expression});
841
842  columnDefinition = GetNextElement(columnDefinition);
843  }
844
845  fCut = (TRestCut*)InstantiateChildMetadata("TRestCut");
846 }
847 
861 void TRestDataSet::Export(const std::string& filename, std::vector<std::string> excludeColumns) {
862  RESTInfo << "Exporting dataset" << RESTendl;
863 
864  std::vector<std::string> columns = fDataFrame.GetColumnNames();
865  if (!excludeColumns.empty()) {
866  columns.erase(std::remove_if(columns.begin(), columns.end(),
867  [&excludeColumns](std::string elem) {
868  return std::find(excludeColumns.begin(), excludeColumns.end(),
869  elem) != excludeColumns.end();
870  }),
871  columns.end());
872 
873  RESTInfo << "Re-Generating snapshot." << RESTendl;
874  std::string user = getenv("USER");
875  std::string fOutName = "/tmp/rest_output_" + user + ".root";
876  fDataFrame.Snapshot("AnalysisTree", fOutName, columns);
877 
878  RESTInfo << "Re-importing analysis tree." << RESTendl;
879  fDataFrame = ROOT::RDataFrame("AnalysisTree", fOutName);
880 
881  TFile* f = TFile::Open(fOutName.c_str());
882  fTree = (TChain*)f->Get("AnalysisTree");
883  }
884 
885  if (TRestTools::GetFileNameExtension(filename) == "txt" ||
886  TRestTools::GetFileNameExtension(filename) == "csv") {
887  if (excludeColumns.empty()) {
888  RESTInfo << "Re-Generating snapshot." << RESTendl;
889  std::string user = getenv("USER");
890  std::string fOutName = "/tmp/rest_output_" + user + ".root";
891  fDataFrame.Snapshot("AnalysisTree", fOutName);
892 
893  TFile* f = TFile::Open(fOutName.c_str());
894  fTree = (TChain*)f->Get("AnalysisTree");
895  }
896 
897  std::vector<std::string> dataTypes;
898  for (int n = 0; n < fTree->GetListOfBranches()->GetEntries(); n++) {
899  std::string bName = fTree->GetListOfBranches()->At(n)->GetName();
900  std::string type = fTree->GetLeaf((TString)bName)->GetTypeName();
901  dataTypes.push_back(type);
902  if (type != "Double_t" && type != "Int_t") {
903  RESTError << "Branch name : " << bName << " is type : " << type << RESTendl;
904  RESTError << "Only Int_t and Double_t types are allowed for "
905  "exporting to ASCII table"
906  << RESTendl;
907  RESTError << "File will not be generated" << RESTendl;
908  return;
909  }
910  }
911 
912  FILE* f = fopen(filename.c_str(), "wt");
914  fprintf(f, "### TRestDataSet generated file\n");
915  fprintf(f, "### \n");
916  fprintf(f, "### StartTime : %s\n", fFilterStartTime.c_str());
917  fprintf(f, "### EndTime : %s\n", fFilterEndTime.c_str());
918  fprintf(f, "###\n");
919  fprintf(f, "### Accumulated run time (seconds) : %lf\n", fTotalDuration);
920  fprintf(f, "### Accumulated run time (hours) : %lf\n", fTotalDuration / 3600.);
921  fprintf(f, "### Accumulated run time (days) : %lf\n", fTotalDuration / 3600. / 24.);
922  fprintf(f, "###\n");
923  fprintf(f, "### Data path : %s\n", TRestTools::SeparatePathAndName(fFilePattern).first.c_str());
924  fprintf(f, "### File pattern : %s\n", TRestTools::SeparatePathAndName(fFilePattern).second.c_str());
925  fprintf(f, "###\n");
926  if (!fFilterMetadata.empty()) {
927  fprintf(f, "### Metadata filters : \n");
928  int n = 0;
929  for (const auto& md : fFilterMetadata) {
930  fprintf(f, "### - %s.", md.c_str());
931  if (!fFilterContains[n].empty()) fprintf(f, " Contains: %s.", fFilterContains[n].c_str());
932  if (fFilterGreaterThan[n] != -1) fprintf(f, " Greater than: %6.3lf.", fFilterGreaterThan[n]);
933  if (fFilterLowerThan[n] != -1) fprintf(f, " Lower than: %6.3lf.", fFilterLowerThan[n]);
934  if (fFilterEqualsTo[n] != -1) fprintf(f, " Equals to: %6.3lf.", fFilterLowerThan[n]);
935  fprintf(f, "\n");
936  n++;
937  }
938  }
939  fprintf(f, "###\n");
940  fprintf(f, "### Relevant quantities: \n");
941  for (auto& [name, properties] : fQuantity) {
942  fprintf(f, "### - %s : %s - %s\n", name.c_str(), properties.value.c_str(),
943  properties.description.c_str());
944  }
945  fprintf(f, "###\n");
946  fprintf(f, "### Observables list: ");
947  for (int n = 0; n < fTree->GetListOfBranches()->GetEntries(); n++) {
948  std::string bName = fTree->GetListOfBranches()->At(n)->GetName();
949  fprintf(f, " %s", bName.c_str());
950  }
951  fprintf(f, "\n");
952  fprintf(f, "###\n");
953  fprintf(f, "### Data starts here\n");
954 
955  auto obsNames = fDataFrame.GetColumnNames();
956  std::string obsListStr = "";
957  for (const auto& l : obsNames) {
958  if (!obsListStr.empty()) obsListStr += ":";
959  obsListStr += l;
960  }
961 
962  // We do this so that later we can recover the values using TTree::GetVal
963  fTree->Draw((TString)obsListStr, "", "goff");
964 
965  for (unsigned int n = 0; n < fTree->GetEntries(); n++) {
966  for (unsigned int m = 0; m < GetNumberOfBranches(); m++) {
967  std::string bName = fTree->GetListOfBranches()->At(m)->GetName();
968  if (m > 0) fprintf(f, "\t");
969  if (dataTypes[m] == "Double_t")
970  if (bName == "timeStamp")
971  fprintf(f, "%010.0lf", fTree->GetVal(m)[n]);
972  else
973  fprintf(f, "%05.3e", fTree->GetVal(m)[n]);
974  else
975  fprintf(f, "%06d", (Int_t)fTree->GetVal(m)[n]);
976  }
977  fprintf(f, "\n");
978  }
979  fclose(f);
980 
981  return;
982  } else if (TRestTools::GetFileNameExtension(filename) == "root") {
983  fDataFrame.Snapshot("AnalysisTree", filename);
984 
985  TFile* f = TFile::Open(filename.c_str(), "UPDATE");
986  std::string name = this->GetName();
987  if (name.empty()) name = "mock";
988  this->Write(name.c_str());
989  f->Close();
990  } else {
991  RESTWarning << "TRestDataSet::Export. Extension " << TRestTools::GetFileNameExtension(filename)
992  << " not recognized" << RESTendl;
993  }
994  RESTInfo << "Dataset generated: " << filename << RESTendl;
995 }
996 
// Body of the assignment from another dataset `dS` (its signature line is not visible
// in this listing). The name, filter settings, time range, file pattern, observable
// lists, relevant quantities and added columns are copied through the getters of dS.
// NOTE(review): listing lines 1008 and 1017 are absent, so further members may be
// copied there; confirm against the repository.
1001  SetName(dS.GetName());
1002  fFilterStartTime = dS.GetFilterStartTime();
1003  fFilterEndTime = dS.GetFilterEndTime();
1004  fStartTime = dS.GetStartTime();
1005  fEndTime = dS.GetEndTime();
1006  fFilePattern = dS.GetFilePattern();
1007  fObservablesList = dS.GetObservablesList();
1009  fProcessObservablesList = dS.GetProcessObservablesList();
1010  fFilterMetadata = dS.GetFilterMetadata();
1011  fFilterContains = dS.GetFilterContains();
1012  fFilterGreaterThan = dS.GetFilterGreaterThan();
1013  fFilterLowerThan = dS.GetFilterLowerThan();
1014  fFilterEqualsTo = dS.GetFilterEqualsTo();
1015  fQuantity = dS.GetQuantity();
1016  fColumnNameExpressions = dS.GetAddedColumns();
// fCut is a pointer member: whatever GetCut() returns is stored as is, no TRestCut
// object is constructed here.
1018  fCut = dS.GetCut();
1019
1020  return *this;
1021 }
1022 
// Body of the merge with another dataset `dS` (its signature line is not visible in
// this listing). On success the time range is widened to include dS, the file selection
// of dS is appended to fFileSelection and true is returned.
// NOTE(review): obsNames below comes from this object's own GetObservablesList(), not
// from dS, and the loop rejects the merge when an observable IS found in it. As written,
// the check appears to compare the dataset with itself and to return false whenever
// fObservablesList is not empty (assuming GetObservablesList() returns
// fObservablesList). Comparing against dS.GetObservablesList() and rejecting when an
// observable is NOT found looks like the intent. Confirm and fix in the source.
1028  auto obsNames = GetObservablesList();
1029  for (const auto& obs : fObservablesList) {
1030  if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
1031  RESTError << "Cannot merge dataSets with different observable list " << RESTendl;
1032  return false;
1033  }
1034  }
1035
1036  if (fStartTime > dS.GetStartTime()) fStartTime = dS.GetStartTime();
1037  if (fEndTime < dS.GetEndTime()) fEndTime = dS.GetEndTime();
1038
1039  auto fileSelection = dS.GetFileSelection();
1040  fFileSelection.insert(fFileSelection.end(), fileSelection.begin(), fileSelection.end());
1041
// NOTE(review): listing line 1042 is absent here; another member may be updated at
// this point in the real source.
1043
1044  return true;
1045 }
1046 
1052 void TRestDataSet::Import(const std::string& fileName) {
1053  if (TRestTools::GetFileNameExtension(fileName) != "root") {
1054  RESTError << "Datasets can only be imported from root files" << RESTendl;
1055  return;
1056  }
1057 
1058  TRestDataSet* dS = nullptr;
1059  TFile* file = TFile::Open(fileName.c_str(), "READ");
1060  if (file != nullptr) {
1061  TIter nextkey(file->GetListOfKeys());
1062  TKey* key;
1063  while ((key = (TKey*)nextkey())) {
1064  std::string kName = key->GetClassName();
1065  if (REST_Reflection::GetClassQuick(kName.c_str()) != nullptr &&
1066  REST_Reflection::GetClassQuick(kName.c_str())->InheritsFrom("TRestDataSet")) {
1067  dS = file->Get<TRestDataSet>(key->GetName());
1068  *this = *dS;
1069  }
1070  }
1071  }
1072 
1073  if (dS == nullptr) {
1074  RESTError << fileName << " is not a valid dataSet" << RESTendl;
1075  return;
1076  }
1077 
1078  if (fMT)
1079  ROOT::EnableImplicitMT();
1080  else
1081  ROOT::DisableImplicitMT();
1082 
1083  fDataFrame = ROOT::RDataFrame("AnalysisTree", fileName);
1084 
1085  fTree = (TChain*)file->Get("AnalysisTree");
1086 }
1087 
// Imports and combines several dataset ROOT files.
// The first valid dataset found is assigned to this object; the following ones are
// combined through Merge(). Files that are not valid datasets are removed from the
// local list. Finally fDataFrame and a new TChain (fTree) are built over the remaining
// files, the dataset is flagged as merged and the relevant quantities are cleared.
// If any name lacks the `root` extension, nothing is imported.
1097 void TRestDataSet::Import(std::vector<std::string> fileNames) {
1098  for (const auto& fN : fileNames)
1099  if (TRestTools::GetFileNameExtension(fN) != "root") {
1100  RESTError << "Datasets can only be imported from root files" << RESTendl;
1101  return;
1102  }
1103
// count = number of datasets successfully taken so far; 0 means the next valid one
// becomes the base dataset.
1104  int count = 0;
1105  auto it = fileNames.begin();
1106  while (it != fileNames.end()) {
1107  std::string fileName = *it;
// NOTE(review): no Close()/delete of this TFile is visible in this function.
1108  TFile* file = TFile::Open(fileName.c_str(), "READ");
1109  bool isValid = false;
1110  if (file != nullptr) {
1111  TIter nextkey(file->GetListOfKeys());
1112  TKey* key;
1113  while ((key = (TKey*)nextkey())) {
1114  std::string kName = key->GetClassName();
1115  if (REST_Reflection::GetClassQuick(kName.c_str()) != nullptr &&
1116  REST_Reflection::GetClassQuick(kName.c_str())->InheritsFrom("TRestDataSet")) {
1117  TRestDataSet* dS = file->Get<TRestDataSet>(key->GetName());
// NOTE(review): listing line 1118 is absent here, so the call below may be guarded by
// a condition that is not visible; confirm against the repository.
1119  dS->PrintMetadata();
1120
1121  if (count == 0) {
1122  *this = *dS;
1123  isValid = true;
1124  } else {
1125  isValid = Merge(*dS);
1126  }
1127
1128  if (isValid) count++;
1129  }
1130  }
1131  } else {
1132  RESTError << "Cannot open " << fileName << RESTendl;
1133  }
1134
// erase() returns the iterator following the removed element, so the loop continues
// safely without incrementing.
1135  if (!isValid) {
1136  RESTError << fileName << " is not a valid dataSet skipping..." << RESTendl;
1137  it = fileNames.erase(it);
1138  } else {
1139  ++it;
1140  }
1141  }
1142
1143  if (fileNames.empty()) {
1144  RESTError << "File selection is empty, dataSet will not be imported " << RESTendl;
1145  return;
1146  }
1147
1148  RESTInfo << "Opening list of files. First file: " << fileNames[0] << RESTendl;
1149  fDataFrame = ROOT::RDataFrame("AnalysisTree", fileNames);
1150
// Any previously held tree is deleted and replaced by a chain over the accepted files.
1151  if (fTree != nullptr) {
1152  delete fTree;
1153  fTree = nullptr;
1154  }
1155  fTree = new TChain("AnalysisTree");
1156
1157  for (const auto& fN : fileNames) fTree->Add((TString)fN);
1158
1159  fMergedDataset = true;
1160  fImportedFiles = fileNames;
1161
1162  fQuantity.clear();
1163 }
std::vector< std::string > GetObservableNames()
It returns a vector with strings containing all the observables that exist in the analysis tree.
A class to help on cuts definitions. To be used with TRestAnalysisTree.
Definition: TRestCut.h:31
It allows to group a number of runs that satisfy given metadata conditions.
Definition: TRestDataSet.h:34
std::vector< std::string > fFilterContains
If not empty it will check if the metadata member contains the string.
Definition: TRestDataSet.h:70
virtual std::vector< std::string > FileSelection()
Function to determine the filenames that satisfy the dataset conditions.
std::vector< Double_t > fFilterLowerThan
If the corresponding element is not -1, it will check that the metadata member is lower than that value.
Definition: TRestDataSet.h:76
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
TChain * fTree
A pointer to the generated tree.
Definition: TRestDataSet.h:118
std::vector< std::string > fProcessObservablesList
It contains a list of process names; every observable whose name starts with one of them will be added.
Definition: TRestDataSet.h:64
void Import(const std::string &fileName)
This function imports metadata from a root file it import metadata info from the previous dataSet whi...
std::map< std::string, RelevantQuantity > fQuantity
The properties of a relevant quantity that we want to store together with the dataset.
Definition: TRestDataSet.h:82
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
std::vector< std::pair< std::string, std::string > > fColumnNameExpressions
A list of new columns together with its corresponding expressions added to the dataset.
Definition: TRestDataSet.h:106
ROOT::RDF::RNode DefineColumn(const std::string &columnName, const std::string &formula)
This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define ...
Double_t fEndTime
TimeStamp for the end time of the last file.
Definition: TRestDataSet.h:97
ROOT::RDF::RNode fDataFrame
The resulting RDF::RNode object after initialization.
Definition: TRestDataSet.h:115
size_t GetNumberOfBranches()
Number of variables (or observables)
Definition: TRestDataSet.h:160
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
TRestDataSet()
Default constructor.
Double_t GetTotalTimeInSeconds() const
It returns the accumulated run time in seconds.
Definition: TRestDataSet.h:166
ROOT::RDF::RNode MakeCut(const TRestCut *cut)
This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts....
void GenerateDataSet()
This function generates the data frame with the filelist and column names (or observables) that have ...
Bool_t fMT
A flag to enable Multithreading during dataframe generation.
Definition: TRestDataSet.h:109
TRestCut * fCut
Parameter cuts over the selected dataset.
Definition: TRestDataSet.h:85
void Export(const std::string &filename, std::vector< std::string > excludeColumns={})
It will generate an output file with the dataset compilation. Only the selected branches and the file...
std::string fFilterStartTime
All the selected runs will have a starting date after fFilterStartTime.
Definition: TRestDataSet.h:52
Bool_t Merge(const TRestDataSet &dS)
This function merges different TRestDataSet metadata into the current dataSet.
std::vector< std::string > GetFileSelection()
It returns a list of the files that have been finally selected.
Definition: TRestDataSet.h:163
std::vector< std::string > fFilterMetadata
A list of metadata members where filters will be applied.
Definition: TRestDataSet.h:67
std::vector< std::string > fFileSelection
A list populated by the FileSelection method using the conditions of the dataset.
Definition: TRestDataSet.h:91
std::string fFilterEndTime
All the selected runs will have an ending date before fFilterEndTime.
Definition: TRestDataSet.h:55
Double_t fStartTime
TimeStamp for the start time of the first file.
Definition: TRestDataSet.h:94
std::vector< std::string > fObservablesList
It contains a list of the observables that will be added to the final tree or exported file.
Definition: TRestDataSet.h:61
Bool_t fMergedDataset
It keeps track if the generated dataset is a pure dataset or a merged one.
Definition: TRestDataSet.h:100
void Initialize() override
This function initializes different parameters from the TRestDataSet.
void RegenerateTree(std::vector< std::string > finalList={})
It regenerates the tree so that it is an exact copy of the present DataFrame.
std::vector< std::string > fImportedFiles
The list of dataset files imported.
Definition: TRestDataSet.h:103
Double_t fTotalDuration
The total integrated run time of selected files.
Definition: TRestDataSet.h:88
std::string fFilePattern
A glob file pattern that must be satisfied by all files.
Definition: TRestDataSet.h:58
TTree * GetTree() const
Gives access to the tree.
Definition: TRestDataSet.h:138
std::vector< Double_t > fFilterGreaterThan
If the corresponding element is not empty it will check if the metadata member is greater.
Definition: TRestDataSet.h:73
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
std::vector< Double_t > fFilterEqualsTo
If the corresponding element is not empty it will check if the metadata member is equal.
Definition: TRestDataSet.h:79
void InitFromConfigFile() override
Initialization of specific TRestDataSet members through an RML file.
TRestDataSet & operator=(TRestDataSet &dS)
Operator to copy TRestDataSet metadata.
~TRestDataSet()
Default destructor.
A base class for any REST metadata class.
Definition: TRestMetadata.h:74
virtual void PrintMetadata()
Implement it in the derived metadata class to print out specific metadata information.
endl_t RESTendl
Termination flag object for TRestStringOutput.
TiXmlElement * GetElement(std::string eleDeclare, TiXmlElement *e=nullptr)
Get an xml element from a given parent element, according to its declaration.
Int_t LoadConfigFromFile(const std::string &configFilename, const std::string &sectionName="")
Give the file name, find out the corresponding section. Then call the main starter.
TRestMetadata * InstantiateChildMetadata(int index, std::string pattern="")
This method will retrieve a new TRestMetadata instance of a child element of the present TRestMetadat...
virtual void InitFromConfigFile()
To make settings from rml file. This method must be implemented in the derived class.
TRestStringOutput::REST_Verbose_Level GetVerboseLevel()
Returns the verbose level as a REST_Verbose_Level enumerator.
std::string GetFieldValue(std::string parName, TiXmlElement *e)
Returns the field value of an xml element which has the specified name.
void SetSectionName(std::string sName)
Sets the section name and clears the section content.
std::string fConfigFileName
Full name of the rml file.
virtual Int_t Write(const char *name=nullptr, Int_t option=0, Int_t bufsize=0)
Overrides the Write() method, taking fStore into account.
TiXmlElement * GetNextElement(TiXmlElement *e)
Get the next sibling xml element of this element, with same eleDeclare.
Data provider and manager in REST.
Definition: TRestRun.h:18
std::string ReplaceMetadataMembers(const std::string &instr, Int_t precision=8)
It will replace the data members contained inside the string given as input. The data members in the ...
Definition: TRestRun.cxx:1652
@ REST_Info
+show most of the information for each step
static std::pair< std::string, std::string > SeparatePathAndName(const std::string &fullname)
Separate path and filename in a full path+filename string, returns a pair of string.
Definition: TRestTools.cxx:813
static std::string GetFileNameExtension(const std::string &fullname)
Gets the file extension as the substring found after the latest ".".
Definition: TRestTools.cxx:823
static std::vector< std::string > GetFilesMatchingPattern(std::string pattern, bool unlimited=false)
Returns a list of files whose name match the pattern string. Key word is "*". e.g....
Definition: TRestTools.cxx:976
TClass * GetClassQuick(std::string type)
time_t StringToTimeStamp(std::string time)
A method to convert a date/time formatted string to a timestamp.
std::vector< std::string > Split(std::string in, std::string separator, bool allowBlankString=false, bool removeWhiteSpaces=false, int startPos=-1)
Split the input string according to the given separator. Returning a vector of fragments.
Double_t StringToDouble(std::string in)
Gets a double from a string.
std::string ToDateTimeString(time_t time)
Format time_t into string.
std::string Replace(std::string in, std::string thisString, std::string byThisString, size_t fromPosition=0, Int_t N=0)
Replace any occurrences of thisString by byThisString inside string in.
std::string metadata
The associated metadata member used to register the relevant quantity.
Definition: TRestDataSet.h:38
std::string description
A user given description that can be used to define the relevant quantity.
Definition: TRestDataSet.h:44
std::string strategy
It determines how to produce the relevant quantity (accumulate/unique/last/max/min)
Definition: TRestDataSet.h:41
std::string value
The quantity value.
Definition: TRestDataSet.h:47