diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx
index b03a088571aa8..c2118f12e57d6 100644
--- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx
+++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx
@@ -1523,7 +1523,17 @@ public:
       // RemoveDuplicates should preserve ordering of the columns: it might be meaningful.
       RDFInternal::RemoveDuplicates(columnNames);
 
-      auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot");
+      std::vector selectedColumns;
+      try {
+         selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot");
+      }
+      catch (const std::runtime_error &) {
+         // No columns were found, try again but consider all input data source columns
+         if (auto ds = GetDataSource())
+            selectedColumns = RDFInternal::ConvertRegexToColumns(ds->GetColumnNames(), columnNameRegexp, "Snapshot");
+         else
+            throw; // rethrow the caught exception itself; `throw e;` would throw a (possibly sliced) copy
+      }
 
       if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") {
          RDFInternal::RemoveRNTupleSubFields(selectedColumns);
diff --git a/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx b/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx
index bfab62d1db189..0d34e600bd659 100644
--- a/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx
+++ b/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx
@@ -216,6 +216,7 @@ public:
    RInterfaceBase(RDFDetail::RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister);
 
    ColumnNames_t GetColumnNames();
+   ColumnNames_t GetDatasetTopLevelFieldNames();
 
    std::string GetColumnType(std::string_view column);
 
diff --git a/tree/dataframe/inc/ROOT/RNTupleDS.hxx b/tree/dataframe/inc/ROOT/RNTupleDS.hxx
index 1019b53953ee4..9cc87d5dc6cf5 100644
--- a/tree/dataframe/inc/ROOT/RNTupleDS.hxx
+++ b/tree/dataframe/inc/ROOT/RNTupleDS.hxx
@@ -126,6 +126,7 @@ class RNTupleDS final : public ROOT::RDF::RDataSource {
    std::unordered_map fFieldId2QualifiedName;
    std::vector fColumnNames;
    std::vector fColumnTypes;
+   std::vector fTopLevelFieldNames;
    /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers
    /// to new page sources when the files in the chain change.
    std::vector> fActiveColumnReaders;
@@ -222,6 +223,7 @@ public:
    void SetNSlots(unsigned int nSlots) final;
    std::size_t GetNFiles() const final { return fFileNames.empty() ? 1 : fFileNames.size(); }
    const std::vector &GetColumnNames() const final { return fColumnNames; }
+   const std::vector &GetTopLevelFieldNames() const final { return fTopLevelFieldNames; }
    bool HasColumn(std::string_view colName) const final;
    std::string GetTypeName(std::string_view colName) const final;
    std::vector> GetEntryRanges() final;
diff --git a/tree/dataframe/src/RInterfaceBase.cxx b/tree/dataframe/src/RInterfaceBase.cxx
index 8d3ce5ae82353..733a47a141ce0 100644
--- a/tree/dataframe/src/RInterfaceBase.cxx
+++ b/tree/dataframe/src/RInterfaceBase.cxx
@@ -100,6 +100,28 @@ ROOT::RDF::ColumnNames_t ROOT::RDF::RInterfaceBase::GetColumnNames()
    return ret;
 }
 
+/////////////////////////////////////////////////////////////////////////////
+/// \brief Retrieve the names of the top-level fields of the dataset
+///
+/// For data sources that support hierarchical dataset schemas, such as TTree
+/// or RNTuple, this function will retrieve the names of top-level fields. For
+/// example, if the schema contains a user class with a data member, only
+/// the name of the top-level field containing the user class object would be
+/// reported, but not the name of the data member sub-field.
+///
+/// For all other data sources, returns the list of all available dataset columns.
+ROOT::RDF::ColumnNames_t ROOT::RDF::RInterfaceBase::GetDatasetTopLevelFieldNames()
+{
+   ROOT::RDF::ColumnNames_t ret;
+   if (auto ds = GetDataSource()) {
+      ret = ROOT::Internal::RDF::GetTopLevelFieldNames(*ds);
+      // Sorting to be consistent with GetColumnNames
+      std::sort(ret.begin(), ret.end());
+   }
+
+   return ret;
+}
+
 /////////////////////////////////////////////////////////////////////////////
 /// \brief Return the type of a given column as a string.
 /// \return the type of the required column.
diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx
index d887bb7f43ce2..7cb8e5de2b500 100644
--- a/tree/dataframe/src/RNTupleDS.cxx
+++ b/tree/dataframe/src/RNTupleDS.cxx
@@ -393,6 +393,12 @@ ROOT::RDF::RNTupleDS::RNTupleDS(std::unique_ptr pag
 
    AddField(fPrincipalDescriptor, "", fPrincipalDescriptor.GetFieldZeroId(),
             std::vector());
+
+   auto topLevelFields = fPrincipalDescriptor.GetTopLevelFields();
+   const auto nTopLevelFields = std::distance(topLevelFields.begin(), topLevelFields.end());
+   fTopLevelFieldNames.reserve(nTopLevelFields);
+   for (const auto &field : topLevelFields)
+      fTopLevelFieldNames.push_back(field.GetFieldName());
 }
 
 namespace {
diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx
index 9e25a83f1a060..8f186297cc78a 100644
--- a/tree/dataframe/test/datasource_ntuple.cxx
+++ b/tree/dataframe/test/datasource_ntuple.cxx
@@ -828,3 +828,15 @@ TEST(RNTupleDS, Int8)
    std::vector expected{0, 1, 2, 3, 4};
    EXPECT_EQ(expected, df.Take("x").GetValue());
 }
+
+TEST_F(RNTupleDSTest, GetTopLevelFieldNames)
+{
+   ROOT::RDataFrame df{fNtplName, fFileName};
+
+   EXPECT_VEC_EQ(
+      df.GetDatasetTopLevelFieldNames(),
+      std::vector{"VecElectron", "electron", "energy", "jets", "nElectron", "nnlo", "pt", "rvec", "tag"});
+   EXPECT_VEC_EQ(df.GetColumnNames(),
+                 std::vector{"VecElectron", "VecElectron.pt", "electron", "electron.pt", "energy", "jets",
+                             "nElectron", "nnlo", "pt", "rvec", "tag"});
+}
diff --git a/tree/dataframe/test/datasource_tree.cxx b/tree/dataframe/test/datasource_tree.cxx
index 3c8235d315f5e..8b4a63e8a99ba 100644
--- a/tree/dataframe/test/datasource_tree.cxx
+++ b/tree/dataframe/test/datasource_tree.cxx
@@ -57,6 +57,15 @@ TEST(RTTreeDS, BranchWithNestedSameName)
    expect_vec_eq(branchNames, expectedBranchNames);
 }
 
+TEST(RTTreeDS, GetDatasetTopLevelFieldNames)
+{
+   InputTreeRAII dataset{};
+
+   ROOT::RDataFrame df{dataset.fTreeName, dataset.fFileName};
+   auto branchNames = df.GetDatasetTopLevelFieldNames();
+   expect_vec_eq(branchNames, std::vector{"toplevel"});
+}
+
 #ifdef R__USE_IMT
 struct Dataset20164RAIII {
    const char *fTreeName{"tree_20164"};