From e582e05ada8d3d583c0d2ee6d994103494eff42a Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 21 Apr 2026 08:58:30 +0200 Subject: [PATCH 1/2] [df] Allow retrieving dataset top-level field names Useful for instance when calling Snapshot and wanting to select only top-level field names, possibly further filtering the list (e.g. through regexes). --- .../dataframe/inc/ROOT/RDF/RInterfaceBase.hxx | 1 + tree/dataframe/inc/ROOT/RNTupleDS.hxx | 2 ++ tree/dataframe/src/RInterfaceBase.cxx | 22 +++++++++++++++++++ tree/dataframe/src/RNTupleDS.cxx | 6 +++++ tree/dataframe/test/datasource_ntuple.cxx | 12 ++++++++++ tree/dataframe/test/datasource_tree.cxx | 9 ++++++++ 6 files changed, 52 insertions(+) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx b/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx index 5b7fd7334999d..aab35dbd7a838 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx @@ -209,6 +209,7 @@ public: RInterfaceBase(RDFDetail::RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister); ColumnNames_t GetColumnNames(); + ColumnNames_t GetDatasetTopLevelFieldNames(); std::string GetColumnType(std::string_view column); diff --git a/tree/dataframe/inc/ROOT/RNTupleDS.hxx b/tree/dataframe/inc/ROOT/RNTupleDS.hxx index ca87b581da04a..cac8d23da7385 100644 --- a/tree/dataframe/inc/ROOT/RNTupleDS.hxx +++ b/tree/dataframe/inc/ROOT/RNTupleDS.hxx @@ -135,6 +135,7 @@ class RNTupleDS final : public ROOT::RDF::RDataSource { std::unordered_map fFieldId2QualifiedName; std::vector fColumnNames; std::vector fColumnTypes; + std::vector fTopLevelFieldNames; /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers /// to new page sources when the files in the chain change. std::vector> fActiveColumnReaders; @@ -235,6 +236,7 @@ public: void SetNSlots(unsigned int nSlots) final; std::size_t GetNFiles() const final { return fFileNames.empty() ? 
1 : fFileNames.size(); } const std::vector &GetColumnNames() const final { return fColumnNames; } + const std::vector &GetTopLevelFieldNames() const final { return fTopLevelFieldNames; } bool HasColumn(std::string_view colName) const final; std::string GetTypeName(std::string_view colName) const final; std::vector> GetEntryRanges() final; diff --git a/tree/dataframe/src/RInterfaceBase.cxx b/tree/dataframe/src/RInterfaceBase.cxx index 8d3ce5ae82353..733a47a141ce0 100644 --- a/tree/dataframe/src/RInterfaceBase.cxx +++ b/tree/dataframe/src/RInterfaceBase.cxx @@ -100,6 +100,28 @@ ROOT::RDF::ColumnNames_t ROOT::RDF::RInterfaceBase::GetColumnNames() return ret; } +///////////////////////////////////////////////////////////////////////////// +/// \brief Retrieve the names of the top-level fields of the dataset +/// +/// For data sources that support hierarchical dataset schemas, such as TTree +/// or RNTuple, this function will retrieve the names of top-level fields. For +/// example, if the schema contains a user class with a data member, only +/// the name of the top-level field containing the user class object would be +/// reported, but not the name of the data member sub-field. +/// +/// For all other data sources, returns the list of all available dataset columns. The returned names are sorted; the list is empty if there is no data source. +ROOT::RDF::ColumnNames_t ROOT::RDF::RInterfaceBase::GetDatasetTopLevelFieldNames() +{ + ROOT::RDF::ColumnNames_t ret; + if (auto ds = GetDataSource()) { + ret = ROOT::Internal::RDF::GetTopLevelFieldNames(*ds); + // Sort the names so that the ordering is consistent with GetColumnNames + std::sort(ret.begin(), ret.end()); + } + + return ret; +} + ///////////////////////////////////////////////////////////////////////////// /// \brief Return the type of a given column as a string. /// \return the type of the required column. 
diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx index 8458036bfdd0c..25c097e6dcd1e 100644 --- a/tree/dataframe/src/RNTupleDS.cxx +++ b/tree/dataframe/src/RNTupleDS.cxx @@ -448,6 +448,12 @@ ROOT::RDF::RNTupleDS::RNTupleDS(std::unique_ptr pag AddField(fPrincipalDescriptor, "", fPrincipalDescriptor.GetFieldZeroId(), std::vector()); + + auto topLevelFields = fPrincipalDescriptor.GetTopLevelFields(); + const auto nTopLevelFields = std::distance(topLevelFields.begin(), topLevelFields.end()); + fTopLevelFieldNames.reserve(nTopLevelFields); + for (const auto &field : topLevelFields) + fTopLevelFieldNames.push_back(field.GetFieldName()); } namespace { diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx index 446cf9d2ad4a6..e3b647305744a 100644 --- a/tree/dataframe/test/datasource_ntuple.cxx +++ b/tree/dataframe/test/datasource_ntuple.cxx @@ -855,3 +855,15 @@ TEST(RNTupleDS, Int8) std::vector expected{0, 1, 2, 3, 4}; EXPECT_EQ(expected, df.Take("x").GetValue()); } + +TEST_F(RNTupleDSTest, GetTopLevelFieldNames) +{ + ROOT::RDataFrame df{fNtplName, fFileName}; + + EXPECT_VEC_EQ( + df.GetDatasetTopLevelFieldNames(), + std::vector{"VecElectron", "electron", "energy", "jets", "nElectron", "nnlo", "pt", "rvec", "tag"}); + EXPECT_VEC_EQ(df.GetColumnNames(), + std::vector{"VecElectron", "VecElectron.pt", "electron", "electron.pt", "energy", "jets", + "nElectron", "nnlo", "pt", "rvec", "tag"}); +} diff --git a/tree/dataframe/test/datasource_tree.cxx b/tree/dataframe/test/datasource_tree.cxx index 3c8235d315f5e..8b4a63e8a99ba 100644 --- a/tree/dataframe/test/datasource_tree.cxx +++ b/tree/dataframe/test/datasource_tree.cxx @@ -57,6 +57,15 @@ TEST(RTTreeDS, BranchWithNestedSameName) expect_vec_eq(branchNames, expectedBranchNames); } +TEST(RTTreeDS, GetDatasetTopLevelFieldNames) +{ + InputTreeRAII dataset{}; + + ROOT::RDataFrame df{dataset.fTreeName, dataset.fFileName}; + auto branchNames = 
df.GetDatasetTopLevelFieldNames(); + expect_vec_eq(branchNames, std::vector{"toplevel"}); +} + #ifdef R__USE_IMT struct Dataset20164RAIII { const char *fTreeName{"tree_20164"}; From b5c100d92a2aa1a5ab0a1aa910c4d03ef9528fba Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 21 Apr 2026 08:58:45 +0200 Subject: [PATCH 2/2] [df] Fallback to looking for regex in all column names in Snapshot Enabling the GetTopLevelFieldNames method in RNTuple exposed one previously faulty interaction between this and Snapshot. For the case of Snapshot with a regex, the regex would only consider top-level column names. If the regex contains a specific name of a subfield, e.g. "columnName.dataMember", then the regex would fail even though that column exists in the dataset. This commit also keeps the default Snapshot behaviour of only considering the top-level column names, but checks for the full list of column names in case a regex failed before throwing the final error if necessary. --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index 0a000eb178d04..1a2d3012d54ce 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1480,7 +1480,17 @@ public: // RemoveDuplicates should preserve ordering of the columns: it might be meaningful. 
RDFInternal::RemoveDuplicates(columnNames);

-      auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot");
+      std::vector<std::string> selectedColumns;
+      try {
+         selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot");
+      } catch (const std::runtime_error &) {
+         // No columns were found in the default list: try again considering all input data source columns,
+         // so that a regex naming a subfield (e.g. "columnName.dataMember") can still select that column.
+         if (auto ds = GetDataSource())
+            selectedColumns = RDFInternal::ConvertRegexToColumns(ds->GetColumnNames(), columnNameRegexp, "Snapshot");
+         else
+            throw; // No data source to fall back to: rethrow the original exception, keeping its dynamic type
+      }

       if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") {
          RDFInternal::RemoveRNTupleSubfields(selectedColumns);