From 75a8fe1caa6968b3282e1014e100acfc08f2bcc4 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 21 Apr 2026 08:58:30 +0200 Subject: [PATCH 1/2] [df] Allow retrieving dataset top-level field names Useful for instance when calling Snapshot and wanting to select only top-level field names, possibly further filtering the list (e.g. through regexes). (cherry picked from commit 1f3af5a2c8c7c137b9f81e364bc02463f5dc0bf8) --- .../dataframe/inc/ROOT/RDF/RInterfaceBase.hxx | 1 + tree/dataframe/inc/ROOT/RNTupleDS.hxx | 2 ++ tree/dataframe/src/RInterfaceBase.cxx | 22 +++++++++++++++++++ tree/dataframe/src/RNTupleDS.cxx | 6 +++++ tree/dataframe/test/datasource_ntuple.cxx | 12 ++++++++++ tree/dataframe/test/datasource_tree.cxx | 9 ++++++++ 6 files changed, 52 insertions(+) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx b/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx index bfab62d1db189..0d34e600bd659 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterfaceBase.hxx @@ -216,6 +216,7 @@ public: RInterfaceBase(RDFDetail::RLoopManager &lm, const RDFInternal::RColumnRegister &colRegister); ColumnNames_t GetColumnNames(); + ColumnNames_t GetDatasetTopLevelFieldNames(); std::string GetColumnType(std::string_view column); diff --git a/tree/dataframe/inc/ROOT/RNTupleDS.hxx b/tree/dataframe/inc/ROOT/RNTupleDS.hxx index 1019b53953ee4..9cc87d5dc6cf5 100644 --- a/tree/dataframe/inc/ROOT/RNTupleDS.hxx +++ b/tree/dataframe/inc/ROOT/RNTupleDS.hxx @@ -126,6 +126,7 @@ class RNTupleDS final : public ROOT::RDF::RDataSource { std::unordered_map fFieldId2QualifiedName; std::vector fColumnNames; std::vector fColumnTypes; + std::vector fTopLevelFieldNames; /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers /// to new page sources when the files in the chain change. 
std::vector> fActiveColumnReaders; @@ -222,6 +223,7 @@ public: void SetNSlots(unsigned int nSlots) final; std::size_t GetNFiles() const final { return fFileNames.empty() ? 1 : fFileNames.size(); } const std::vector &GetColumnNames() const final { return fColumnNames; } + const std::vector &GetTopLevelFieldNames() const final { return fTopLevelFieldNames; } bool HasColumn(std::string_view colName) const final; std::string GetTypeName(std::string_view colName) const final; std::vector> GetEntryRanges() final; diff --git a/tree/dataframe/src/RInterfaceBase.cxx b/tree/dataframe/src/RInterfaceBase.cxx index 8d3ce5ae82353..733a47a141ce0 100644 --- a/tree/dataframe/src/RInterfaceBase.cxx +++ b/tree/dataframe/src/RInterfaceBase.cxx @@ -100,6 +100,28 @@ ROOT::RDF::ColumnNames_t ROOT::RDF::RInterfaceBase::GetColumnNames() return ret; } +///////////////////////////////////////////////////////////////////////////// +/// \brief Retrieve the names of the dataset's top-level fields +/// +/// For data sources that support hierarchical dataset schemas, such as TTree +/// or RNTuple, this function will retrieve the names of top-level fields. For +/// example, if the schema contains a user class with a data member, only +/// the name of the top-level field containing the user class object would be +/// reported, but not the name of the data member sub-field. +/// +/// For all other data sources, returns the list of all available dataset columns. +ROOT::RDF::ColumnNames_t ROOT::RDF::RInterfaceBase::GetDatasetTopLevelFieldNames() +{ + ROOT::RDF::ColumnNames_t ret; + if (auto ds = GetDataSource()) { + ret = ROOT::Internal::RDF::GetTopLevelFieldNames(*ds); + // Sorting to be consistent with GetColumnNames + std::sort(ret.begin(), ret.end()); + } + + return ret; +} + ///////////////////////////////////////////////////////////////////////////// /// \brief Return the type of a given column as a string. /// \return the type of the required column.
diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx index d887bb7f43ce2..7cb8e5de2b500 100644 --- a/tree/dataframe/src/RNTupleDS.cxx +++ b/tree/dataframe/src/RNTupleDS.cxx @@ -393,6 +393,12 @@ ROOT::RDF::RNTupleDS::RNTupleDS(std::unique_ptr pag AddField(fPrincipalDescriptor, "", fPrincipalDescriptor.GetFieldZeroId(), std::vector()); + + auto topLevelFields = fPrincipalDescriptor.GetTopLevelFields(); + const auto nTopLevelFields = std::distance(topLevelFields.begin(), topLevelFields.end()); + fTopLevelFieldNames.reserve(nTopLevelFields); + for (const auto &field : topLevelFields) + fTopLevelFieldNames.push_back(field.GetFieldName()); } namespace { diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx index 9e25a83f1a060..8f186297cc78a 100644 --- a/tree/dataframe/test/datasource_ntuple.cxx +++ b/tree/dataframe/test/datasource_ntuple.cxx @@ -828,3 +828,15 @@ TEST(RNTupleDS, Int8) std::vector expected{0, 1, 2, 3, 4}; EXPECT_EQ(expected, df.Take("x").GetValue()); } + +TEST_F(RNTupleDSTest, GetTopLevelFieldNames) +{ + ROOT::RDataFrame df{fNtplName, fFileName}; + + EXPECT_VEC_EQ( + df.GetDatasetTopLevelFieldNames(), + std::vector{"VecElectron", "electron", "energy", "jets", "nElectron", "nnlo", "pt", "rvec", "tag"}); + EXPECT_VEC_EQ(df.GetColumnNames(), + std::vector{"VecElectron", "VecElectron.pt", "electron", "electron.pt", "energy", "jets", + "nElectron", "nnlo", "pt", "rvec", "tag"}); +} diff --git a/tree/dataframe/test/datasource_tree.cxx b/tree/dataframe/test/datasource_tree.cxx index 3c8235d315f5e..8b4a63e8a99ba 100644 --- a/tree/dataframe/test/datasource_tree.cxx +++ b/tree/dataframe/test/datasource_tree.cxx @@ -57,6 +57,15 @@ TEST(RTTreeDS, BranchWithNestedSameName) expect_vec_eq(branchNames, expectedBranchNames); } +TEST(RTTreeDS, GetDatasetTopLevelFieldNames) +{ + InputTreeRAII dataset{}; + + ROOT::RDataFrame df{dataset.fTreeName, dataset.fFileName}; + auto branchNames = 
df.GetDatasetTopLevelFieldNames(); + expect_vec_eq(branchNames, std::vector{"toplevel"}); +} + #ifdef R__USE_IMT struct Dataset20164RAIII { const char *fTreeName{"tree_20164"}; From 37605005b7b0c7f85a2bffcc84e627c0d6629205 Mon Sep 17 00:00:00 2001 From: Vincenzo Eduardo Padulano Date: Tue, 21 Apr 2026 08:58:45 +0200 Subject: [PATCH 2/2] [df] Fallback to looking for regex in all column names in Snapshot Enabling the GetTopLevelFieldNames method in RNTuple exposed one previously faulty interaction between this and Snapshot. For the case of Snapshot with a regex, the regex would only consider top-level column names. If the regex contains a specific name of a subfield, e.g. "columnName.dataMember", then the regex would fail even though that column exists in the dataset. This commit also keeps the default Snapshot behaviour of only considering the top-level column names, but checks for the full list of column names in case a regex failed before throwing the final error if necessary. (cherry picked from commit 13bb547817ebf2b75bf12c112cf3c888a141f931) --- tree/dataframe/inc/ROOT/RDF/RInterface.hxx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx index b03a088571aa8..c2118f12e57d6 100644 --- a/tree/dataframe/inc/ROOT/RDF/RInterface.hxx +++ b/tree/dataframe/inc/ROOT/RDF/RInterface.hxx @@ -1523,7 +1523,17 @@ public: // RemoveDuplicates should preserve ordering of the columns: it might be meaningful. 
RDFInternal::RemoveDuplicates(columnNames); - auto selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); + std::vector<std::string> selectedColumns; + try { + selectedColumns = RDFInternal::ConvertRegexToColumns(columnNames, columnNameRegexp, "Snapshot"); + } catch (const std::runtime_error &) { + // No columns were found, try again but consider all input data source columns + if (auto ds = GetDataSource()) + selectedColumns = RDFInternal::ConvertRegexToColumns(ds->GetColumnNames(), columnNameRegexp, "Snapshot"); + else + // No data source to fall back on: rethrow the original exception as-is (no copy, no slicing) + throw; + } if (RDFInternal::GetDataSourceLabel(*this) == "RNTupleDS") { RDFInternal::RemoveRNTupleSubFields(selectedColumns);