Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions vortex-array/src/scalar_fn/fns/byte_length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use crate::arrays::varbinview::VarBinViewArrayExt;
use crate::dtype::DType;
use crate::dtype::Nullability;
use crate::dtype::PType;
use crate::expr::Expression;
use crate::kernel::ExecuteParentKernel;
use crate::scalar::Scalar;
use crate::scalar_fn::Arity;
Expand Down Expand Up @@ -122,6 +123,14 @@ impl ScalarFnVTable for ByteLength {
}
}

/// Builds the validity expression for this function from its single input.
///
/// The options argument is ignored. The result is always `Some`, wrapping the
/// validity expression of child `0` of `expression`; an error raised while
/// computing that child validity is propagated to the caller.
fn validity(
    &self,
    _: &Self::Options,
    expression: &Expression,
) -> VortexResult<Option<Expression>> {
    let input = expression.child(0);
    let input_validity = input.validity()?;
    Ok(Some(input_validity))
}

/// Reports this function as not null-sensitive, whatever the options are.
fn is_null_sensitive(&self, _: &Self::Options) -> bool {
    false
}
Expand Down
15 changes: 15 additions & 0 deletions vortex-array/src/scalar_fn/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,18 @@ mod sealed {
/// This can be the **only** implementor for [`super::typed::DynScalarFn`].
impl<V: ScalarFnVTable> Sealed for TypedScalarFnInstance<V> {}
}

/// Returns `true` if the scalar function identified by `id` has a negative cost.
///
/// A scalar function has a negative cost when applying it to an array and then
/// canonicalizing the result is cheaper than canonicalizing the array first and
/// applying the function afterwards.
///
/// `byte_length()` and `get_item()` are examples of negative cost expressions,
/// since they don't depend on input size. `like()` is an example of a
/// non-negative cost expression.
pub fn is_negative_cost(id: ScalarFnId) -> bool {
    // Ids treated as negative cost; compared in this order, stopping at the first match.
    const NEGATIVE_COST_IDS: [&str; 3] =
        ["vortex.byte_length", "vortex.get_item", "vortex.literal"];

    NEGATIVE_COST_IDS
        .iter()
        .any(|&name| id == Id::new_static(name))
}
3 changes: 2 additions & 1 deletion vortex-duckdb/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ const DEFAULT_DUCKDB_VERSION: &str = "1.5.3";

// File names of the DuckDB library in its dynamic (`.dylib`, `.so`) and static (`.a`) forms.
// NOTE(review): presumably the artifacts this build script looks for or produces — confirm
// against the usages further down in this file.
const BUILD_ARTIFACTS: [&str; 3] = ["libduckdb.dylib", "libduckdb.so", "libduckdb_static.a"];

const SOURCE_FILES: [&str; 17] = [
const SOURCE_FILES: [&str; 18] = [
"cpp/client_context.cpp",
"cpp/config.cpp",
"cpp/copy_function.cpp",
Expand All @@ -34,6 +34,7 @@ const SOURCE_FILES: [&str; 17] = [
"cpp/expr.cpp",
"cpp/file_system.cpp",
"cpp/logical_type.cpp",
"cpp/optimizer.cpp",
"cpp/replacement_scan.cpp",
"cpp/reusable_dict.cpp",
"cpp/scalar_function.cpp",
Expand Down
1 change: 1 addition & 0 deletions vortex-duckdb/cpp/include/duckdb_vx.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#pragma once

#include "duckdb_vx/client_context.h"
#include "duckdb_vx/optimizer.h"
#include "duckdb_vx/config.h"
#include "duckdb_vx/copy_function.h"
#include "duckdb_vx/data.h"
Expand Down
141 changes: 141 additions & 0 deletions vortex-duckdb/cpp/include/duckdb_vx/optimizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
#pragma once
#include "duckdb.h"

#ifdef __cplusplus
extern "C" {
#endif

/**
 * C entry point taking a DuckDB database handle and returning a duckdb_state.
 * NOTE(review): presumably registers VortexOptimizerExtension on the database
 * and reports success/failure through the returned state — confirm against
 * the definition in optimizer.cpp.
 */
duckdb_state duckdb_vx_optimizer_extension_register(duckdb_database ffi_db);

#ifdef __cplusplus
}
#endif

#ifdef __cplusplus
#include "duckdb/optimizer/optimizer_extension.hpp"
#include "duckdb/planner/expression/bound_columnref_expression.hpp"
#include "duckdb/planner/expression/bound_function_expression.hpp"
#include "duckdb/planner/operator/logical_get.hpp"
#include <optional>

// Only one consumer of this header file, so "using" is fine
using namespace duckdb;

// Shorthands for owning pointers to plan expressions and logical operators.
using ExpressionPtr = unique_ptr<Expression>;
using LogicalOperatorPtr = unique_ptr<LogicalOperator>;

/**
 * Index of a column within the requested scan. Example:
 *
 * CREATE TABLE t (a1 INTEGER, a2 INTEGER, a3 INTEGER);
 * SELECT a2, a3 FROM t;
 *
 * a2's TableColumnScanIndex is 0, a3's TableColumnScanIndex is 1,
 * i.e. the index is the column's position in the SELECT clause.
 */
using TableColumnScanIndex = idx_t;

/**
 * Index of a column within the table's storage. Example:
 *
 * CREATE TABLE t (a1 INTEGER, a2 INTEGER, a3 INTEGER);
 * SELECT a2, a3 FROM t;
 *
 * a2's TableColumnStorageIndex is 1, a3's TableColumnStorageIndex is 2,
 * i.e. the index is the column's position in table storage.
 *
 * For i: TableColumnScanIndex, column_ids[i].GetPrimaryIndex() is the
 * corresponding TableColumnStorageIndex.
 */
using TableColumnStorageIndex = idx_t;

// Table index of a plan operator; used as the key type of Analyses and Projections.
using TableIndex = idx_t;

/**
 * Per-GET bookkeeping: the GET operator itself plus, for each scanned column,
 * the single function expression applied to it (if any).
 */
struct GetAnalysis {
// The GET operator this analysis describes (held by reference, not owned).
LogicalGet &get;
/**
 * For fn(col), mapping of "col scan index" -> "fn expression".
 * "fn expression" is nullptr iff the column is used with a different
 * function or without function application in the query plan.
 */
unordered_map<TableColumnScanIndex, const BoundFunctionExpression *> col_to_fn;
};

// Analyses of GET operators, keyed by table index.
using Analyses = unordered_map<TableIndex, GetAnalysis>;

/*
 * Query plans may have PROJECTION operators which wrap GET operators. One
 * example is the VIEWs used for our benchmarks:
 *
 * CREATE VIEW view AS (SELECT * FROM '*.vortex');
 * SELECT len(col) FROM view;
 *
 * In the second query, "col"'s table_index would be 1 (VIEW) and not 0 (GET
 * for vortex). But we want to push down len(col) to vortex. So we keep an
 * aliases mapping of
 *
 * "projection table index" to "projection operator".
 *
 * to resolve this.
 * For simplicity, the current implementation is limited to one level, i.e.
 * VIEW -> GET is pushed down but VIEW->VIEW->GET or VIEW->CTE->GET is not.
 */
using Projections = unordered_map<TableIndex, const LogicalProjection &>;

/**
 * Collect fn(col) expressions, i.e. expressions where a single function (not
 * a function chain) wraps a single bound column. Results are recorded in
 * "analyses".
 *
 * NOTE(review): the previous wording said conflicting usage is recorded in
 * "analyses.conflicts", but no such member exists. Per the comment on
 * GetAnalysis::col_to_fn, a column used with a different function or without
 * function application is marked by a nullptr entry there — confirm against
 * the implementation.
 */
struct ScalarFnCollect final : LogicalOperatorVisitor {
// Output: per-GET analyses updated while visiting the plan.
Analyses &analyses;
// Read-only mapping used to resolve PROJECTION table indices.
const Projections &projections;

ScalarFnCollect(Analyses &analyses, const Projections &projections);
void VisitOperator(LogicalOperator &op) override;
ExpressionPtr VisitReplace(BoundColumnRefExpression &expr, ExpressionPtr *ptr) override;
ExpressionPtr VisitReplace(BoundFunctionExpression &expr, ExpressionPtr *ptr) override;
};

/*
 * For "col" in columns collected by ScalarFnCollect, replace fn(col) with "col"
 * if "col" doesn't have conflicting usage. Update return types for bound
 * columns and logical projections referencing this column.
 */
struct ScalarFnReplace final : LogicalOperatorVisitor {
    // Per-GET analyses produced by the collection pass.
    Analyses &analyses;
    // Read-only mapping used to resolve PROJECTION table indices.
    const Projections &projections;

    // Parameter named "projections" to match the member it initialises and
    // the ScalarFnCollect constructor.
    ScalarFnReplace(Analyses &analyses, const Projections &projections);
    ExpressionPtr VisitReplace(BoundColumnRefExpression &expr, ExpressionPtr *ptr) override;
    ExpressionPtr VisitReplace(BoundFunctionExpression &expr, ExpressionPtr *ptr) override;
};

/*
 * Takes a plan operator, the Analyses and Projections maps by mutable
 * reference, and an optional parent operator (nullptr by default).
 * NOTE(review): presumably walks the plan from "op", filling "analyses" with
 * GET operators and "aliases" with PROJECTION operators wrapping a GET —
 * confirm against the definition.
 */
void FindGetsAndAliases(LogicalOperator &op,
Analyses &analyses,
Projections &aliases,
LogicalOperator *parent = nullptr);

// Takes ownership of "plan" and returns a plan.
// NOTE(review): presumably the returned plan has eligible fn(col) expressions
// pushed down into the scan — confirm against the definition.
LogicalOperatorPtr TryPushdownScalarFunctions(ClientContext &context, LogicalOperatorPtr plan);
// Callback passed to the OptimizerExtension base by VortexOptimizerExtension;
// receives the plan by mutable reference.
void VortexOptimizeFunction(OptimizerExtensionInput &input, LogicalOperatorPtr &plan);

/**
 * Optimizer extension wired to VortexOptimizeFunction. The remaining two
 * base-constructor arguments are passed as nullptr and an empty initializer.
 */
struct VortexOptimizerExtension final : OptimizerExtension {
    // Defined in-class, hence implicitly inline.
    VortexOptimizerExtension() : OptimizerExtension(VortexOptimizeFunction, nullptr, {}) {
    }
};

// A column binding resolved to a GET: the GET's analysis plus the column's
// scan index within that GET. Returned by Resolve().
struct GetBinding {
// Analysis of the GET the column belongs to (held by reference, not owned).
GetAnalysis &analysis;
// Scan index of the column within that GET.
TableColumnScanIndex column_index;
};

/*
 * Given a column binding, resolve it to a GET and the column's scan index
 * within that GET.
 * Returns std::nullopt for virtual columns and for columns which are neither
 * part of a GET nor part of a PROJECTION wrapping a GET.
 */
std::optional<GetBinding> Resolve(ColumnBinding binding, Analyses &analyses, const Projections &projections);

#endif
17 changes: 17 additions & 0 deletions vortex-duckdb/cpp/include/duckdb_vx/table_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,23 @@

#ifdef __cplusplus
static_assert(sizeof(idx_t) == 8);

#include "duckdb/main/capi/capi_internal.hpp"

// Declaration of the table function's bind routine; returns owned FunctionData.
// NOTE(review): "return_types" and "names" are non-const references, so they are
// presumably output parameters describing the result columns — confirm against
// the definition.
duckdb::unique_ptr<duckdb::FunctionData> bind(duckdb::ClientContext &context,
duckdb::TableFunctionBindInput &input,
duckdb::vector<duckdb::LogicalType> &return_types,
duckdb::vector<duckdb::string> &names);

// Arguments for projection_expression_pushdown(): the GET being scanned, the
// expression considered for pushdown, and a projection index.
// NOTE(review): "projection_idx" presumably identifies the projected column the
// expression applies to — confirm against the caller.
struct TableFunctionProjectionExpressionInput {
const duckdb::LogicalGet &get;
const duckdb::Expression &expression;
idx_t projection_idx;
};

// Returns true if the expression described by "input" can be pushed down,
// false otherwise.
bool projection_expression_pushdown(duckdb::ClientContext &context,
const TableFunctionProjectionExpressionInput &input);
#endif

#ifdef __cplusplus
Expand Down
Loading
Loading