Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@ jobs:
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary

# "Classic" Operator tests
PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
Expand Down
10 changes: 5 additions & 5 deletions backends/vulkan/custom_ops_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)

########################
## add_q8ta_q8ta_q8to ##
## q8ta_add ##
########################


def add_q8ta_q8ta_q8to_impl(
def q8ta_add_impl(
input_a: torch.Tensor,
input_b: torch.Tensor,
input_a_scale: float,
Expand Down Expand Up @@ -598,12 +598,12 @@ def add_q8ta_q8ta_q8to_impl(
return quantized_result


name = "add_q8ta_q8ta_q8to"
name = "q8ta_add"
lib.define(
f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
)
lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
q8ta_add_op = getattr(getattr(torch.ops, namespace), name)

#############################
## select_as_symint ##
Expand Down
19 changes: 9 additions & 10 deletions backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,19 +501,23 @@ def register_torchao_choose_qparams_affine():


# =============================================================================
# QuantizedBinary.cpp
# Q8taBinary.cpp
# =============================================================================


@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
def register_add_q8ta_q8ta_q8to():
@update_features(exir_ops.edge.et_vk.q8ta_add.default)
def register_q8ta_add():
return OpFeatures(
inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
inputs_storage=utils.PACKED_INT8_BUFFER,
supports_resize=False,
supports_prepacking=True,
)


# =============================================================================
# Reduce.cpp
# =============================================================================


def get_dims_reduced(node: torch.fx.Node) -> Union[int, List[int]]:
ndim = utils.ndim_of(node.args[0])
assert ndim is not None
Expand Down Expand Up @@ -623,11 +627,6 @@ def pick_storage_for_reduce(node: torch.fx.Node):
return inputs_storage, outputs_storage


# =============================================================================
# Reduce.cpp
# =============================================================================


@update_features(
[
exir_ops.edge.aten.mean.dim,
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/patterns/quantized_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
exir_ops.edge.aten.add.Tensor,
exir_ops.edge.aten.add_.Tensor,
}:
op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
op_target = exir_ops.edge.et_vk.q8ta_add.default
else:
# For future binary operations, add more mappings here
raise NotImplementedError(
Expand Down
82 changes: 69 additions & 13 deletions backends/vulkan/runtime/VulkanBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {

const size_t num_inputs = compute_graph->inputs().size();
bool should_propagate_resize = false;
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracer* event_tracer = context.event_tracer();
runtime::EventTracerEntry overall_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_EXECUTE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry copy_inputs_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COPY_INPUTS",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
for (size_t i = 0; i < num_inputs; i++) {
const ValueRef iref = compute_graph->inputs()[i].value;
if (compute_graph->val_is_tensor(iref)) {
Expand Down Expand Up @@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->get_val_type(iref));
}
}
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, copy_inputs_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry resize_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_RESIZE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
compute_graph->propagate_resize();
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, resize_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED
}

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry execute_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COMPUTE_GRAPH_EXECUTE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
compute_graph->execute();
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, execute_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
compute_graph->context()->querypool().extract_results();
for (const auto& r :
compute_graph->context()->querypool().get_shader_timestamp_data()) {
std::string event_name = "{" + r.kernel_name +
", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
event_tracer_log_profiling_delegate(
event_tracer,
event_name.c_str(),
/* delegate_debug_id = */ -1,
r.start_time_ns,
r.end_time_ns);
}
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry copy_outputs_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COPY_OUTPUTS",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
const size_t o = i + num_inputs;
const ValueRef oref = compute_graph->outputs()[i].value;
Expand All @@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->get_val_type(oref));
}
}
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, copy_outputs_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracer* event_tracer = context.event_tracer();
compute_graph->context()->querypool().extract_results();
for (const auto& r :
compute_graph->context()->querypool().get_shader_timestamp_data()) {
std::string event_name = "{" + r.kernel_name +
", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
event_tracer_log_profiling_delegate(
event_tracer,
event_name.c_str(),
/* delegate_debug_id = */ -1,
r.start_time_ns,
r.end_time_ns);
}
event_tracer_end_profiling_delegate(
event_tracer, overall_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

return Error::Ok;
Expand Down
6 changes: 5 additions & 1 deletion backends/vulkan/runtime/api/containers/StagingBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
const vkapi::CopyDirection direction)
: context_p_(context_p),
dtype_(get_staging_dtype(context_p, dtype)),
// For 8-bit types, align numel to the next multiple of 4. Devices that
// lack 8-bit storage buffer support will interpret the data as int32, so
// the buffer size must be a multiple of 4 bytes.
vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
element_size(dtype_) * numel,
element_size(dtype_) *
(element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
direction)),
mapped_data_(nullptr) {}

Expand Down
76 changes: 0 additions & 76 deletions backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl

This file was deleted.

61 changes: 41 additions & 20 deletions backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -334,45 +334,66 @@ TensorIndex linear_idx_to_tensor_idx(
/*
* Convert a linear texel index to a TensorIndex4D.
*
* This function is used for texel-based dispatch where each thread handles
* one packed texel (4 elements along the packed dimension). The texel index
* is decomposed using the dim_order and strides from the tensor's layout.
* This is the inverse of tensor4d_idx_to_texel_idx. It handles both
* single-packed layouts (outer_block_size == 1) and block-packed layouts
* (e.g., 4W4C where outer_block_size > 1).
*
* The strides in BufferMetadata should already be in texel space (with packed
* dimension size divided by 4).
* The approach mirrors tensor4d_idx_to_texel_idx by decomposing the problem
* into two levels:
* 1. Decompose texel_idx into block_idx and intra-block texel offset
* 2. Decompose block_idx into block-space tensor coordinates using strides
* 3. Convert block-space coordinates to element-space by multiplying by
* block sizes
* 4. Add the intra-block outer-dimension offset
*
* For single-packed layouts (outer_block_size == 1, inner_dim == outer_dim),
* texels_per_block == 1, so block_idx == texel_idx and intra_block_texel == 0.
* The only effective multiplication is tidx[inner_dim] *= inner_block_size
* (i.e., *= 4), matching the previous single-packed behavior.
*
* Parameters:
* meta: BufferMetadata with tensor sizes and texel-space strides
* meta: BufferMetadata with block-space strides
* texel_idx: Linear index into packed texels (0 to num_texels-1)
* hashed_layout: Packed layout info containing dim_order and packed_dim
*
* Returns: TensorIndex4D with logical tensor coordinates (packed dim is base of 4-element block)
* Returns: TensorIndex4D with logical tensor coordinates (packed dims are
* base of their respective blocks)
*/
TensorIndex4D texel_idx_to_tensor4d_idx(
const BufferMetadata meta,
uint texel_idx,
const int hashed_layout) {
TensorIndex4D tidx;

const int packed_dim = get_packed_dim(hashed_layout);
const int inner_dim = get_packed_dim(hashed_layout);
const int outer_dim = get_outer_packed_dim(hashed_layout);
const int inner_block_size = get_packed_dim_block_size(hashed_layout);
const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout);

// Decompose texel_idx using dim_order from hashed_layout and strides from meta
// Iterate from slowest-varying dimension (d=3) to fastest (d=0)
// This follows the pattern of linear_idx_to_tensor_idx in indexing.glslh
// Number of texels per block: each block has inner_block_size *
// outer_block_size elements, and each texel holds 4 elements
const int texels_per_block = (inner_block_size * outer_block_size) / 4;

// Decompose texel_idx into block_idx and intra-block texel offset
const uint block_idx = texel_idx / texels_per_block;
const int intra_block_texel = int(texel_idx % texels_per_block);

// Decompose block_idx into block-space tensor coordinates using dim_order
// and strides. Iterate from slowest-varying (d=3) to fastest (d=0).
uint remaining = block_idx;
[[unroll]] for (int d = 3; d >= 0; d--) {
// Get dim index from hashed_layout's dim_order (bits 0-15)
int dim_idx = extract_4b(hashed_layout, d);

// Get stride for this dimension from BufferMetadata
uint dim_stride = meta.strides[0][dim_idx];

// Compute coordinate for this dimension
tidx.data[dim_idx] = int(texel_idx / dim_stride);
texel_idx = texel_idx % dim_stride;
tidx.data[dim_idx] = int(remaining / dim_stride);
remaining = remaining % dim_stride;
}

// Convert packed dimension from texel index to element index
tidx.data[packed_dim] *= 4;
// Convert block-space coordinates to element-space
tidx.data[inner_dim] *= inner_block_size;
tidx.data[outer_dim] *= outer_block_size;

// Add intra-block outer-dimension offset
tidx.data[outer_dim] += intra_block_texel;

return tidx;
}
Expand Down
Loading
Loading