Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,6 @@ def register_q8ta_add():
return OpFeatures(
inputs_storage=utils.PACKED_INT8_BUFFER,
supports_resize=False,
supports_prepacking=True,
)


Expand Down
82 changes: 69 additions & 13 deletions backends/vulkan/runtime/VulkanBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {

const size_t num_inputs = compute_graph->inputs().size();
bool should_propagate_resize = false;
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracer* event_tracer = context.event_tracer();
runtime::EventTracerEntry overall_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_EXECUTE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry copy_inputs_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COPY_INPUTS",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
for (size_t i = 0; i < num_inputs; i++) {
const ValueRef iref = compute_graph->inputs()[i].value;
if (compute_graph->val_is_tensor(iref)) {
Expand Down Expand Up @@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->get_val_type(iref));
}
}
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, copy_inputs_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry resize_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_RESIZE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
compute_graph->propagate_resize();
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, resize_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED
}

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry execute_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COMPUTE_GRAPH_EXECUTE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
compute_graph->execute();
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, execute_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
compute_graph->context()->querypool().extract_results();
for (const auto& r :
compute_graph->context()->querypool().get_shader_timestamp_data()) {
std::string event_name = "{" + r.kernel_name +
", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
event_tracer_log_profiling_delegate(
event_tracer,
event_name.c_str(),
/* delegate_debug_id = */ -1,
r.start_time_ns,
r.end_time_ns);
}
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry copy_outputs_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COPY_OUTPUTS",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
const size_t o = i + num_inputs;
const ValueRef oref = compute_graph->outputs()[i].value;
Expand All @@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->get_val_type(oref));
}
}
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, copy_outputs_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracer* event_tracer = context.event_tracer();
compute_graph->context()->querypool().extract_results();
for (const auto& r :
compute_graph->context()->querypool().get_shader_timestamp_data()) {
std::string event_name = "{" + r.kernel_name +
", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
event_tracer_log_profiling_delegate(
event_tracer,
event_name.c_str(),
/* delegate_debug_id = */ -1,
r.start_time_ns,
r.end_time_ns);
}
event_tracer_end_profiling_delegate(
event_tracer, overall_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

return Error::Ok;
Expand Down
6 changes: 5 additions & 1 deletion backends/vulkan/runtime/api/containers/StagingBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
const vkapi::CopyDirection direction)
: context_p_(context_p),
dtype_(get_staging_dtype(context_p, dtype)),
// For 8-bit types, align numel to the next multiple of 4. Devices that
// lack 8-bit storage buffer support will interpret the data as int32, so
// the buffer size must be a multiple of 4 bytes.
vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
element_size(dtype_) * numel,
element_size(dtype_) *
(element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
direction)),
mapped_data_(nullptr) {}

Expand Down
61 changes: 41 additions & 20 deletions backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -334,45 +334,66 @@ TensorIndex linear_idx_to_tensor_idx(
/*
 * Convert a linear texel index to a TensorIndex4D.
 *
 * This is the inverse of tensor4d_idx_to_texel_idx. It handles both
 * single-packed layouts (outer_block_size == 1) and block-packed layouts
 * (e.g., 4W4C where outer_block_size > 1).
 *
 * The conversion mirrors tensor4d_idx_to_texel_idx and proceeds in four steps:
 *   1. Split texel_idx into a block index and an intra-block texel offset.
 *   2. Decompose the block index into block-space tensor coordinates using
 *      the dim_order stored in hashed_layout and the strides stored in meta.
 *   3. Scale the block-space coordinates to element space by multiplying the
 *      inner and outer packed dimensions by their block sizes.
 *   4. Add the intra-block offset to the outer packed dimension.
 *
 * For single-packed layouts (outer_block_size == 1, inner_dim == outer_dim),
 * texels_per_block == 1, so block_idx == texel_idx and the intra-block offset
 * is 0. The only effective scaling is then tidx[inner_dim] *= inner_block_size
 * (i.e., *= 4), which matches the behavior of the earlier single-packed-only
 * implementation.
 *
 * Parameters:
 *   meta: BufferMetadata with block-space strides
 *   texel_idx: Linear index into packed texels (0 to num_texels-1)
 *   hashed_layout: Packed layout info containing dim_order, the packed
 *                  dimensions and their block sizes
 *
 * Returns: TensorIndex4D with logical tensor coordinates (packed dims are
 *          the base of their respective blocks)
 */
TensorIndex4D texel_idx_to_tensor4d_idx(
    const BufferMetadata meta,
    uint texel_idx,
    const int hashed_layout) {
  TensorIndex4D tidx;

  const int inner_dim = get_packed_dim(hashed_layout);
  const int outer_dim = get_outer_packed_dim(hashed_layout);
  const int inner_block_size = get_packed_dim_block_size(hashed_layout);
  const int outer_block_size = get_outer_packed_dim_block_size(hashed_layout);

  // Number of texels per block: each block has inner_block_size *
  // outer_block_size elements, and each texel holds 4 elements.
  const int texels_per_block = (inner_block_size * outer_block_size) / 4;

  // Step 1: split texel_idx into block_idx and the intra-block texel offset.
  const uint block_idx = texel_idx / texels_per_block;
  const int intra_block_texel = int(texel_idx % texels_per_block);

  // Step 2: peel off one coordinate per dimension, walking dim_order from the
  // slowest-varying entry (d=3) to the fastest (d=0). Only `remaining` is
  // consumed here; texel_idx itself is left untouched.
  uint remaining = block_idx;
  [[unroll]] for (int d = 3; d >= 0; d--) {
    const int dim_idx = extract_4b(hashed_layout, d);
    const uint dim_stride = meta.strides[0][dim_idx];

    tidx.data[dim_idx] = int(remaining / dim_stride);
    remaining = remaining % dim_stride;
  }

  // Step 3: convert block-space coordinates to element space. Each packed
  // dimension is scaled exactly once, by its own block size.
  tidx.data[inner_dim] *= inner_block_size;
  tidx.data[outer_dim] *= outer_block_size;

  // Step 4: add the intra-block outer-dimension offset.
  tidx.data[outer_dim] += intra_block_texel;

  return tidx;
}
Expand Down
78 changes: 78 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#version 450 core

#define PRECISION ${PRECISION}

${define_active_storage_type("buffer")}

layout(std430) buffer;

#include "indexing.glslh"

// Output buffer: packed int8x4 values (each int32 contains 4 packed int8)
${layout_declare_tensor(B, "w", "t_outp", "int", "buffer")}
// Input staging buffer: raw int8 data interpreted as int32 for device compat
${layout_declare_tensor(B, "r", "nchw_in", "int", "buffer")}

// Metadata for output tensor
${layout_declare_ubo(B, "BufferMetadata", "outp")}

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

${layout_declare_spec_const(C, "int", "outp_layout", "CONTIG_LAYOUT_INT")}

/*
 * Packs raw int8 data from the staging buffer into the int8x4 output buffer.
 *
 * Each invocation produces one output texel (one int32 holding 4 int8 values
 * taken along the inner packed dimension). The source bytes are addressed with
 * an NCHW-contiguous index and extracted from the int32 words of nchw_in.
 * Lanes that fall past the end of the inner dimension contribute zero bits.
 *
 * NOTE(review): the guard below uses numel(outp) / 4, while the host-side
 * dispatch visible in Q8taStaging.cpp is sized from padded_numel / 4. Confirm
 * that numel(outp) already accounts for layout padding; otherwise trailing
 * texels of a padded tensor would be skipped here.
 */
void main() {
  const uint out_texel = gl_GlobalInvocationID.x;
  if (out_texel >= numel(outp) / 4) {
    return;
  }

  const int inner_dim = get_packed_dim(outp_layout);
  const int outer_dim = get_outer_packed_dim(outp_layout);

  // Element-space coordinates of the first element covered by this texel.
  const TensorIndex4D base =
      texel_idx_to_tensor4d_idx(outp, out_texel, outp_layout);

  // Texels whose outer coordinate lies outside the tensor are left unwritten.
  if (base.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
    return;
  }

  // Strides of an NCHW-contiguous tensor; sizes are stored in WHCN order.
  const uint stride_h = outp.sizes[0][0];
  const uint stride_c = outp.sizes[0][1] * stride_h;
  const uint stride_n = outp.sizes[0][2] * stride_c;

  const int inner_extent = int(outp.sizes[0][inner_dim]);

  int packed = 0;
  [[unroll]] for (int lane = 0; lane < 4; ++lane) {
    ivec4 elem = base.data;
    elem[inner_dim] += lane;

    if (elem[inner_dim] < inner_extent) {
      // w + h*W + c*H*W + n*C*H*W
      const uint nchw_idx = uint(elem[0]) + uint(elem[1]) * stride_h +
          uint(elem[2]) * stride_c + uint(elem[3]) * stride_n;

      // The staging buffer is bound as int32 words, so select the word that
      // holds the byte and shift the wanted byte down to bits 0-7.
      const int word = nchw_in[nchw_idx / 4u];
      const uint bit_offset = (nchw_idx % 4u) * 8u;
      const int byte_val = (word >> bit_offset) & 0xFF;

      packed |= (byte_val << (lane * 8));
    }
  }

  t_outp[out_texel] = packed;
}
11 changes: 11 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/nchw_to_int8x4_buffer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Shader codegen entry for nchw_to_int8x4_buffer.glsl: a single variant, named
# after the template itself.
nchw_to_int8x4_buffer:
parameter_names_with_default_values:
# NOTE(review): the shader template declares both of its buffers as "int"
# directly and does not appear to reference DTYPE; presumably this default is
# kept for codegen convention — confirm.
DTYPE: int
shader_variants:
- NAME: nchw_to_int8x4_buffer
19 changes: 1 addition & 18 deletions backends/vulkan/runtime/graph/ops/impl/Q8taConv2d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,24 +401,7 @@ void q8ta_conv2d_general(
}

/*
 * Entry point for the q8ta_conv2d operator.
 *
 * Every call is forwarded to q8ta_conv2d_general exactly once. The earlier
 * heuristic that routed non-grouped convolutions with IC % 4 == 0 and kernels
 * no larger than 3x3 to q8ta_conv2d_im2col is no longer applied here, so the
 * argument list is passed through without being inspected.
 *
 * Parameters:
 *   graph: ComputeGraph that the convolution is added to
 *   args: operator arguments, forwarded unchanged
 */
void q8ta_conv2d(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  q8ta_conv2d_general(graph, args);
}

REGISTER_OPERATORS {
Expand Down
49 changes: 49 additions & 0 deletions backends/vulkan/runtime/graph/ops/impl/Q8taStaging.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Q8taStaging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Common.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

/*
 * Appends a PrepackNode that runs the nchw_to_int8x4_buffer shader to fill
 * `tensor` from `tensor_data`.
 *
 * Parameters:
 *   graph: ComputeGraph whose prepack node list receives the new node
 *   tensor_data: source value handed to the prepack node as its input
 *   tensor: destination tensor; must have dtype vkapi::kInt8x4
 */
void add_staging_to_int8x4_buffer_node(
    ComputeGraph& graph,
    const ValueRef tensor_data,
    const ValueRef tensor) {
  VK_CHECK_COND(graph.dtype_of(tensor) == vkapi::kInt8x4);

  // The shader's only parameter buffer is the destination's BufferMetadata.
  vkapi::ParamsBindList meta_ubos;
  meta_ubos.append(graph.buffer_meta_ubo(tensor));

  // Dispatch one thread per texel, where a texel is one int32 holding 4 packed
  // int8 values. The count comes from padded_numel so that layouts which pad a
  // dimension (e.g., kPackedInt8_4C with C=3 padded to C=4) are fully covered.
  const uint32_t texel_count =
      utils::safe_downcast<uint32_t>(graph.padded_numel_of(tensor) / 4);
  const utils::uvec3 global_wg_size = {texel_count, 1, 1};
  const utils::uvec3 local_wg_size =
      graph.create_local_wg_size(global_wg_size);

  const std::string shader_name = "nchw_to_int8x4_buffer";

  graph.prepack_nodes().emplace_back(new PrepackNode(
      graph,
      VK_KERNEL_FROM_STR(shader_name),
      global_wg_size,
      local_wg_size,
      // Input and Output
      tensor_data,
      tensor,
      // Parameter Buffers
      meta_ubos,
      // Specialization Constants
      {graph.hashed_layout_of(tensor)}));
}

} // namespace vkcompute
Loading
Loading