Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1136,7 +1136,7 @@ jobs:
./cmake-out/backends/vulkan/test/custom_ops/choose_qparams_per_row
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
./cmake-out/backends/vulkan/test/custom_ops/q8ta_q8ta_q8to_add
./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary

# "Classic" Operator tests
PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
Expand Down
10 changes: 5 additions & 5 deletions backends/vulkan/custom_ops_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -564,11 +564,11 @@ def apply_rotary_emb_impl(
apply_rotary_emb_op = getattr(getattr(torch.ops, namespace), name)

########################
## add_q8ta_q8ta_q8to ##
## q8ta_add ##
########################


def add_q8ta_q8ta_q8to_impl(
def q8ta_add_impl(
input_a: torch.Tensor,
input_b: torch.Tensor,
input_a_scale: float,
Expand Down Expand Up @@ -598,12 +598,12 @@ def add_q8ta_q8ta_q8to_impl(
return quantized_result


name = "add_q8ta_q8ta_q8to"
name = "q8ta_add"
lib.define(
f"{name}(Tensor input_a, Tensor input_b, float input_a_scale, int input_a_zero_point, float input_b_scale, int input_b_zero_point, float output_scale, int output_zero_point, float alpha) -> Tensor"
)
lib.impl(name, add_q8ta_q8ta_q8to_impl, "CompositeExplicitAutograd")
add_q8ta_q8ta_q8to_op = getattr(getattr(torch.ops, namespace), name)
lib.impl(name, q8ta_add_impl, "CompositeExplicitAutograd")
q8ta_add_op = getattr(getattr(torch.ops, namespace), name)

#############################
## select_as_symint ##
Expand Down
27 changes: 16 additions & 11 deletions backends/vulkan/op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,19 +501,23 @@ def register_torchao_choose_qparams_affine():


# =============================================================================
# QuantizedBinary.cpp
# Q8taBinary.cpp
# =============================================================================


@update_features(exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default)
def register_add_q8ta_q8ta_q8to():
@update_features(exir_ops.edge.et_vk.q8ta_add.default)
def register_q8ta_add():
return OpFeatures(
inputs_storage=utils.PACKED_INT8_4W4C_BUFFER,
inputs_storage=utils.PACKED_INT8_BUFFER,
supports_resize=False,
supports_prepacking=True,
)


# =============================================================================
# Reduce.cpp
# =============================================================================


def get_dims_reduced(node: torch.fx.Node) -> Union[int, List[int]]:
ndim = utils.ndim_of(node.args[0])
assert ndim is not None
Expand Down Expand Up @@ -623,11 +627,6 @@ def pick_storage_for_reduce(node: torch.fx.Node):
return inputs_storage, outputs_storage


# =============================================================================
# Reduce.cpp
# =============================================================================


@update_features(
[
exir_ops.edge.aten.mean.dim,
Expand Down Expand Up @@ -778,6 +777,9 @@ def register_q8ta_conv_pw_op():
utils.NO_STORAGE, # groups (non tensor)
utils.NO_STORAGE, # original OC count (non tensor)
],
outputs_storage=[
utils.PACKED_INT8_CHANNELS_PACKED_BUFFER,
],
supports_resize=False,
supports_prepacking=True,
)
Expand All @@ -792,7 +794,7 @@ def register_q8ta_conv_pw_op():
def register_q8ta_conv2d_ops():
return OpFeatures(
inputs_storage=[
utils.PACKED_INT8_4W4C_BUFFER, # input
utils.PACKED_INT8_4C1W_BUFFER, # input
utils.NO_STORAGE, # input_scale (non tensor)
utils.NO_STORAGE, # input_zero_point (non tensor)
utils.NO_STORAGE, # weight (prepacked)
Expand All @@ -808,6 +810,9 @@ def register_q8ta_conv2d_ops():
utils.NO_STORAGE, # groups (non tensor)
utils.NO_STORAGE, # original OC count (non tensor)
],
outputs_storage=[
utils.PACKED_INT8_CHANNELS_PACKED_BUFFER,
],
supports_resize=False,
supports_prepacking=True,
)
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/patterns/quantized_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def make_add_q8ta_q8ta_q8to_custom_op(
exir_ops.edge.aten.add.Tensor,
exir_ops.edge.aten.add_.Tensor,
}:
op_target = exir_ops.edge.et_vk.add_q8ta_q8ta_q8to.default
op_target = exir_ops.edge.et_vk.q8ta_add.default
else:
# For future binary operations, add more mappings here
raise NotImplementedError(
Expand Down
82 changes: 69 additions & 13 deletions backends/vulkan/runtime/VulkanBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -641,6 +641,21 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {

const size_t num_inputs = compute_graph->inputs().size();
bool should_propagate_resize = false;
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracer* event_tracer = context.event_tracer();
runtime::EventTracerEntry overall_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_EXECUTE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry copy_inputs_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COPY_INPUTS",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
for (size_t i = 0; i < num_inputs; i++) {
const ValueRef iref = compute_graph->inputs()[i].value;
if (compute_graph->val_is_tensor(iref)) {
Expand Down Expand Up @@ -669,13 +684,61 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->get_val_type(iref));
}
}
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, copy_inputs_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

if (should_propagate_resize || compute_graph->has_data_dependent_shapes()) {
#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry resize_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_RESIZE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
compute_graph->propagate_resize();
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, resize_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED
}

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry execute_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COMPUTE_GRAPH_EXECUTE",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
compute_graph->execute();
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, execute_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
compute_graph->context()->querypool().extract_results();
for (const auto& r :
compute_graph->context()->querypool().get_shader_timestamp_data()) {
std::string event_name = "{" + r.kernel_name +
", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
event_tracer_log_profiling_delegate(
event_tracer,
event_name.c_str(),
/* delegate_debug_id = */ -1,
r.start_time_ns,
r.end_time_ns);
}
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracerEntry copy_outputs_event_tracer_entry =
event_tracer_start_profiling_delegate(
event_tracer,
"ETVK_COPY_OUTPUTS",
/* delegate_debug_id = */ -1);
#endif // ET_EVENT_TRACER_ENABLED
for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
const size_t o = i + num_inputs;
const ValueRef oref = compute_graph->outputs()[i].value;
Expand All @@ -701,21 +764,14 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->get_val_type(oref));
}
}
#ifdef ET_EVENT_TRACER_ENABLED
event_tracer_end_profiling_delegate(
event_tracer, copy_outputs_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

#ifdef ET_EVENT_TRACER_ENABLED
runtime::EventTracer* event_tracer = context.event_tracer();
compute_graph->context()->querypool().extract_results();
for (const auto& r :
compute_graph->context()->querypool().get_shader_timestamp_data()) {
std::string event_name = "{" + r.kernel_name +
", \"dispatch_id\": " + std::to_string(r.dispatch_id) + "}";
event_tracer_log_profiling_delegate(
event_tracer,
event_name.c_str(),
/* delegate_debug_id = */ -1,
r.start_time_ns,
r.end_time_ns);
}
event_tracer_end_profiling_delegate(
event_tracer, overall_event_tracer_entry);
#endif // ET_EVENT_TRACER_ENABLED

return Error::Ok;
Expand Down
6 changes: 5 additions & 1 deletion backends/vulkan/runtime/api/containers/StagingBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,12 @@ StagingBuffer::StagingBuffer(
const vkapi::CopyDirection direction)
: context_p_(context_p),
dtype_(get_staging_dtype(context_p, dtype)),
// For 8-bit types, align numel to the next multiple of 4. Devices that
// lack 8-bit storage buffer support will interpret the data as int32, so
// the buffer size must be a multiple of 4 bytes.
vulkan_buffer_(context_p_->adapter_ptr()->vma().create_staging_buffer(
element_size(dtype_) * numel,
element_size(dtype_) *
(element_size(dtype_) == 1 ? utils::align_up_4(numel) : numel),
direction)),
mapped_data_(nullptr) {}

Expand Down
76 changes: 0 additions & 76 deletions backends/vulkan/runtime/graph/ops/glsl/binary_q8ta_q8ta_q8to.glsl

This file was deleted.

Loading
Loading