Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void main() {
// Compute the value for each element in the texel along the packed dim.
VEC4_T outtex = VEC4_T(0);
int limit = min(
4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
for (int comp = 0; comp < limit; comp++) {
int elem_idx = out_tidx.data[0]; // W index is the linear element index
outtex[comp] = VEC4_T(start + elem_idx * step).x;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ void main() {
VEC4_T out_texel = VEC4_T(0);

int limit = min(
4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);
for (int comp = 0; comp < 4; comp++) {
if (comp >= limit) {
break;
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/full_texture.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ void main() {

TensorIndex4D tidx =
texture_pos_to_tensor4d_idx_simple(outp, pos, out_layout);
const int packed_dim_size = outp.sizes[packed_dim];
const int packed_dim_size = safe_idx(outp.sizes, packed_dim);
int packed_idx = tidx.data[packed_dim];

if (packed_idx + 3 >= packed_dim_size) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ void main() {
VEC4_T out_texel = VEC4_T(0);

int limit = min(
4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
for (int comp = 0; comp < 4; comp++) {
TensorIndex4D input_tidx = out_tidx;
int gather_idx = idx_texel[comp];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void main() {
VEC4_T out_texel = VEC4_T(0);

int limit = min(
4, outp.sizes[out_packed_dim] - out_tidx.data[out_packed_dim]);
4, safe_idx(outp.sizes, out_packed_dim) - out_tidx.data[out_packed_dim]);
for (int comp = 0; comp < limit; comp++) {
int idx = idx_texel[comp];

Expand Down
8 changes: 8 additions & 0 deletions backends/vulkan/runtime/graph/ops/glsl/indexing.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ uint safe_idx(const uvec4 v, const int idx) {
return v.w;
}

// Reads one component of an ivec3 chosen by an integer index, spelled as
// explicit comparisons instead of v[idx]. Same rationale as safe_idx for
// ivec4.
//   idx == 0 -> v.x
//   idx == 1 -> v.y
//   any other value -> v.z
int safe_idx(const ivec3 v, const int idx) {
  if (idx == 0) {
    return v.x;
  } else if (idx == 1) {
    return v.y;
  }
  // Every remaining index value falls through to the last component.
  return v.z;
}

// Safe ivec4 component write via if/else chain. Companion to safe_idx for
// cases where we need to set a component by a spec-const-derived index.
void safe_set(inout ivec4 v, const int idx, const int val) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ void main() {
texel_idx_to_tensor4d_idx(outp, texel_idx, outp_layout);

// Bounds check on outer dimension
if (tidx.data[outer_dim] >= int(outp.sizes[0][outer_dim])) {
if (tidx.data[outer_dim] >= int(safe_idx(outp.sizes[0], outer_dim))) {
return;
}

Expand All @@ -55,7 +55,7 @@ void main() {
int packed = 0;
[[unroll]] for (int i = 0; i < 4; ++i) {
const int elem_inner = tidx.data[inner_dim] + i;
if (elem_inner < int(outp.sizes[0][inner_dim])) {
if (elem_inner < int(safe_idx(outp.sizes[0], inner_dim))) {
// Build element coordinates
ivec4 elem = tidx.data;
elem[inner_dim] = elem_inner;
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/pad_texture.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ void main() {

// Tail texels may have fewer than 4 valid elements; leave extras as 0.
const int limit =
min(4, outp.sizes[packed_dim] - out_tidx.data[packed_dim]);
min(4, safe_idx(outp.sizes, packed_dim) - out_tidx.data[packed_dim]);

VEC4_T out_texel = VEC4_T(0);

Expand Down
100 changes: 50 additions & 50 deletions backends/vulkan/runtime/graph/ops/glsl/permute_texture.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -9,97 +9,97 @@
#version 450 core

${define_required_extensions("texture3d", DTYPE)}
${define_explicit_type_extensions(DTYPE)}

#define PRECISION ${PRECISION}

#define VEC4_T ${texel_type(DTYPE)}
#define T ${buffer_scalar_type(DTYPE)}
#define VEC4_T ${texel_load_type(DTYPE, "texture3d")}
#define T ${texel_load_component_type(DTYPE, "texture3d")}

${define_active_storage_type("texture3d")}

#extension GL_EXT_control_flow_attributes : require

layout(std430) buffer;

#include "indexing_utils.h"
#include "common.glslh"
#include "indexing.glslh"

${layout_declare_tensor(B, "w", "t_out", DTYPE, "texture3d")}
${layout_declare_tensor(B, "r", "t_in", DTYPE, "texture3d")}

${layout_declare_ubo(B, "TextureMetadata", "outp")}
${layout_declare_ubo(B, "TextureMetadata", "inp")}

layout(push_constant) uniform restrict Block {
ivec4 out_sizes;
ivec4 in_sizes;
ivec4 permute_dims; // Permutation mapping: permute_dims[i] = j means output dim i comes from input dim j
ivec4 permute_dims;
};

${layout_declare_spec_const(C, "int", "out_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 out_axis_map = unhash_axis_map(out_layout);
const lowp int out_packed_dim = unhash_packed_dim(out_layout);

${layout_declare_spec_const(C, "int", "in_layout", "DEFAULT_LAYOUT")}
const lowp ivec4 in_axis_map = unhash_axis_map(in_layout);
const lowp int in_packed_dim = unhash_packed_dim(in_layout);
${layout_declare_spec_const(C, "int", "out_layout", "CONTIG_LAYOUT_INT")}
${layout_declare_spec_const(C, "int", "in_layout", "CONTIG_LAYOUT_INT")}
const int out_packed_dim = get_packed_dim(out_layout);
const int in_packed_dim = get_packed_dim(in_layout);

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Maps an output tensor index to the input tensor index it is sourced from.
// permute_dims[i] = j means output dim i comes from input dim j, so component
// i of out_tidx is scattered into component permute_dims[i] of the result:
//   result[permute_dims.{x,y,z,w}] = out_tidx.{x,y,z,w}
// The push constant is only read through literal component access (safe); the
// dynamic index is applied to the local result vector (also safe).
// NOTE(review): if permute_dims is not a permutation of 0..3, some components
// of the result are never assigned -- confirm the caller guarantees this.
ivec4 out_tidx_to_in_tidx(const ivec4 out_tidx) {
  ivec4 src_tidx;

  // Writes are kept in x, y, z, w order so the last write wins should two
  // entries of permute_dims ever name the same input dim.
  src_tidx[permute_dims.x] = out_tidx.x;
  src_tidx[permute_dims.y] = out_tidx.y;
  src_tidx[permute_dims.z] = out_tidx.z;
  src_tidx[permute_dims.w] = out_tidx.w;

  return src_tidx;
}

// Reports whether the fast path applies, i.e. whether texels from the input
// tensor can be copied directly into the output tensor. This is the case when
// the packed dimension is preserved by the permutation: the input dim that
// feeds the output's packed dim is the input's packed dim, so a texel read
// from the input holds 4 elements along the same dimension as a texel written
// to the output.
//
// Returns true when permute_dims[out_packed_dim] == in_packed_dim.
bool can_use_fast_path() {
  // Go through safe_idx instead of permute_dims[out_packed_dim] to avoid
  // dynamic indexing of a push constant with a spec-const-derived index.
  return safe_idx(permute_dims, out_packed_dim) == in_packed_dim;
}

void main() {
const ivec3 lpos = ivec3(gl_GlobalInvocationID);
ivec4 out_tidx = lpos_to_tidx(lpos, out_sizes, out_axis_map.w, out_packed_dim);
const ivec3 out_pos = ivec3(gl_GlobalInvocationID);

if (any(greaterThanEqual(out_tidx, out_sizes))) {
if (out_of_bounds(out_pos, outp)) {
return;
}

if (can_use_fast_path()) {
TensorIndex4D out_tidx =
texture_pos_to_tensor4d_idx_simple(outp, out_pos, out_layout);

// Check if packed dimension is preserved in the permutation. Use safe_idx
// to avoid dynamic indexing of push constant with spec-const-derived index.
const bool fast_path =
safe_idx(permute_dims, out_packed_dim) == in_packed_dim;

if (fast_path) {
// Fast path: packed dimension is preserved, so we can copy texels directly
ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
TensorIndex4D in_tidx;
in_tidx.data = in_tidx_data;

write_texel_lpos(t_out, lpos, in_texel, out_axis_map);
}
else {
ivec3 in_pos =
tensor4d_idx_to_texel_pos_simple(inp, in_tidx, in_layout);
VEC4_T in_texel = texelFetch(t_in, in_pos, 0);

imageStore(t_out, out_pos, in_texel);
} else {
// Slow path: packed dimension is not preserved, so each element of the
// output texel may be "sourced" from a different texel in the input tensor.
// Therefore each output texel element is processed individually.
// output texel may come from a different texel in the input tensor.
VEC4_T out_texel = VEC4_T(0);

for (int texel_i = 0; texel_i < 4; ++texel_i) {
ivec4 in_tidx = out_tidx_to_in_tidx(out_tidx);
ivec3 in_pos = tidx_to_pos(in_tidx, in_sizes, in_axis_map, in_packed_dim);
int element_idx = in_tidx[in_packed_dim] % 4;
for (int comp = 0; comp < 4; comp++) {
ivec4 in_tidx_data = out_tidx_to_in_tidx(out_tidx.data);
TensorIndex4D in_tidx;
in_tidx.data = in_tidx_data;

VEC4_T in_texel = VEC4_T(load_texel(t_in, in_pos));
T selected_value = T(in_texel[element_idx]);
TextureElementIndex in_elem =
tensor4d_idx_to_texture_element_idx_simple(inp, in_tidx, in_layout);

out_texel[texel_i] = selected_value;
VEC4_T in_texel = texelFetch(t_in, in_elem.pos, 0);
out_texel[comp] = in_texel[in_elem.comp];

out_tidx[out_packed_dim]++;
out_tidx.data[out_packed_dim]++;
}

write_texel_lpos(t_out, lpos, out_texel, out_axis_map);
imageStore(t_out, out_pos, out_texel);
}
}
13 changes: 7 additions & 6 deletions backends/vulkan/runtime/graph/ops/glsl/reduce.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ layout(constant_id = 5) const int group_dim = 1;
shared vec4 shared_vecs[MAX_NTHREADS];

#include "indexing_utils.h"
#include "indexing.glslh"

int tid_to_smi(const ivec2 tid) {
return tid.x + tid.y * NWORKERS;
Expand Down Expand Up @@ -95,7 +96,7 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {
scan_pos[reduce_dim] = tid.x;
// Partially accumulate over elements i, i + NWORKERS, i + 2*NWORKERS, ... of
// the reduction row
for (int i = tid.x; i < tin_sizes[reduce_dim];
for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim);
i += NWORKERS, scan_pos[reduce_dim] += NWORKERS) {
accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
}
Expand All @@ -115,11 +116,11 @@ void reduce_nonpacked_dim(const ivec2 tid, ivec3 scan_pos) {

// Determine if there are any padding elements in the final texel of the
// packed dimension
const int nspill = mod4(tin_sizes[packed_dim]);
const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
// Detect if this thread is working on the final texels of the packed
// dimension, which may have padding elements
const bool is_last_texel =
scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);

// Explicitly set padding elements to 0
if (is_last_texel && nspill > 0) {
Expand All @@ -145,10 +146,10 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
const int smi = tid_to_smi(tid);

// Number of non-padding elements in the last texel in the reduction row
const int nspill = mod4(tin_sizes[packed_dim]);
const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
// Only reduce up to the last "complete" texel. The last texel will need to be
// handled specially if it has padding elements.
const int reduce_len = tin_sizes[packed_dim] - nspill;
const int reduce_len = safe_idx(tin_sizes, packed_dim) - nspill;

scan_pos[reduce_dim] = 0;
vec4 accum = INIT_ACCUM(vec4(load_texel(tin, scan_pos).x));
Expand All @@ -163,7 +164,7 @@ void reduce_packed_dim(const ivec2 tid, ivec3 scan_pos) {
// For the last texel in the dim, if there are padding elements then each
// element of the texel needs to be processed individually such that the
// padding elements are ignored
if (scan_pos[reduce_dim] == tin_limits[reduce_dim] - 1 && nspill > 0) {
if (scan_pos[reduce_dim] == safe_idx(tin_limits, reduce_dim) - 1 && nspill > 0) {
const vec4 intex = load_texel(tin, scan_pos);
for (int i = 0; i < nspill; i++) {
accum.x = UPDATE_ACCUM(accum.x, intex[i]);
Expand Down
11 changes: 6 additions & 5 deletions backends/vulkan/runtime/graph/ops/glsl/reduce2d.glsl
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ layout(constant_id = 6) const int group_dim = 2;
shared vec4 shared_vecs[MAX_NTHREADS];

#include "indexing_utils.h"
#include "indexing.glslh"

int tid_to_smi(const ivec2 tid) {
return tid.x + tid.y * NWORKERS;
Expand All @@ -68,12 +69,12 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) {

// First dimension reduction
scan_pos[reduce_dim1] = tid.x;
for (int i = tid.x; i < tin_sizes[reduce_dim1];
for (int i = tid.x; i < safe_idx(tin_sizes, reduce_dim1);
i += NWORKERS, scan_pos[reduce_dim1] += NWORKERS) {

// Second dimension reduction
scan_pos[reduce_dim2] = 0;
for (int j = 0; j < tin_sizes[reduce_dim2]; j++, scan_pos[reduce_dim2]++) {
for (int j = 0; j < safe_idx(tin_sizes, reduce_dim2); j++, scan_pos[reduce_dim2]++) {
accum = UPDATE_ACCUM(accum, load_texel(tin, scan_pos));
}
}
Expand All @@ -93,11 +94,11 @@ void reduce_2d_non_packed_dim(const ivec2 tid, ivec3 scan_pos) {

// Determine if there are any padding elements in the final texel of the
// packed dimension
const int nspill = mod4(tin_sizes[packed_dim]);
const int nspill = mod4(safe_idx(tin_sizes, packed_dim));
// Detect if this thread is working on the final texels of the packed
// dimension, which may have padding elements
const bool is_last_texel =
scan_pos[packed_dim] == (tin_limits[packed_dim] - 1);
const bool is_last_texel =
scan_pos[packed_dim] == (safe_idx(tin_limits, packed_dim) - 1);

// Explicitly set padding elements to 0
if (is_last_texel && nspill > 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void main() {
VEC4_T out_texel = VEC4_T(0);

const int limit = min(
4, out_meta.sizes[packed_dim] - out_tidx.data[packed_dim]);
4, safe_idx(out_meta.sizes, packed_dim) - out_tidx.data[packed_dim]);
for (int comp = 0; comp < limit; comp++) {
TensorIndex4D in_tidx = out_tidx;
in_tidx.data = ivec4(
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/select.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) {

int adjusted_index = index;
if (index < 0) {
adjusted_index = index + inp.sizes[selected_dim];
adjusted_index = index + safe_idx(inp.sizes, selected_dim);
}

// Handle different dimensions for selection
Expand Down
2 changes: 1 addition & 1 deletion backends/vulkan/runtime/graph/ops/glsl/slice.glslh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ TensorIndex4D out_tidx_to_in_tidx(const TensorIndex4D out_tidx) {

int adjusted_start = start;
if (start < 0) {
adjusted_start = start + inp.sizes[selected_dim];
adjusted_start = start + safe_idx(inp.sizes, selected_dim);
}

in_tidx.data[selected_dim] = adjusted_start + out_tidx.data[selected_dim] * step;
Expand Down
Loading
Loading