Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions ggml/src/ggml-sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,11 @@ endif()

if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
if (NOT GGML_SYCL_DEVICE_ARCH)
target_link_options(ggml-sycl PRIVATE -Xs -ze-intel-greater-than-4GB-buffer-required)
else()
message(STATUS "Skipping -ze-intel-greater-than-4GB-buffer-required for spir64_gen AOT")
endif()

# Link against Intel oneMKL
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
Expand All @@ -160,7 +164,15 @@ if (GGML_SYCL_HOST_MEM_FALLBACK)
endif()

if (GGML_SYCL_DEVICE_ARCH)
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
message(STATUS "GGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} (AOT via spir64_gen)")
target_compile_options(
ggml-sycl PRIVATE
-fsycl-targets=spir64_gen
"SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
)
target_link_options(
ggml-sycl PRIVATE
-fsycl-targets=spir64_gen
"SHELL:-Xsycl-target-backend=spir64_gen \"-device ${GGML_SYCL_DEVICE_ARCH}\""
)
endif()

29 changes: 27 additions & 2 deletions ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,23 @@ static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int64_t k,
#endif
}

template <typename dst_t>
static void dequantize_row_q5_K_sycl_reorder(const void * vx, dst_t * y, const int64_t k, dpct::queue_ptr stream) {
const int64_t nb = k / QK_K;

dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });

stream->submit([&](sycl::handler & cgh) {
sycl::local_accessor<uint8_t, 1> scale_local_acc(sycl::range<1>(K_SCALE_SIZE), cgh);

cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)),
[=](sycl::nd_item<3> item_ct1) {
dequantize_block_q5_K_reorder(vx, y, get_pointer(scale_local_acc), item_ct1, nb);
});
});
}

template <typename dst_t>
static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
Expand Down Expand Up @@ -650,7 +667,11 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
return dequantize_row_q4_K_sycl;
}
case GGML_TYPE_Q5_K:
return dequantize_row_q5_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q5_K_sycl_reorder;
} else {
return dequantize_row_q5_K_sycl;
}
case GGML_TYPE_Q6_K:
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q6_K_sycl_reorder;
Expand Down Expand Up @@ -725,7 +746,11 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
return dequantize_row_q4_K_sycl;
}
case GGML_TYPE_Q5_K:
return dequantize_row_q5_K_sycl;
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q5_K_sycl_reorder;
} else {
return dequantize_row_q5_K_sycl;
}
case GGML_TYPE_Q6_K:
if (dst->src[0]->extra && ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q6_K_sycl_reorder;
Expand Down
143 changes: 143 additions & 0 deletions ggml/src/ggml-sycl/cumsum.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#include "cumsum.hpp"
#include "common.hpp"

#include <algorithm>

#define SYCL_CUMSUM_BLOCK_SIZE 256

static __dpct_inline__ float warp_prefix_inclusive_sum_f32(float x, const sycl::nd_item<3> & item) {
return sycl::inclusive_scan_over_group(item.get_sub_group(), x, sycl::plus<float>());
}

static void cumsum_f32_kernel(
const float * __restrict__ src, float * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t d1, const int64_t d2, const int64_t d3,
const sycl::nd_item<3> & item, float * smem) {

const int tid = item.get_local_id(2);
const int block_size = item.get_local_range(2);
const int lane = tid % WARP_SIZE;
const int warp = tid / WARP_SIZE;
const int warps_per_block = block_size / WARP_SIZE;

float * s_vals = smem;
float * s_warp_sums = smem + block_size;
float * s_carry = smem + block_size + warps_per_block;

if (tid == 0) {
s_carry[0] = 0.0f;
}
item.barrier(sycl::access::fence_space::local_space);

const int64_t i3 = item.get_group(0);
const int64_t i2 = item.get_group(1);
const int64_t i1 = item.get_group(2);
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}

const float * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
float * dst_row = dst + i1 * d1 + i2 * d2 + i3 * d3;

constexpr int num_unroll = 4;
float temp[num_unroll];

for (int64_t i = 0; i < ne00; i += num_unroll * block_size) {
int64_t idx = i + tid * num_unroll;

temp[0] = (idx < ne00 ? src_row[idx] : 0.0f);
#pragma unroll
for (int j = 1; j < num_unroll; j++) {
temp[j] = temp[j - 1];
if (idx + j < ne00) {
temp[j] += src_row[idx + j];
}
}

float val = (idx < ne00) ? temp[num_unroll - 1] : 0.0f;

val = warp_prefix_inclusive_sum_f32(val, item);
s_vals[tid] = val;

if (lane == WARP_SIZE - 1) {
s_warp_sums[warp] = val;
}
item.barrier(sycl::access::fence_space::local_space);

if (warp == 0) {
float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
float inc = warp_prefix_inclusive_sum_f32(w, item);
if (tid < warps_per_block) {
s_warp_sums[tid] = inc - w;
}
if (tid == warps_per_block - 1) {
s_carry[1] = inc;
}
}
item.barrier(sycl::access::fence_space::local_space);

float carry = s_carry[0];
float final_offset = s_vals[tid] + s_warp_sums[warp] + carry - temp[num_unroll - 1];

#pragma unroll
for (int j = 0; j < num_unroll; j++) {
if (idx + j < ne00) {
dst_row[idx + j] = temp[j] + final_offset;
}
}

item.barrier(sycl::access::fence_space::local_space);

if (tid == 0) {
s_carry[0] += s_carry[1];
}
}
}

void ggml_sycl_op_cumsum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);

dpct::queue_ptr stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));

const float * src_d = static_cast<const float *>(src0->data);
float * dst_d = static_cast<float *>(dst->data);

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];

const size_t ts = sizeof(float);
const int64_t s01 = src0->nb[1] / ts;
const int64_t s02 = src0->nb[2] / ts;
const int64_t s03 = src0->nb[3] / ts;
const int64_t d1 = dst->nb[1] / ts;
const int64_t d2 = dst->nb[2] / ts;
const int64_t d3 = dst->nb[3] / ts;

const int num_warps = (ne00 + WARP_SIZE - 1) / WARP_SIZE;
int block_size = num_warps * WARP_SIZE;
block_size = std::min(block_size, SYCL_CUMSUM_BLOCK_SIZE);
const int warps_per_block = block_size / WARP_SIZE;
const int smem_size = block_size + warps_per_block + 2;

const sycl::range<3> grid(ne03, ne02, ne01);
const sycl::range<3> block(1, 1, block_size);

stream->submit([&](sycl::handler & cgh) {
sycl::local_accessor<float, 1> smem_acc(sycl::range<1>(smem_size), cgh);
cgh.parallel_for(
sycl::nd_range<3>(grid * block, block),
[=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
cumsum_f32_kernel(src_d, dst_d, ne00, ne01, ne02, ne03,
s01, s02, s03, d1, d2, d3,
item, get_pointer(smem_acc));
});
});
}
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/cumsum.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

#include "common.hpp"

void ggml_sycl_op_cumsum(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
57 changes: 57 additions & 0 deletions ggml/src/ggml-sycl/dequantize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,6 +537,63 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
#endif
}

template <typename dst_t>
static void dequantize_block_q5_K_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy,
uint8_t * scales_local, const sycl::nd_item<3> & item_ct1, int64_t n_blocks) {
const int64_t ib = item_ct1.get_group(2);

#if QK_K == 256
// assume 64 threads
const int64_t tid = item_ct1.get_local_id(2);
const int64_t il = tid / 16; // 0...3
const int64_t ir = tid % 16; // 0...15
const int64_t is = 2 * il;

dst_t * y = yy + ib * QK_K + 64 * il + 2 * ir;

const uint8_t * base = static_cast<const uint8_t *>(vx);

// Reordered layout: [qs (QK_K/2 per block)] [qh (QK_K/8 per block)] [scales (K_SCALE_SIZE per block)] [dm (half2 per block)]
const size_t qs_offset = ib * (QK_K / 2);
const size_t qh_offset = n_blocks * (QK_K / 2) + ib * (QK_K / 8);
const size_t scales_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + ib * K_SCALE_SIZE;
const size_t dm_offset = n_blocks * (QK_K / 2) + n_blocks * (QK_K / 8) + n_blocks * K_SCALE_SIZE + ib * sizeof(ggml_half2);

const uint8_t * qs_ptr = base + qs_offset;
const uint8_t * qh_ptr = base + qh_offset;
const uint8_t * scales_ptr = base + scales_offset;
const ggml_half2 dm_values = *reinterpret_cast<const ggml_half2 *>(base + dm_offset);

const float dall = dm_values.x();
const float dmin = dm_values.y();

const uint8_t * ql = qs_ptr + 32 * il + 2 * ir;
const uint8_t * qh = qh_ptr + 2 * ir;

if (tid < K_SCALE_SIZE) {
scales_local[tid] = scales_ptr[tid];
}

item_ct1.barrier(sycl::access::fence_space::local_space);

uint8_t sc, m;
get_scale_min_k4(is + 0, scales_local, sc, m);
const float d1 = dall * sc; const float m1 = dmin * m;
get_scale_min_k4(is + 1, scales_local, sc, m);
const float d2 = dall * sc; const float m2 = dmin * m;

uint8_t hm = 1 << (2 * il);
y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
hm <<= 1;
y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
#else
GGML_UNUSED(ib); GGML_UNUSED(tid); GGML_UNUSED(yy); GGML_UNUSED(scales_local); GGML_UNUSED(n_blocks);
GGML_ABORT("Q5_K reorder dequantize not supported for QK_K != 256");
#endif
}

template<typename dst_t>
static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
const sycl::nd_item<3> &item_ct1) {
Expand Down
62 changes: 62 additions & 0 deletions ggml/src/ggml-sycl/diag.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "diag.hpp"
#include "common.hpp"

#define SYCL_DIAG_BLOCK_SIZE 256

template <typename T>
static void diag_kernel(T * __restrict__ dst, const T * __restrict__ src,
const int64_t ne0, const int64_t ne1,
const int64_t ne2, const int64_t ne3,
const int64_t total_elements,
const sycl::nd_item<1> & item) {
const int64_t i = item.get_global_id(0);
if (i >= total_elements) {
return;
}

const int64_t i0 = i % ne0;
const int64_t i1 = (i / ne0) % ne1;
const int64_t i2 = (i / (ne0 * ne1)) % ne2;
const int64_t i3 = i / (ne0 * ne1 * ne2);

const int64_t dst_idx = ((i3 * ne2 + i2) * ne1 + i1) * ne0 + i0;

if (i0 == i1) {
const int64_t batch_idx = i3 * ne2 + i2;
dst[dst_idx] = src[batch_idx * ne0 + i0];
} else {
dst[dst_idx] = T(0);
}

(void)ne3;
}

void ggml_sycl_op_diag(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];

GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(src0->ne[1] == 1);

dpct::queue_ptr stream = ctx.stream();
SYCL_CHECK(ggml_sycl_set_device(ctx.device));

const void * src0_d = src0->data;
void * dst_d = dst->data;

const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t ne3 = dst->ne[3];
const int64_t n_elems = ggml_nelements(dst);
const int64_t num_blocks = (n_elems + SYCL_DIAG_BLOCK_SIZE - 1) / SYCL_DIAG_BLOCK_SIZE;

GGML_ASSERT(dst->type == GGML_TYPE_F32);
stream->parallel_for(
sycl::nd_range<1>(num_blocks * SYCL_DIAG_BLOCK_SIZE, SYCL_DIAG_BLOCK_SIZE),
[=](sycl::nd_item<1> item) {
diag_kernel(static_cast<float *>(dst_d),
static_cast<const float *>(src0_d),
ne0, ne1, ne2, ne3, n_elems, item);
});
}
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/diag.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

#include "common.hpp"

void ggml_sycl_op_diag(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
Loading
Loading