|
8 | 8 |
|
9 | 9 | // W6A8 dp4a matvec for packed INT6 decode (M <= 4), used for GGUF Q6_K weights. |
10 | 10 | // |
11 | | -// Reads a genuine 6-bit packed weight (CudaDp4aPlanarInt6Tensor format), split into |
12 | | -// two planes: |
| 11 | +// Reads a genuine 6-bit packed weight (CudaDp4aPlanarInt6Tensor format), split |
| 12 | +// into two planes: |
13 | 13 | // ql : [N, K/2] uint8 — low-nibble plane, nibble-packed even/odd exactly |
14 | 14 | // like the INT4 path (ql[:,j] = lo[:,2j] | (lo[:,2j+1] << 4)). |
15 | 15 | // qh : [N, K/4] uint8 — high-2-bit plane, 4 values/byte, arranged per |
16 | 16 | // 32-weight chunk as hi_even_packed[4] then hi_odd_packed[4] (each |
17 | | -// byte holds the four 2-bit highs of one dp4a word in even/odd order). |
| 17 | +// byte holds the four 2-bit highs of one dp4a word in even/odd |
| 18 | +// order). |
18 | 19 | // scale : [N, K/gs] bf16 — per-group scales, row-major (coalesced; no zero). |
19 | | -// The stored 6-bit value is u = q + 32 in [0, 63] (q in [-32, 31]); the constant |
20 | | -// -32 offset is applied in the kernel, so Q6_K's symmetry means NO zero tensor. |
| 20 | +// The stored 6-bit value is u = q + 32 in [0, 63] (q in [-32, 31]); the |
| 21 | +// constant -32 offset is applied in the kernel, so Q6_K's symmetry means NO |
| 22 | +// zero tensor. |
21 | 23 | // |
22 | | -// Dynamically quantizes bf16 activations to INT8 (per-32-element blocks, even/odd |
23 | | -// order, identical to the INT4 path), reconstructs full 6-bit weight bytes per |
24 | | -// dp4a word (vfull = vi_lo | (spread2(hi_byte) << 4)), and uses dp4a for fused |
25 | | -// int6xint8 dot products with vectorized weight loads and warp-cooperative |
26 | | -// quantization. |
| 24 | +// Dynamically quantizes bf16 activations to INT8 (per-32-element blocks, |
| 25 | +// even/odd order, identical to the INT4 path), reconstructs full 6-bit weight |
| 26 | +// bytes per dp4a word (vfull = vi_lo | (spread2(hi_byte) << 4)), and uses dp4a |
| 27 | +// for fused int6xint8 dot products with vectorized weight loads and |
| 28 | +// warp-cooperative quantization. |
27 | 29 | // |
28 | 30 | // Symbol names are suffixed _i6 / distinct from int4_plain_mm.cuh and |
29 | 31 | // int8_plain_mm.cuh so all three translation units can be linked together |
@@ -80,9 +82,10 @@ __device__ __forceinline__ uint32_t spread2_i6(uint32_t b) { |
80 | 82 | // blocks, EVEN/ODD order — identical to the INT4 path's Q8Block). |
81 | 83 | // --------------------------------------------------------------------------- |
82 | 84 |
|
83 | | -// alignas(16) pads sizeof(Q8Block_i6) to 48 so each block (and its qs_even/qs_odd |
84 | | -// 16-byte halves) is 16-byte aligned, allowing two vectorized uint4 loads of a |
85 | | -// block's int8 activations instead of eight scalar int32 loads. |
| 85 | +// alignas(16) pads sizeof(Q8Block_i6) to 48 so each block (and its |
| 86 | +// qs_even/qs_odd 16-byte halves) is 16-byte aligned, allowing two vectorized |
| 87 | +// uint4 loads of a block's int8 activations instead of eight scalar int32 |
| 88 | +// loads. |
86 | 89 | struct alignas(16) Q8Block_i6 { |
87 | 90 | int8_t qs_even[Q8_BLOCK_SIZE_I6 / 2]; |
88 | 91 | int8_t qs_odd[Q8_BLOCK_SIZE_I6 / 2]; |
@@ -175,7 +178,8 @@ __global__ void __launch_bounds__(MV6_THREADS) int6_w6a8_matvec_kernel( |
175 | 178 | uint2 qh_chunk = __ldg(&qhrow8[i]); |
176 | 179 | int32_t k_base = i * 32; |
177 | 180 | uint32_t words[4] = {packed16.x, packed16.y, packed16.z, packed16.w}; |
178 | | - // qh_chunk.x bytes = hi_even_packed[0..3], qh_chunk.y = hi_odd_packed[0..3]. |
| 181 | + // qh_chunk.x bytes = hi_even_packed[0..3], qh_chunk.y = |
| 182 | + // hi_odd_packed[0..3]. |
179 | 183 | uint32_t hi_even_word = qh_chunk.x; |
180 | 184 | uint32_t hi_odd_word = qh_chunk.y; |
181 | 185 |
|
|
0 commit comments