Skip to content

Commit a5d8ee5

Browse files
committed
lint
1 parent b000aea commit a5d8ee5

1 file changed

Lines changed: 18 additions & 14 deletions

File tree

backends/cuda/runtime/shims/int6_plain_mm.cuh

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,24 @@
88

99
// W6A8 dp4a matvec for packed INT6 decode (M <= 4), used for GGUF Q6_K weights.
1010
//
11-
// Reads a genuine 6-bit packed weight (CudaDp4aPlanarInt6Tensor format), split into
12-
// two planes:
11+
// Reads a genuine 6-bit packed weight (CudaDp4aPlanarInt6Tensor format), split
12+
// into two planes:
1313
// ql : [N, K/2] uint8 — low-nibble plane, nibble-packed even/odd exactly
1414
// like the INT4 path (ql[:,j] = lo[:,2j] | (lo[:,2j+1] << 4)).
1515
// qh : [N, K/4] uint8 — high-2-bit plane, 4 values/byte, arranged per
1616
// 32-weight chunk as hi_even_packed[4] then hi_odd_packed[4] (each
17-
// byte holds the four 2-bit highs of one dp4a word in even/odd order).
17+
// byte holds the four 2-bit highs of one dp4a word in even/odd
18+
// order).
1819
// scale : [N, K/gs] bf16 — per-group scales, row-major (coalesced; no zero).
19-
// The stored 6-bit value is u = q + 32 in [0, 63] (q in [-32, 31]); the constant
20-
// -32 offset is applied in the kernel, so Q6_K's symmetry means NO zero tensor.
20+
// The stored 6-bit value is u = q + 32 in [0, 63] (q in [-32, 31]); the
21+
// constant -32 offset is applied in the kernel, so Q6_K's symmetry means NO
22+
// zero tensor.
2123
//
22-
// Dynamically quantizes bf16 activations to INT8 (per-32-element blocks, even/odd
23-
// order, identical to the INT4 path), reconstructs full 6-bit weight bytes per
24-
// dp4a word (vfull = vi_lo | (spread2(hi_byte) << 4)), and uses dp4a for fused
25-
// int6xint8 dot products with vectorized weight loads and warp-cooperative
26-
// quantization.
24+
// Dynamically quantizes bf16 activations to INT8 (per-32-element blocks,
25+
// even/odd order, identical to the INT4 path), reconstructs full 6-bit weight
26+
// bytes per dp4a word (vfull = vi_lo | (spread2(hi_byte) << 4)), and uses dp4a
27+
// for fused int6xint8 dot products with vectorized weight loads and
28+
// warp-cooperative quantization.
2729
//
2830
// Symbol names are suffixed _i6 / distinct from int4_plain_mm.cuh and
2931
// int8_plain_mm.cuh so all three translation units can be linked together
@@ -80,9 +82,10 @@ __device__ __forceinline__ uint32_t spread2_i6(uint32_t b) {
8082
// blocks, EVEN/ODD order — identical to the INT4 path's Q8Block).
8183
// ---------------------------------------------------------------------------
8284

83-
// alignas(16) pads sizeof(Q8Block_i6) to 48 so each block (and its qs_even/qs_odd
84-
// 16-byte halves) is 16-byte aligned, allowing two vectorized uint4 loads of a
85-
// block's int8 activations instead of eight scalar int32 loads.
85+
// alignas(16) pads sizeof(Q8Block_i6) to 48 so each block (and its
86+
// qs_even/qs_odd 16-byte halves) is 16-byte aligned, allowing two vectorized
87+
// uint4 loads of a block's int8 activations instead of eight scalar int32
88+
// loads.
8689
struct alignas(16) Q8Block_i6 {
8790
int8_t qs_even[Q8_BLOCK_SIZE_I6 / 2];
8891
int8_t qs_odd[Q8_BLOCK_SIZE_I6 / 2];
@@ -175,7 +178,8 @@ __global__ void __launch_bounds__(MV6_THREADS) int6_w6a8_matvec_kernel(
175178
uint2 qh_chunk = __ldg(&qhrow8[i]);
176179
int32_t k_base = i * 32;
177180
uint32_t words[4] = {packed16.x, packed16.y, packed16.z, packed16.w};
178-
// qh_chunk.x bytes = hi_even_packed[0..3], qh_chunk.y = hi_odd_packed[0..3].
181+
// qh_chunk.x bytes = hi_even_packed[0..3], qh_chunk.y =
182+
// hi_odd_packed[0..3].
179183
uint32_t hi_even_word = qh_chunk.x;
180184
uint32_t hi_odd_word = qh_chunk.y;
181185

0 commit comments

Comments
 (0)