Skip to content

[BUG] Compilation with lineinfo detonates on large kernels #876

@c200chromebook

Description

@c200chromebook

Describe the bug
If you are using a large/complex kernel with multiple complex types, lineinfo can cause it to detonate.

Steps/Code to reproduce bug
See minimal reproducer below

Expected behavior
Lineinfo works the same as not using lineinfo

Environment details (please complete the following information):

Environment: numba-cuda 0.65.1, CUDA 13.2, cuda-bindings 13.2.0

"""Minimal reproducer: cuda.bindings.nvvm.nvvmError: ERROR_COMPILATION (9)

Triggered by lineinfo=True on device functions operating on large
nested record/struct types. The combination of:
  1. lineinfo=True on >= 3 device functions
  2. Large struct arguments (STATE_VAR ~233KB, POLICY ~1KB with nested sub-structs)
  3. Non-trivial control flow in the device functions
causes NVVM to fail during IR-to-PTX compilation.

Workarounds that make it compile:
  - Set lineinfo=False on all device functions
  - Reduce the POLICY struct size (e.g. remove the 'pad' field)
  - Inline device_fn_a into device_fn_b (reducing device function count)


"""
import numba
import numpy as np
from numpy import float64, uint8, uint64, int16, int8, int32
from numba import cuda

W = 32          # CUDA warp size
EPSILON = 1e-8
MAX_STEPS = 420
BRANCH_A = 2
BRANCH_B = 4


# ── STATE_VAR (~233 KB) ─────────────────────────────────────────────

RNG_T = [('state', uint64, W), ('inc', uint64, W)]

FLAGS_T = ([(f'f{i}', uint8, W) for i in range(18)]
           + [('canary', uint8, W), ('initialized', uint8, W)]
           + [(f'u{i}', uint8, W) for i in range(4)])

INT16_T = [(f'i{i}', int16, W) for i in range(32)]

FLOAT64_T = ([(f'x{i}', float64, W) for i in range(93)]
             + [('val_a', float64, W), ('val_b', float64, W)]
             + [(f'y{i}', float64, W) for i in range(35)])

PAIR_T = [('lo', float64, W), ('hi', float64, W)]

STATE_VAR_T = np.dtype([
    ('arr1', float64, (7, W)),   ('arr2', float64, (19, W)),
    ('arr3', float64, (18, W)),  ('arr4', float64, (421, W)),
    ('arr5', float64, (12, W)),  ('arr6', float64, (2, W)),
    ('rng1', RNG_T), ('rng2', RNG_T), ('rng3', RNG_T),
    ('rands', float64, (9, W)),
    ('flags', FLAGS_T),
    ('i16', INT16_T),
    ('f64', FLOAT64_T),
    ('vecs', [('v1', float64, (121, W)), ('v2', float64, (140, W))]),
    ('prev', [('p1', float64, (W,)), ('p2', float64, (W,))]),
    ('temps', [('out_a', float64, W), ('out_b', float64, W)]),
    ('pop', PAIR_T),
    ('grp_a', PAIR_T),
    ('grp_b', PAIR_T),
    ('ledger', [('left', PAIR_T), ('right', PAIR_T)]),
    ('extra', [('sub', PAIR_T)]),
], align=True)
SV_NB = numba.from_dtype(STATE_VAR_T)


# ── POLICY (~1 KB) ──────────────────────────────────────────────────

POL_FLAGS_T = ([(f'f{i}', uint8) for i in range(18)]
               + [('canary', uint8)]
               + [('kind', uint8), ('is_outer', np.bool_)])

POL_T = np.dtype([
    ('pad', float64, (10,)),          # ← removing this makes it compile
    ('threshold', int32),
    ('scales', [('floor', float64)]),
    ('i16_block', [(f'i{i}', int16) for i in range(34)]),
    ('f64_block', [(f'x{i}', float64) for i in range(93)]),
    ('start', int32),
    ('flags', POL_FLAGS_T),
], align=True)
POL_NB = numba.from_dtype(POL_T)


# ── CONSTANT ─────────────────────────────────────────────────────────

CON_T = np.dtype([
    ('params', [('rate_a', float64), ('spread', float64)]),
    ('settings', [('mode', int8)]),
], align=True)
CON_NB = numba.from_dtype(CON_T)


# ── device functions (lineinfo=True triggers the bug) ────────────────

@cuda.jit(device=True, lineinfo=True)
def device_fn_a(t, p, scales, params, sv, lane, settings):
    rate, earned, cost = 0.0, 0.0, 0.0
    fv = sv['f64']
    if not (settings['mode'] == BRANCH_A and not p['flags']['is_outer']):
        rate = params['rate_a'] if t < p['threshold'] \
            else fv['val_b'][lane]
    else:
        rate = max(earned - cost - params['spread'], scales['floor'])
    return rate, earned, cost


@cuda.jit(device=True, lineinfo=True)
def device_fn_b(t, lane, sv, p, params, settings):
    bal = 25.0
    if bal > EPSILON:
        tmp = sv['temps']
        kind = p['flags']['kind']
        rate, earned, cost = device_fn_a(
            t, p, p['scales'], params, sv, lane, settings)
        base = bal * sv['f64']['val_a'][lane]
        e_amt = base * ((1 + earned) - 1)
        c_inv = base * ((1 + cost) - 1)
        tmp['out_b'][lane] = c_inv if kind != BRANCH_B else 0
        c_rate = base * ((1 + rate) - 1)
        tmp['out_a'][lane] = 0.0 if kind == BRANCH_B else (e_amt - c_rate)


@cuda.jit(device=True, lineinfo=True)
def device_fn_c(xpos, ypos, bdim, t, pol, con, sv, n, offset):
    if ypos >= n:
        return
    wpp = bdim // W
    wip = ypos >> 5
    lane = ypos & 31
    p = pol[xpos]
    s = sv[xpos * wpp + wip]
    sf = s['flags']
    params = con[0]['params']
    settings = con[0]['settings']
    if sf['canary'][lane]:
        return
    sf['canary'][lane] = True
    if min(t + offset, MAX_STEPS) < p['start']:
        sf['initialized'][lane] = False
        sf['canary'][lane] = False
        return
    if not sf['initialized'][lane]:
        sf['initialized'][lane] = True
        sf['canary'][lane] = False
        return
    device_fn_b(t, lane, s, p, params, settings)
    if sf['canary'][lane] == 1:
        sf['canary'][lane] = 0
    else:
        sf['canary'][lane] = 1


# ── kernel ───────────────────────────────────────────────────────────

sig = numba.void(numba.int32, POL_NB[::1], CON_NB[::1],
                 SV_NB[::1], numba.int32, numba.int32)

@cuda.jit(sig)
def kernel(t, pol, con, sv, n, offset):
    device_fn_c(cuda.blockIdx.x, cuda.threadIdx.x, cuda.blockDim.x,
                t, pol, con, sv, n, offset)

print(next(iter(kernel.inspect_asm().values())))

Additional context
Thanks to Q Developer for boiling down that minimal reproducer.

Bit of digging suggests it may be a mangling issue: error: Symbol name with unsupported characters

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions