"""Minimal reproducer: cuda.bindings.nvvm.nvvmError: ERROR_COMPILATION (9)
Triggered by lineinfo=True on device functions operating on large
nested record/struct types. The combination of:
1. lineinfo=True on >= 3 device functions
2. Large struct arguments (STATE_VAR ~233KB, POLICY ~1KB with nested sub-structs)
3. Non-trivial control flow in the device functions
causes NVVM to fail during IR-to-PTX compilation.
Workarounds that make it compile:
- Set lineinfo=False on all device functions
- Reduce the POLICY struct size (e.g. remove the 'pad' field)
- Inline device_fn_a into device_fn_b (reducing device function count)
"""
import numba
import numpy as np
from numpy import float64, uint8, uint64, int16, int8, int32
from numba import cuda
W = 32 # CUDA warp size
EPSILON = 1e-8
MAX_STEPS = 420
BRANCH_A = 2
BRANCH_B = 4
# ── STATE_VAR (~233 KB) ─────────────────────────────────────────────
RNG_T = [('state', uint64, W), ('inc', uint64, W)]
FLAGS_T = ([(f'f{i}', uint8, W) for i in range(18)]
+ [('canary', uint8, W), ('initialized', uint8, W)]
+ [(f'u{i}', uint8, W) for i in range(4)])
INT16_T = [(f'i{i}', int16, W) for i in range(32)]
FLOAT64_T = ([(f'x{i}', float64, W) for i in range(93)]
+ [('val_a', float64, W), ('val_b', float64, W)]
+ [(f'y{i}', float64, W) for i in range(35)])
PAIR_T = [('lo', float64, W), ('hi', float64, W)]
STATE_VAR_T = np.dtype([
('arr1', float64, (7, W)), ('arr2', float64, (19, W)),
('arr3', float64, (18, W)), ('arr4', float64, (421, W)),
('arr5', float64, (12, W)), ('arr6', float64, (2, W)),
('rng1', RNG_T), ('rng2', RNG_T), ('rng3', RNG_T),
('rands', float64, (9, W)),
('flags', FLAGS_T),
('i16', INT16_T),
('f64', FLOAT64_T),
('vecs', [('v1', float64, (121, W)), ('v2', float64, (140, W))]),
('prev', [('p1', float64, (W,)), ('p2', float64, (W,))]),
('temps', [('out_a', float64, W), ('out_b', float64, W)]),
('pop', PAIR_T),
('grp_a', PAIR_T),
('grp_b', PAIR_T),
('ledger', [('left', PAIR_T), ('right', PAIR_T)]),
('extra', [('sub', PAIR_T)]),
], align=True)
SV_NB = numba.from_dtype(STATE_VAR_T)
# ── POLICY (~1 KB) ──────────────────────────────────────────────────
POL_FLAGS_T = ([(f'f{i}', uint8) for i in range(18)]
+ [('canary', uint8)]
+ [('kind', uint8), ('is_outer', np.bool_)])
POL_T = np.dtype([
('pad', float64, (10,)), # ← removing this makes it compile
('threshold', int32),
('scales', [('floor', float64)]),
('i16_block', [(f'i{i}', int16) for i in range(34)]),
('f64_block', [(f'x{i}', float64) for i in range(93)]),
('start', int32),
('flags', POL_FLAGS_T),
], align=True)
POL_NB = numba.from_dtype(POL_T)
# ── CONSTANT ─────────────────────────────────────────────────────────
CON_T = np.dtype([
('params', [('rate_a', float64), ('spread', float64)]),
('settings', [('mode', int8)]),
], align=True)
CON_NB = numba.from_dtype(CON_T)
# ── device functions (lineinfo=True triggers the bug) ────────────────
@cuda.jit(device=True, lineinfo=True)
def device_fn_a(t, p, scales, params, sv, lane, settings):
rate, earned, cost = 0.0, 0.0, 0.0
fv = sv['f64']
if not (settings['mode'] == BRANCH_A and not p['flags']['is_outer']):
rate = params['rate_a'] if t < p['threshold'] \
else fv['val_b'][lane]
else:
rate = max(earned - cost - params['spread'], scales['floor'])
return rate, earned, cost
@cuda.jit(device=True, lineinfo=True)
def device_fn_b(t, lane, sv, p, params, settings):
bal = 25.0
if bal > EPSILON:
tmp = sv['temps']
kind = p['flags']['kind']
rate, earned, cost = device_fn_a(
t, p, p['scales'], params, sv, lane, settings)
base = bal * sv['f64']['val_a'][lane]
e_amt = base * ((1 + earned) - 1)
c_inv = base * ((1 + cost) - 1)
tmp['out_b'][lane] = c_inv if kind != BRANCH_B else 0
c_rate = base * ((1 + rate) - 1)
tmp['out_a'][lane] = 0.0 if kind == BRANCH_B else (e_amt - c_rate)
@cuda.jit(device=True, lineinfo=True)
def device_fn_c(xpos, ypos, bdim, t, pol, con, sv, n, offset):
if ypos >= n:
return
wpp = bdim // W
wip = ypos >> 5
lane = ypos & 31
p = pol[xpos]
s = sv[xpos * wpp + wip]
sf = s['flags']
params = con[0]['params']
settings = con[0]['settings']
if sf['canary'][lane]:
return
sf['canary'][lane] = True
if min(t + offset, MAX_STEPS) < p['start']:
sf['initialized'][lane] = False
sf['canary'][lane] = False
return
if not sf['initialized'][lane]:
sf['initialized'][lane] = True
sf['canary'][lane] = False
return
device_fn_b(t, lane, s, p, params, settings)
if sf['canary'][lane] == 1:
sf['canary'][lane] = 0
else:
sf['canary'][lane] = 1
# ── kernel ───────────────────────────────────────────────────────────
sig = numba.void(numba.int32, POL_NB[::1], CON_NB[::1],
SV_NB[::1], numba.int32, numba.int32)
@cuda.jit(sig)
def kernel(t, pol, con, sv, n, offset):
device_fn_c(cuda.blockIdx.x, cuda.threadIdx.x, cuda.blockDim.x,
t, pol, con, sv, n, offset)
print(next(iter(kernel.inspect_asm().values())))
Bit of digging suggests it may be a mangling issue: error: Symbol name with unsupported characters
Describe the bug
If you are using a large/complex kernel with multiple complex types, lineinfo can cause it to detonate.
Steps/Code to reproduce bug
See minimal reproducer below
Expected behavior
Lineinfo works the same as not using lineinfo
Environment details (please complete the following information):
Environment: numba-cuda 0.65.1, CUDA 13.2, cuda-bindings 13.2.0
Additional context
Thanks to Q Developer for boiling down that minimal reproducer.
Bit of digging suggests it may be a mangling issue: error: Symbol name with unsupported characters