Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions python/tokenspeed/runtime/layers/logits_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,9 +537,8 @@ def fused_softcap_kernel(
# Perform operations in-place
x = x / softcapping_value

# Manual tanh implementation using exp
exp2x = tl.exp(2 * x)
x = (exp2x - 1) / (exp2x + 1)
# Stable tanh form; the exp ratio overflows to inf/inf for large logits.
x = 2 * tl.sigmoid(2 * x) - 1

x = x * softcapping_value

Expand Down
35 changes: 35 additions & 0 deletions test/runtime/test_logits_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Regression tests for logits processing helpers."""

from __future__ import annotations

import os
import sys

# CI Registration (parsed via AST, runtime no-op)
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from ci_system.ci_register import register_cuda_ci

register_cuda_ci(est_time=90, suite="runtime-1gpu")

import pytest
import torch

from tokenspeed.runtime.layers.logits_processor import fused_softcap

pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")


def test_fused_softcap_handles_large_logits_without_nan():
cap = 30.0
logits = torch.tensor(
[[5000.0, 2000.0, 1500.0, 100.0, 0.0, -100.0, -1500.0, -5000.0]],
device="cuda",
dtype=torch.float32,
)
expected = cap * torch.tanh(logits / cap)

out = fused_softcap(logits.clone(), cap)
torch.cuda.synchronize()

assert torch.isfinite(out).all()
torch.testing.assert_close(out, expected, rtol=1e-5, atol=2e-5)
Loading