diff --git a/.github/workflows/cuda-pc-sampling.yml b/.github/workflows/cuda-pc-sampling.yml
new file mode 100644
index 00000000000..10eb8183abd
--- /dev/null
+++ b/.github/workflows/cuda-pc-sampling.yml
@@ -0,0 +1,96 @@
+# Runs the full suite of CUDA micro-benchmarks on a GPU runner under NVIDIA CUDA
+# PC sampling, collected by Polar Signals (parca-agent + the parcagpu CUPTI shim).
+#
+# Unlike `codspeed.yml`, this workflow does NOT use Codspeed. It runs every CUDA
+# benchmark directly (`cargo bench`) so the GPU work is captured as continuous
+# profiling data in Polar Signals rather than as wall-time regression numbers.
+#
+# See https://www.polarsignals.com/blog/posts/2026/06/10/nvidia-cuda-pc-sampling
+name: CUDA PC Sampling
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/develop' }}
+
+on:
+  pull_request:
+    paths:
+      - "vortex-cuda/**"
+      - ".github/workflows/cuda-pc-sampling.yml"
+  workflow_dispatch: { }
+
+permissions:
+  contents: read
+
+jobs:
+  cuda-pc-sampling:
+    # Profiling uploads to Polar Signals Cloud, which needs repo secrets that are
+    # unavailable to forked PRs, and the GPU runner is internal-only.
+    if: github.repository == 'vortex-data/vortex' && github.event.pull_request.head.repo.fork == false
+    name: "CUDA micro-benchmarks (PC sampling)"
+    timeout-minutes: 60
+    runs-on: >-
+      runs-on=${{ github.run_id }}/family=g5/cpu=8/image=ubuntu24-gpu-x64/tag=cuda-pc-sampling
+    steps:
+      - uses: runs-on/action@v2
+        with:
+          sccache: s3
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6
+      - uses: ./.github/actions/setup-rust
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Display NVIDIA SMI details
+        run: |
+          nvidia-smi
+          nvidia-smi -L
+          nvidia-smi -q -d Memory
+
+      - name: Build parcagpu CUPTI shim
+        run: |
+          set -Eeu -o pipefail
+          sudo apt-get update
+          # cmake to configure the build; systemtap-sdt-dev provides the `dtrace`
+          # probe generator that parcagpu's USDT probes require.
+          sudo apt-get install -y --no-install-recommends cmake systemtap-sdt-dev
+          # parcagpu vendors its `proton` dependency as a git submodule.
+          git clone --depth 1 --branch v0.3.0 --recurse-submodules --shallow-submodules \
+            https://github.com/parca-dev/parcagpu.git
+          make -C parcagpu local
+          shim="$(find "$PWD/parcagpu" -name libparcagpucupti.so -print -quit)"
+          if [[ -z "$shim" ]]; then
+            echo "::error::Could not locate libparcagpucupti.so after building parcagpu"
+            exit 1
+          fi
+          echo "PARCAGPU_LIB=$shim" >> "$GITHUB_ENV"
+
+      - name: Build CUDA benchmarks
+        env:
+          # Emit device-code line info so Polar Signals can symbolize PC samples.
+          VORTEX_CUDA_LINEINFO: "1"
+        run: cargo bench -p vortex-cuda --no-run
+
+      - name: Setup Polar Signals
+        uses: polarsignals/gh-actions-ps-profiling@68ae857e375a826606352016e5b90f01a2a7ff7a # v0.8.1
+        with:
+          polarsignals_cloud_token: ${{ secrets.POLAR_SIGNALS_API_KEY }}
+          # GPU PC sampling support landed in parca-agent v0.48.0.
+          parca_agent_version: "0.48.0"
+          # `github.ref_name` is `<pr_number>/merge` on pull_request events, so prefer
+          # the PR's source branch and fall back to the ref for manual dispatches.
+          labels: "branch=${{ github.head_ref || github.ref_name }};gh_run_id=${{ github.run_id }};benchmark=cuda-pc-sampling"
+          project_uuid: "e5d846e1-b54c-46e7-9174-8bf055a3af56"
+
+      - name: Run CUDA micro-benchmarks under PC sampling
+        env:
+          RUST_BACKTRACE: full
+          VORTEX_CUDA_LINEINFO: "1"
+          # The parcagpu CUPTI shim is injected into every CUDA process via the
+          # standard CUDA injection hook; it exposes PC samples as USDT probes that
+          # the Polar Signals parca-agent consumes over eBPF.
+          CUDA_INJECTION64_PATH: ${{ env.PARCAGPU_LIB }}
+          # Enable hardware PC sampling, targeting ~100 samples/sec, with the
+          # blog's recommended sampling factor of 20 (2^20 GPU cycles per sample).
+          PARCAGPU_PC_SAMPLING_RATE: "100"
+          PARCAGPU_SAMPLING_FACTOR: "20"
+        run: cargo bench -p vortex-cuda
diff --git a/vortex-cuda/build.rs b/vortex-cuda/build.rs
index e6024d3c275..e5462d68a0c 100644
--- a/vortex-cuda/build.rs
+++ b/vortex-cuda/build.rs
@@ -42,6 +42,7 @@ fn main() {
     );
 
     println!("cargo:rerun-if-env-changed=PROFILE");
+    println!("cargo:rerun-if-env-changed=VORTEX_CUDA_LINEINFO");
 
     // Regenerate bit_unpack kernels only when the generator changes
     println!(
@@ -146,6 +147,18 @@ fn nvcc_compile_ptx(
         // - synchronize: https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html#using-synccheck
     } else {
         cmd.arg("-O3");
+
+        // Generate line-number information for device code when requested. This lets
+        // PC-sampling profilers (e.g. CUDA PC sampling collected via Polar Signals)
+        // symbolize samples back to source. `-lineinfo` does not affect execution
+        // performance; gate it behind an env var so default release builds are
+        // unchanged. An unset, empty, or `0` value leaves line info disabled so that
+        // `VORTEX_CUDA_LINEINFO=0` does not unexpectedly turn the flag on.
+        let lineinfo_requested = env::var_os("VORTEX_CUDA_LINEINFO")
+            .is_some_and(|value| !value.is_empty() && value != "0");
+        if lineinfo_requested {
+            cmd.arg("-lineinfo");
+        }
     }
 
     // Output PTX file goes to output_dir with same base name