Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,12 @@ make check-env
make bootstrap
PYTHONPATH=python python3 scripts/check_env.py --json
```

## Tutorial

If you are new to this repository, start with the Chinese tutorial under
[`tutorial/`](./tutorial/README.md). It explains the full
`PTO-DSL -> PTOAS -> PTO-ISA -> Bisheng -> .so -> benchmark` workflow,
shows minimal PTODSL examples, and walks through real kernels such as
`grouped_matmul`, `flash_attention_score`, and `moe_token_permute` with
current performance data from `bench/reports/regression_latest.md`.
118 changes: 118 additions & 0 deletions bench/adapters/ops_transformer/attention/attention_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from __future__ import annotations

import json
import statistics
import time
from pathlib import Path

import torch
import torch_npu

from pto_kernels.bench.adapter_utils import describe_baseline
from pto_kernels.ops.attention.attention_update.runtime import (
VARIANT,
VARIANTS,
make_attention_update_inputs,
run_torch_npu_attention_update,
)


def describe(repo_root, spec):
    """Describe the ``attention_update`` baseline adapter.

    Takes the summary produced by ``describe_baseline`` for the
    ``attention`` / ``attention_update`` entry referenced by
    ``spec.inventory_ref`` and extends it with two keys:

    * ``runtime_entrypoint`` - the name of the runtime call being benchmarked.
    * ``seed_variant`` - the default variant plus every entry of ``VARIANTS``,
      each rendered through its ``as_dict()`` method.

    Returns the extended summary object.
    """
    summary = describe_baseline(
        repo_root,
        "attention",
        "attention_update",
        spec.inventory_ref,
    )
    summary["runtime_entrypoint"] = "torch_npu.npu_attention_update"

    default_variant = VARIANT.as_dict()
    every_variant = [entry.as_dict() for entry in VARIANTS]
    summary["seed_variant"] = {
        "default": default_variant,
        "variants": every_variant,
    }
    return summary


def compile_kernel(repo_root, spec, artifacts_dir):
    """Report how the baseline kernel is obtained; nothing is compiled here.

    The three parameters exist only to satisfy the adapter interface and are
    discarded immediately.

    Returns a dictionary whose ``status`` is ``"runtime_builtin"`` when the
    imported ``torch_npu`` module has an ``npu_attention_update`` attribute,
    and ``"blocked"`` (with a ``reason``) when it does not.
    """
    del repo_root, spec, artifacts_dir

    entrypoint_present = hasattr(torch_npu, "npu_attention_update")
    if entrypoint_present:
        return {
            "status": "runtime_builtin",
            "entrypoint": "torch_npu.npu_attention_update",
            "note": "Baseline execution uses the installed runtime package on the constrained sp=2, updateType=0 slice.",
        }

    return {
        "status": "blocked",
        "reason": "torch_npu does not expose npu_attention_update on this environment.",
    }


def _write_report(artifacts_dir, report):
    """Persist *report* as JSON under *artifacts_dir* and return it.

    The artifacts directory is created when it does not exist yet, so a
    missing directory cannot replace a finished (or blocked) report with an
    unrelated ``FileNotFoundError``. The ``report_path`` key is added after
    serialization: it is present in the returned dictionary but not in the
    file on disk.
    """
    report_path = Path(artifacts_dir) / "ops_transformer_attention_update_benchmark.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8")
    report["report_path"] = str(report_path)
    return report


def _blocked_report(reason):
    """Return the report body used when the baseline could not be measured."""
    return {
        "status": "blocked",
        "variants": [variant.as_dict() for variant in VARIANTS],
        "entrypoint": "torch_npu.npu_attention_update",
        "reason": reason,
    }


def _measure_variant(variant, spec):
    """Warm up, time and validate one variant; return its report entry.

    Raises:
        RuntimeError: if no timed iteration produced an output (for example
            ``spec.bench.repeat`` is 0), or if the second element of the
            output is not ``None``.
    """
    inputs = make_attention_update_inputs(variant, device_index=int(spec.device.get("id", 0)))
    for _ in range(spec.bench.warmup):
        run_torch_npu_attention_update(inputs)
    torch.npu.synchronize()

    timings_ms = []
    output = None
    for _ in range(spec.bench.repeat):
        # Synchronize on both sides of the call so the wall-clock interval
        # is taken between two synchronization points.
        torch.npu.synchronize()
        start = time.perf_counter()
        output = run_torch_npu_attention_update(inputs)
        torch.npu.synchronize()
        timings_ms.append((time.perf_counter() - start) * 1000.0)

    if output is None:
        raise RuntimeError(f"Baseline benchmark did not produce output tensors for {variant.label}.")

    out, lse_out = output
    if lse_out is not None:
        raise RuntimeError("Constrained baseline slice expects updateType=0 and no lseOut tensor.")

    out_diff = (out.float().cpu() - inputs["reference"]).abs().max().item()
    return {
        "variant": variant.as_dict(),
        "shape_summary": variant.shape_summary,
        "timings_ms": {
            "median": statistics.median(timings_ms),
            "min": min(timings_ms),
            "max": max(timings_ms),
        },
        "correctness": {
            "out_max_abs_diff": out_diff,
            "max_abs_diff": out_diff,
            # Only the absolute tolerance is applied; rtol is reported by
            # the caller but does not take part in this check.
            "passes": bool(out_diff <= spec.correctness.atol),
        },
    }


def benchmark(repo_root, spec, artifacts_dir):
    """Benchmark ``torch_npu.npu_attention_update`` over every variant.

    Args:
        repo_root: Unused; accepted for adapter-interface compatibility.
        spec: Benchmark specification. Reads ``spec.device`` (``"id"`` key),
            ``spec.bench.warmup``, ``spec.bench.repeat``,
            ``spec.correctness.atol`` and ``spec.correctness.rtol``.
        artifacts_dir: Directory that receives
            ``ops_transformer_attention_update_benchmark.json``; created if
            missing.

    Returns:
        The report dictionary, including a ``report_path`` key. ``status`` is
        ``"ok"`` when every variant was measured, otherwise ``"blocked"``
        with the failure text in ``reason``. A blocked report is also
        returned when ``VARIANTS`` is empty, instead of letting the
        aggregation fail on an empty sequence.
    """
    del repo_root
    try:
        variant_reports = [_measure_variant(variant, spec) for variant in VARIANTS]
    except Exception as exc:
        # Any failure is recorded in the artifact rather than propagated, so
        # the harness still receives a report for this adapter.
        return _write_report(artifacts_dir, _blocked_report(str(exc)))

    if not variant_reports:
        return _write_report(
            artifacts_dir,
            _blocked_report("No benchmark variants are defined for attention_update."),
        )

    max_abs_diff = max(item["correctness"]["max_abs_diff"] for item in variant_reports)
    report = {
        "status": "ok",
        "variants": [item["variant"] for item in variant_reports],
        "entrypoint": "torch_npu.npu_attention_update",
        "shape_summaries": [item["shape_summary"] for item in variant_reports],
        "timings_ms": {
            # The aggregate "median" is the largest per-variant median, not
            # a median pooled over all samples.
            "median": max(item["timings_ms"]["median"] for item in variant_reports),
            "min": min(item["timings_ms"]["min"] for item in variant_reports),
            "max": max(item["timings_ms"]["max"] for item in variant_reports),
        },
        "correctness": {
            "max_abs_diff": max_abs_diff,
            "atol": spec.correctness.atol,
            "rtol": spec.correctness.rtol,
            "passes": bool(all(item["correctness"]["passes"] for item in variant_reports)),
        },
        "reference_contract": "sp2_update_type0",
        "variant_reports": variant_reports,
    }
    return _write_report(artifacts_dir, report)
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

import json
import statistics
import time
from pathlib import Path

import torch
import torch_npu

from pto_kernels.bench.adapter_utils import describe_baseline
from pto_kernels.ops.attention.fused_infer_attention_score.runtime import (
VARIANT,
VARIANTS,
make_fused_infer_attention_score_inputs,
run_torch_npu_fused_infer_attention_score,
)


def describe(repo_root, spec):
    """Describe the ``fused_infer_attention_score`` baseline adapter.

    Takes the summary produced by ``describe_baseline`` for the
    ``attention`` / ``fused_infer_attention_score`` entry referenced by
    ``spec.inventory_ref`` and extends it with two keys:

    * ``runtime_entrypoint`` - the name of the runtime call being benchmarked.
    * ``seed_variant`` - the default variant plus every entry of ``VARIANTS``,
      each rendered through its ``as_dict()`` method.

    Returns the extended summary object.
    """
    summary = describe_baseline(
        repo_root,
        "attention",
        "fused_infer_attention_score",
        spec.inventory_ref,
    )
    summary["runtime_entrypoint"] = "torch_npu.npu_fused_infer_attention_score"

    default_variant = VARIANT.as_dict()
    every_variant = [entry.as_dict() for entry in VARIANTS]
    summary["seed_variant"] = {
        "default": default_variant,
        "variants": every_variant,
    }
    return summary


def compile_kernel(repo_root, spec, artifacts_dir):
    """Report how the baseline kernel is obtained; nothing is compiled here.

    The three parameters exist only to satisfy the adapter interface and are
    discarded immediately.

    Returns a dictionary whose ``status`` is ``"runtime_builtin"`` when the
    imported ``torch_npu`` module has an ``npu_fused_infer_attention_score``
    attribute, and ``"blocked"`` (with a ``reason``) when it does not.
    """
    del repo_root, spec, artifacts_dir

    entrypoint_present = hasattr(torch_npu, "npu_fused_infer_attention_score")
    if entrypoint_present:
        return {
            "status": "runtime_builtin",
            "entrypoint": "torch_npu.npu_fused_infer_attention_score",
            "note": "Baseline execution uses the installed runtime package on the constrained single-block no-quant slice.",
        }

    return {
        "status": "blocked",
        "reason": "torch_npu does not expose npu_fused_infer_attention_score on this environment.",
    }


def _write_report(artifacts_dir, report):
    """Persist *report* as JSON under *artifacts_dir* and return it.

    The artifacts directory is created when it does not exist yet, so a
    missing directory cannot replace a finished (or blocked) report with an
    unrelated ``FileNotFoundError``. The ``report_path`` key is added after
    serialization: it is present in the returned dictionary but not in the
    file on disk.
    """
    report_path = Path(artifacts_dir) / "ops_transformer_fused_infer_attention_score_benchmark.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8")
    report["report_path"] = str(report_path)
    return report


def _blocked_report(reason):
    """Return the report body used when the baseline could not be measured."""
    return {
        "status": "blocked",
        "variants": [variant.as_dict() for variant in VARIANTS],
        "entrypoint": "torch_npu.npu_fused_infer_attention_score",
        "reason": reason,
    }


def _measure_variant(variant, spec):
    """Warm up, time and validate one variant; return its report entry.

    Raises:
        RuntimeError: if no timed iteration produced an output (for example
            ``spec.bench.repeat`` is 0).
    """
    inputs = make_fused_infer_attention_score_inputs(variant, device_index=int(spec.device.get("id", 0)))
    for _ in range(spec.bench.warmup):
        run_torch_npu_fused_infer_attention_score(inputs)
    torch.npu.synchronize()

    timings_ms = []
    output = None
    for _ in range(spec.bench.repeat):
        # Synchronize on both sides of the call so the wall-clock interval
        # is taken between two synchronization points.
        torch.npu.synchronize()
        start = time.perf_counter()
        output = run_torch_npu_fused_infer_attention_score(inputs)
        torch.npu.synchronize()
        timings_ms.append((time.perf_counter() - start) * 1000.0)

    if output is None:
        raise RuntimeError(f"Baseline benchmark did not produce output tensors for {variant.label}.")

    # Only the first element is compared with the reference; for the second
    # one just the shape is recorded.
    out, aux = output
    out_diff = (out.float().cpu() - inputs["reference"]).abs().max().item()
    return {
        "variant": variant.as_dict(),
        "shape_summary": variant.shape_summary,
        "timings_ms": {
            "median": statistics.median(timings_ms),
            "min": min(timings_ms),
            "max": max(timings_ms),
        },
        "correctness": {
            "out_max_abs_diff": out_diff,
            "max_abs_diff": out_diff,
            # Only the absolute tolerance is applied; rtol is reported by
            # the caller but does not take part in this check.
            "passes": bool(out_diff <= spec.correctness.atol),
        },
        "aux_shape": None if aux is None else tuple(aux.shape),
    }


def benchmark(repo_root, spec, artifacts_dir):
    """Benchmark ``torch_npu.npu_fused_infer_attention_score`` over every variant.

    Args:
        repo_root: Unused; accepted for adapter-interface compatibility.
        spec: Benchmark specification. Reads ``spec.device`` (``"id"`` key),
            ``spec.bench.warmup``, ``spec.bench.repeat``,
            ``spec.correctness.atol`` and ``spec.correctness.rtol``.
        artifacts_dir: Directory that receives
            ``ops_transformer_fused_infer_attention_score_benchmark.json``;
            created if missing.

    Returns:
        The report dictionary, including a ``report_path`` key. ``status`` is
        ``"ok"`` when every variant was measured, otherwise ``"blocked"``
        with the failure text in ``reason``. A blocked report is also
        returned when ``VARIANTS`` is empty, instead of letting the
        aggregation fail on an empty sequence.
    """
    del repo_root
    try:
        variant_reports = [_measure_variant(variant, spec) for variant in VARIANTS]
    except Exception as exc:
        # Any failure is recorded in the artifact rather than propagated, so
        # the harness still receives a report for this adapter.
        return _write_report(artifacts_dir, _blocked_report(str(exc)))

    if not variant_reports:
        return _write_report(
            artifacts_dir,
            _blocked_report("No benchmark variants are defined for fused_infer_attention_score."),
        )

    max_abs_diff = max(item["correctness"]["max_abs_diff"] for item in variant_reports)
    report = {
        "status": "ok",
        "variants": [item["variant"] for item in variant_reports],
        "entrypoint": "torch_npu.npu_fused_infer_attention_score",
        "shape_summaries": [item["shape_summary"] for item in variant_reports],
        "timings_ms": {
            # The aggregate "median" is the largest per-variant median, not
            # a median pooled over all samples.
            "median": max(item["timings_ms"]["median"] for item in variant_reports),
            "min": min(item["timings_ms"]["min"] for item in variant_reports),
            "max": max(item["timings_ms"]["max"] for item in variant_reports),
        },
        "correctness": {
            "max_abs_diff": max_abs_diff,
            "atol": spec.correctness.atol,
            "rtol": spec.correctness.rtol,
            "passes": bool(all(item["correctness"]["passes"] for item in variant_reports)),
        },
        "reference_contract": "single_block_bnsd_no_quant",
        "variant_reports": variant_reports,
    }
    return _write_report(artifacts_dir, report)
115 changes: 115 additions & 0 deletions bench/adapters/ops_transformer/attention/incre_flash_attention.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from __future__ import annotations

import json
import math
import statistics
import time
from pathlib import Path

import torch
import torch_npu

from pto_kernels.bench.adapter_utils import describe_baseline
from pto_kernels.ops.attention.incre_flash_attention.runtime import (
VARIANTS,
make_incre_flash_attention_inputs,
run_torch_npu_incre_flash_attention,
)


def describe(repo_root, spec):
    """Describe the ``incre_flash_attention`` baseline adapter.

    Takes the summary produced by ``describe_baseline`` for the
    ``attention`` / ``incre_flash_attention`` entry referenced by
    ``spec.inventory_ref`` and extends it with two keys:

    * ``runtime_entrypoint`` - the name of the runtime call being benchmarked.
    * ``seed_variant`` - the first entry of ``VARIANTS`` as the default,
      plus every entry of ``VARIANTS``, each rendered through ``as_dict()``.

    Returns the extended summary object.
    """
    summary = describe_baseline(
        repo_root,
        "attention",
        "incre_flash_attention",
        spec.inventory_ref,
    )
    summary["runtime_entrypoint"] = "torch_npu.npu_incre_flash_attention"

    first_variant = VARIANTS[0].as_dict()
    every_variant = [entry.as_dict() for entry in VARIANTS]
    summary["seed_variant"] = {"default": first_variant, "variants": every_variant}
    return summary


def _write_report(artifacts_dir, report):
    """Persist *report* as JSON under *artifacts_dir* and return it.

    The artifacts directory is created when it does not exist yet, so a
    missing directory cannot replace a finished (or blocked) report with an
    unrelated ``FileNotFoundError``. The ``report_path`` key is added after
    serialization: it is present in the returned dictionary but not in the
    file on disk.
    """
    report_path = Path(artifacts_dir) / "ops_transformer_incre_flash_attention_benchmark.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(json.dumps(report, indent=2, sort_keys=True), encoding="utf-8")
    report["report_path"] = str(report_path)
    return report


def _blocked_report(reason):
    """Return the report body used when the baseline could not be measured."""
    return {
        "status": "blocked",
        "variants": [variant.as_dict() for variant in VARIANTS],
        "reason": reason,
        "entrypoint": "torch_npu.npu_incre_flash_attention",
    }


def _measure_variant(variant, spec):
    """Warm up, time and validate one variant; return its report entry.

    Raises:
        RuntimeError: if no timed iteration produced an output (for example
            ``spec.bench.repeat`` is 0).
    """
    inputs = make_incre_flash_attention_inputs(variant, device_index=int(spec.device.get("id", 0)))
    for _ in range(spec.bench.warmup):
        run_torch_npu_incre_flash_attention(inputs)
    torch.npu.synchronize()

    timings_ms = []
    output = None
    for _ in range(spec.bench.repeat):
        # Synchronize on both sides of the call so the wall-clock interval
        # is taken between two synchronization points.
        torch.npu.synchronize()
        start = time.perf_counter()
        output = run_torch_npu_incre_flash_attention(inputs)
        torch.npu.synchronize()
        timings_ms.append((time.perf_counter() - start) * 1000.0)

    if output is None:
        raise RuntimeError(f"Baseline benchmark did not produce output tensors for {variant.label}.")

    out_diff = (output.float().cpu() - inputs["reference"]).abs().max().item()
    return {
        "variant": variant.as_dict(),
        "shape_summary": variant.shape_summary,
        "timings_ms": {
            "median": statistics.median(timings_ms),
            "min": min(timings_ms),
            "max": max(timings_ms),
        },
        "correctness": {
            "out_max_abs_diff": out_diff,
            "max_abs_diff": out_diff,
            # Only the absolute tolerance is applied; rtol is reported by
            # the caller but does not take part in this check.
            "passes": bool(out_diff <= spec.correctness.atol),
        },
    }


def benchmark(repo_root, spec, artifacts_dir):
    """Benchmark ``torch_npu.npu_incre_flash_attention`` over every variant.

    Args:
        repo_root: Unused; accepted for adapter-interface compatibility.
        spec: Benchmark specification. Reads ``spec.device`` (``"id"`` key),
            ``spec.bench.warmup``, ``spec.bench.repeat``,
            ``spec.correctness.atol`` and ``spec.correctness.rtol``.
        artifacts_dir: Directory that receives
            ``ops_transformer_incre_flash_attention_benchmark.json``;
            created if missing.

    Returns:
        The report dictionary, including a ``report_path`` key. ``status`` is
        ``"ok"`` when every variant was measured. It is ``"blocked"`` (with a
        ``reason``) when ``torch_npu`` lacks the entrypoint, when measuring
        raised, or when ``VARIANTS`` is empty.
    """
    if not hasattr(torch_npu, "npu_incre_flash_attention"):
        return _write_report(
            artifacts_dir,
            _blocked_report("torch_npu does not expose npu_incre_flash_attention on this environment."),
        )

    try:
        variant_reports = [_measure_variant(variant, spec) for variant in VARIANTS]
    except Exception as exc:
        # Any failure is recorded in the artifact rather than propagated, so
        # the harness still receives a report for this adapter.
        return _write_report(artifacts_dir, _blocked_report(f"Baseline execution failed: {exc}"))

    if not variant_reports:
        return _write_report(
            artifacts_dir,
            _blocked_report("No benchmark variants are defined for incre_flash_attention."),
        )

    max_abs_diff = max(item["correctness"]["max_abs_diff"] for item in variant_reports)
    report = {
        "status": "ok",
        "variants": [item["variant"] for item in variant_reports],
        "shape_summaries": [item["shape_summary"] for item in variant_reports],
        "timings_ms": {
            # The aggregate "median" is the largest per-variant median, not
            # a median pooled over all samples.
            "median": max(item["timings_ms"]["median"] for item in variant_reports),
            "min": min(item["timings_ms"]["min"] for item in variant_reports),
            "max": max(item["timings_ms"]["max"] for item in variant_reports),
        },
        "correctness": {
            "max_abs_diff": max_abs_diff,
            "atol": spec.correctness.atol,
            "rtol": spec.correctness.rtol,
            "passes": bool(all(item["correctness"]["passes"] for item in variant_reports)),
        },
        "entrypoint": "torch_npu.npu_incre_flash_attention",
        "reference_contract": "bnsd_decode_no_mask_no_quant",
        "variant_reports": variant_reports,
    }
    return _write_report(artifacts_dir, report)
Loading