Skip to content

techapi-pr-validation-comment #2

techapi-pr-validation-comment

techapi-pr-validation-comment #2

name: techapi-pr-validation-comment

# TechAPI data PRs can ask this repository to validate their head commit and
# leave a curator-facing comment back on the PR. This keeps the PR owned by the
# human contributor while TechEngineBot reports the engine verdict.
on:
  # Cross-repository trigger; its fields are read further down through
  # github.event.client_payload.
  repository_dispatch:
    types: [techapi-pr-validate]
  # Manual trigger; the inputs are the fallback for the payload fields of the
  # same name.
  workflow_dispatch:
    inputs:
      pr_number:
        description: "TechAPI PR number to comment on"
        type: string
        required: true
      head_sha:
        description: "TechAPI commit SHA to validate"
        type: string
        required: true
      pr_url:
        description: "TechAPI PR URL"
        type: string
        required: false
        default: ""

# The workflow token only reads this repository; the PR comment is posted
# with a separate token taken from secrets.
permissions:
  contents: read

# One group per PR number: a newer request cancels the run still in progress.
concurrency:
  group: techapi-pr-validation-${{ github.event.client_payload.pr_number || inputs.pr_number }}
  cancel-in-progress: true
jobs:
  validate:
    runs-on: ubuntu-latest
    env:
      # Token used to comment on the TechAPI PR. It is empty when neither
      # secret is configured, which the comment/warn steps test for.
      TECHAPI_COMMENT_TOKEN: ${{ secrets.TECHENGINEBOT_TOKEN || secrets.TECHAPI_TOKEN }}
      # Request fields: the repository_dispatch payload wins, the
      # workflow_dispatch input (or a fixed default) is the fallback.
      TECHAPI_PR_NUMBER: ${{ github.event.client_payload.pr_number || inputs.pr_number }}
      TECHAPI_HEAD_SHA: ${{ github.event.client_payload.head_sha || inputs.head_sha }}
      TECHAPI_HEAD_REF: ${{ github.event.client_payload.head_ref || '' }}
      TECHAPI_PR_URL: ${{ github.event.client_payload.pr_url || inputs.pr_url }}
      REQUESTED_BY: ${{ github.event.client_payload.requested_by || github.actor }}
      # Data directory inside the PR-head checkout (checked out to ./TechAPI).
      TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data
    steps:
# None of the later steps run git, so no checkout needs to leave its token
# behind in .git/config.
- name: Checkout TechEngine
  uses: actions/checkout@v4
  with:
    persist-credentials: false
# The commit under review; contributor-supplied content lands in ./TechAPI.
- name: Checkout TechAPI PR head
  uses: actions/checkout@v4
  with:
    repository: GetTechAPI/TechAPI
    ref: ${{ env.TECHAPI_HEAD_SHA }}
    path: TechAPI
    persist-credentials: false
# Baseline used by the summary step to compute added/modified/deleted files.
- name: Checkout TechAPI main
  uses: actions/checkout@v4
  with:
    repository: GetTechAPI/TechAPI
    ref: main
    path: TechAPI-main
    persist-credentials: false
- uses: actions/setup-python@v5
  with:
    # Quoted so the version stays a string.
    python-version: "3.12"
    cache: pip
- name: Install TechEngine
  run: pip install -e .
# Runs both validators, collects their combined output in validation.log and
# publishes three step outputs: status (success|failure), app_status and
# integrity_status (the raw exit codes).
- name: Validate TechAPI data
  id: validate
  shell: bash
  run: |
    # A failing validator must not abort the step: both checks always run and
    # the job is failed explicitly by the last step of the workflow.
    set +e
    {
      echo "## app.validate"
      python -m app.validate
    } > validation.log 2>&1
    # Exit status of a group is that of its last command, i.e. the validator.
    # Reading it directly keeps bookkeeping lines out of the log, so genuine
    # validator output never has to be filtered back out.
    app_status=$?
    {
      echo
      echo "## integrity_check.py --strict"
      python integrity_check.py TechAPI/data --strict
    } >> validation.log 2>&1
    integrity_status=$?
    status="success"
    if [ "$app_status" != "0" ] || [ "$integrity_status" != "0" ]; then
      status="failure"
    fi
    {
      echo "status=$status"
      echo "app_status=$app_status"
      echo "integrity_status=$integrity_status"
    } >> "$GITHUB_OUTPUT"
# Summarizes the checked-out TechAPI data as Markdown in quality-summary.md.
# The script is fed to Python on stdin through a quoted heredoc, so the shell
# performs no expansion inside it.
- name: Build data quality summary
shell: bash
run: |
python - <<'PY'
from __future__ import annotations
import hashlib
import json
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any
# Data directory of the PR-head checkout (./TechAPI).
HEAD = Path("TechAPI/data")
# Data directory of the main-branch checkout (./TechAPI-main).
BASE = Path("TechAPI-main/data")
# Sub-directories of the data directory that are scanned for *.json records.
CATEGORIES = ("brand", "soc", "smartphone", "gpu", "cpu")
# Upper bound on the number of heuristic warnings listed in the summary.
MAX_WARNINGS = 20
def load_json(path: Path) -> dict[str, Any]:
    """Parse and return the JSON document stored at ``path``.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    dropped before parsing. Decoding and JSON errors propagate to the caller.
    """
    with path.open(encoding="utf-8-sig") as handle:
        return json.load(handle)
def rel_jsons(root: Path, category: str) -> dict[str, Path]:
    """Index the ``*.json`` files found recursively under ``root/category``.

    Keys are the file paths relative to ``root`` with forward slashes; values
    are the paths themselves. Entries are inserted in sorted path order. A
    category directory that does not exist yields an empty mapping.
    """
    category_dir = root / category
    found: dict[str, Path] = {}
    if category_dir.exists():
        for json_path in sorted(category_dir.rglob("*.json")):
            key = str(json_path.relative_to(root)).replace("\\", "/")
            found[key] = json_path
    return found
def digest(path: Path) -> str:
    """Return the SHA-256 hex digest of the raw bytes of the file at ``path``."""
    hasher = hashlib.sha256()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()
def verified_value(record: dict[str, Any]) -> bool | None:
    """Return the record's ``verified`` flag.

    Only a real boolean is passed through; a missing key or a value of any
    other type (including 0/1 or strings) gives ``None``.
    """
    flag = record.get("verified")
    if isinstance(flag, bool):
        return flag
    return None
def has_kaggle_source(record: dict[str, Any]) -> bool:
    """Tell whether any entry of ``record["source_urls"]`` points at Kaggle.

    An entry matches when it is a string containing ``kaggle.com``
    (case-insensitive). A missing ``source_urls`` key, or a value that is not
    a list (for example JSON ``null`` or a number), counts as "no Kaggle
    source" instead of raising while the report is being built.
    """
    urls = record.get("source_urls")
    if not isinstance(urls, (list, tuple)):
        return False
    return any(isinstance(url, str) and "kaggle.com" in url.lower() for url in urls)
def name_warnings(category: str, rel: str, record: dict[str, Any]) -> list[str]:
    """Return heuristic warnings about the record's ``name`` field.

    Args:
        category: Category label used as the first part of each message.
        rel: Relative file path used as the second part of each message.
        record: Parsed record; only its ``name`` key is inspected.

    Returns:
        Messages of the form ``"<category>: <rel>: <warning>"`` in the order
        the checks run. The list is empty when ``name`` is missing or is not
        a string.
    """
    name = record.get("name")
    if not isinstance(name, str):
        return []
    found: list[str] = []
    if name != name.strip():
        found.append("leading/trailing whitespace in name")
    # Two consecutive spaces. A single-space test would flag every ordinary
    # multi-word name.
    if "  " in name:
        found.append("double spaces in name")
    # U+FFFD is what a lossy decode leaves behind.
    if "\ufffd" in name:
        found.append("replacement character in name")
    if name.count("(") != name.count(")"):
        found.append("unbalanced parentheses in name")
    words = re.findall(r"[A-Za-z0-9]+", name.lower())
    # Single-character repeats are ignored on purpose (len(a) > 1).
    if any(a == b and len(a) > 1 for a, b in zip(words, words[1:])):
        found.append("repeated adjacent word in name")
    if re.search(r"\b(unknown|unk|n/a|tbd|null)\b", name, re.I):
        found.append("placeholder-like token in name")
    return [f"{category}: {rel}: {warning}" for warning in found]
def value_warnings(category: str, rel: str, record: dict[str, Any]) -> list[str]:
    """Return heuristic warnings about numeric and architecture fields.

    Only the ``cpu`` and ``gpu`` categories are checked:

    * cpu: ``threads`` below ``cores``, ``boost_clock_ghz`` below
      ``base_clock_ghz``, and a placeholder-like ``architecture`` string.
    * gpu: ``boost_clock_mhz`` below ``base_clock_mhz``.

    Clock comparisons apply only when both values are numbers greater than 0.

    Returns:
        Messages of the form ``"<category>: <rel>: <warning>"``.
    """

    def boost_below_base(base_key: str, boost_key: str) -> bool:
        # Both clocks must be positive numbers for the comparison to count.
        base_clock = record.get(base_key)
        boost_clock = record.get(boost_key)
        if not isinstance(base_clock, (int, float)):
            return False
        if not isinstance(boost_clock, (int, float)):
            return False
        return boost_clock > 0 and base_clock > 0 and boost_clock < base_clock

    found: list[str] = []
    if category == "cpu":
        core_count = record.get("cores")
        thread_count = record.get("threads")
        both_ints = isinstance(core_count, int) and isinstance(thread_count, int)
        if both_ints and thread_count < core_count:
            found.append("threads < cores")
        if boost_below_base("base_clock_ghz", "boost_clock_ghz"):
            found.append("boost clock below base clock")
        architecture = record.get("architecture")
        if isinstance(architecture, str) and re.search(
            r"\b(unknown|n/a|tbd|null)\b", architecture, re.I
        ):
            found.append("placeholder-like architecture")
    elif category == "gpu":
        if boost_below_base("base_clock_mhz", "boost_clock_mhz"):
            found.append("boost clock below base clock")
    return [f"{category}: {rel}: {message}" for message in found]
# Markdown report, built top to bottom and written to quality-summary.md.
lines: list[str] = [
    "## Data summary",
    "",
    "| Category | Total | Verified | Unverified | Verified % |",
    "| --- | ---: | ---: | ---: | ---: |",
]

# Section 1 - totals over the PR head. A record counts as verified or
# unverified only when its "verified" field is a real boolean; any other
# record still counts toward the category total.
total_all = verified_all = unverified_all = 0
for category in CATEGORIES:
    paths = rel_jsons(HEAD, category)
    verified = unverified = 0
    for path in paths.values():
        value = verified_value(load_json(path))
        if value is True:
            verified += 1
        elif value is False:
            unverified += 1
    total = len(paths)
    pct = f"{(verified / total * 100):.1f}%" if total else "0.0%"
    total_all += total
    verified_all += verified
    unverified_all += unverified
    lines.append(f"| {category} | {total} | {verified} | {unverified} | {pct} |")
pct_all = f"{(verified_all / total_all * 100):.1f}%" if total_all else "0.0%"
lines.append(f"| **all** | **{total_all}** | **{verified_all}** | **{unverified_all}** | **{pct_all}** |")
lines.append("")

# Section 2 - file-level delta between the PR head and the main checkout.
# Files are matched by relative path; "modified" means same path, different
# SHA-256 of the raw bytes.
lines.extend(
    [
        "## PR data delta",
        "",
        "| Category | Added | Modified | Deleted | Added verified | Added unverified | Added Kaggle-sourced |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
)
# (category, relative path, parsed record) for every added file. The parsed
# record is kept so the heuristic pass below does not read the file again.
all_added: list[tuple[str, str, dict[str, Any]]] = []
for category in CATEGORIES:
    head = rel_jsons(HEAD, category)
    base = rel_jsons(BASE, category)
    added_keys = sorted(set(head) - set(base))
    deleted_keys = sorted(set(base) - set(head))
    modified_keys = sorted(
        key for key in set(head) & set(base) if digest(head[key]) != digest(base[key])
    )
    added_verified = added_unverified = added_kaggle = 0
    for key in added_keys:
        record = load_json(head[key])
        all_added.append((category, key, record))
        flag = verified_value(record)
        if flag is True:
            added_verified += 1
        elif flag is False:
            added_unverified += 1
        if has_kaggle_source(record):
            added_kaggle += 1
    lines.append(
        f"| {category} | {len(added_keys)} | {len(modified_keys)} | {len(deleted_keys)} | "
        f"{added_verified} | {added_unverified} | {added_kaggle} |"
    )
lines.append("")

# Section 3 - heuristics over the added records only.
lines.append("## Heuristic review")
lines.append("")
warnings: list[str] = []
manufacturer_counter: Counter[str] = Counter()
source_counter: Counter[str] = Counter()
for category, rel, record in all_added:
    manufacturer = record.get("manufacturer") or record.get("brand")
    if isinstance(manufacturer, str):
        manufacturer_counter[manufacturer] += 1
    source_counter["kaggle" if has_kaggle_source(record) else "other"] += 1
    warnings.extend(name_warnings(category, rel, record))
    warnings.extend(value_warnings(category, rel, record))
if manufacturer_counter:
    top = ", ".join(f"{name}: {count}" for name, count in manufacturer_counter.most_common(8))
    lines.append(f"- Added records by manufacturer/brand: {top}")
if source_counter:
    top = ", ".join(f"{name}: {count}" for name, count in source_counter.most_common())
    lines.append(f"- Added records by source class: {top}")
if warnings:
    lines.append(f"- Heuristic warnings: {len(warnings)} total; showing first {min(MAX_WARNINGS, len(warnings))}.")
    lines.append("")
    for warning in warnings[:MAX_WARNINGS]:
        # Two spaces of indent nest the item under the bullet above; with
        # fewer, Markdown renders it as a sibling of that bullet.
        lines.append(f"  - {warning}")
else:
    lines.append("- Heuristic warnings: none found.")
Path("quality-summary.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
PY
# Assembles comment.md: marker line, verdict header, request details, the
# per-check table, quality-summary.md and the tail of validation.log.
- name: Build PR comment
  shell: bash
  env:
    # Step outputs reach the script as environment variables instead of being
    # expanded into the script text by the expression engine.
    VALIDATE_STATUS: ${{ steps.validate.outputs.status }}
    APP_STATUS: ${{ steps.validate.outputs.app_status }}
    INTEGRITY_STATUS: ${{ steps.validate.outputs.integrity_status }}
  run: |
    # Map an exit code to the label shown in the table.
    verdict() { [ "$1" = "0" ] && echo PASS || echo FAIL; }
    short_sha="${TECHAPI_HEAD_SHA:0:7}"
    result="PASS"
    if [ "$VALIDATE_STATUS" != "success" ]; then
      result="FAIL"
    fi
    {
      # Hidden marker the comment step searches for to update in place.
      echo "<!-- techengine-pr-validation -->"
      echo "## TechEngine validation: ${result}"
      echo
      echo "- PR: #${TECHAPI_PR_NUMBER}"
      echo "- Ref: \`${TECHAPI_HEAD_REF:-detached}\`"
      echo "- Commit: \`${short_sha}\`"
      echo "- Requested by: @${REQUESTED_BY}"
      echo "- Run: ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
      echo
      echo "| Check | Result |"
      echo "| --- | --- |"
      echo "| \`python -m app.validate\` | $(verdict "$APP_STATUS") |"
      echo "| \`python integrity_check.py TechAPI/data --strict\` | $(verdict "$INTEGRITY_STATUS") |"
      echo
      cat quality-summary.md
      echo
      echo "<details><summary>Validation log</summary>"
      echo
      echo '```text'
      # Only the last 6000 bytes of the log go into the comment.
      tail -c 6000 validation.log
      echo '```'
      echo
      echo "</details>"
    } > comment.md
# Posts comment.md on the TechAPI PR, or updates the most recent comment that
# already carries the marker. Skipped when no comment token is configured.
- name: Comment on TechAPI PR
  if: env.TECHAPI_COMMENT_TOKEN != ''
  env:
    GH_TOKEN: ${{ env.TECHAPI_COMMENT_TOKEN }}
  shell: bash
  run: |
    set -euo pipefail
    # The PR number arrives in the dispatch payload or a manual input and is
    # spliced into the API path, so accept digits only.
    if ! [[ "$TECHAPI_PR_NUMBER" =~ ^[0-9]+$ ]]; then
      echo "::error::TECHAPI_PR_NUMBER is not a plain PR number; refusing to call the API."
      exit 1
    fi
    marker="<!-- techengine-pr-validation -->"
    # `.body // ""` keeps jq's contains() from failing on a null body.
    comment_id="$(gh api "repos/GetTechAPI/TechAPI/issues/${TECHAPI_PR_NUMBER}/comments" --paginate \
      --jq ".[] | select((.body // \"\") | contains(\"${marker}\")) | .id" | tail -n 1)"
    # Let jq do the JSON string escaping of the Markdown body.
    jq -n --rawfile body comment.md '{body: $body}' > comment.json
    if [ -n "$comment_id" ]; then
      gh api "repos/GetTechAPI/TechAPI/issues/comments/${comment_id}" \
        --method PATCH \
        --input comment.json
    else
      gh api "repos/GetTechAPI/TechAPI/issues/${TECHAPI_PR_NUMBER}/comments" \
        --method POST \
        --input comment.json
    fi
# Counterpart of the comment step: surfaces a run annotation when no token is
# available instead of failing the job.
- name: Warn when comment token is unset
  if: env.TECHAPI_COMMENT_TOKEN == ''
  run: |
    echo "::warning::TECHENGINEBOT_TOKEN/TECHAPI_TOKEN is not configured; validation ran but no PR comment was posted."
# The validate step never fails on its own; the job's result is set here,
# after the comment steps.
- name: Fail on validation errors
  if: steps.validate.outputs.status != 'success'
  run: exit 1