techapi-pr-validation-comment #2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: techapi-pr-validation-comment | |
| # TechAPI data PRs can ask this repository to validate their head commit and | |
| # leave a curator-facing comment back on the PR. This keeps the PR owned by the | |
| # human contributor while TechEngineBot reports the engine verdict. | |
# Two entry points: a repository_dispatch event of type techapi-pr-validate,
# or a manual workflow_dispatch run that supplies the PR number, head SHA and
# (optionally) the PR URL as string inputs.
| on: | |
| repository_dispatch: | |
| types: [techapi-pr-validate] | |
| workflow_dispatch: | |
| inputs: | |
| pr_number: | |
| description: "TechAPI PR number to comment on" | |
| type: string | |
| required: true | |
| head_sha: | |
| description: "TechAPI commit SHA to validate" | |
| type: string | |
| required: true | |
| pr_url: | |
| description: "TechAPI PR URL" | |
| type: string | |
| required: false | |
| default: "" | |
# The workflow's own token is limited to reading repository contents.
| permissions: | |
| contents: read | |
# Runs are grouped per PR number (dispatch payload first, manual input as the
# fallback); a newer run cancels one that is still in progress for the same PR.
| concurrency: | |
| group: techapi-pr-validation-${{ github.event.client_payload.pr_number || inputs.pr_number }} | |
| cancel-in-progress: true | |
# Single job. Most values below prefer the repository_dispatch payload and
# fall back to the workflow_dispatch input (or github.actor / an empty string)
# when the payload field is absent.
| jobs: | |
| validate: | |
| runs-on: ubuntu-latest | |
| env: | |
# Token for the PR comment: TECHENGINEBOT_TOKEN is preferred, TECHAPI_TOKEN is
# the fallback. It may be empty; later steps branch on that.
| TECHAPI_COMMENT_TOKEN: ${{ secrets.TECHENGINEBOT_TOKEN || secrets.TECHAPI_TOKEN }} | |
| TECHAPI_PR_NUMBER: ${{ github.event.client_payload.pr_number || inputs.pr_number }} | |
| TECHAPI_HEAD_SHA: ${{ github.event.client_payload.head_sha || inputs.head_sha }} | |
# head_ref has no manual input, so manual runs leave it empty.
| TECHAPI_HEAD_REF: ${{ github.event.client_payload.head_ref || '' }} | |
| TECHAPI_PR_URL: ${{ github.event.client_payload.pr_url || inputs.pr_url }} | |
| REQUESTED_BY: ${{ github.event.client_payload.requested_by || github.actor }} | |
# Data directory inside the PR-head checkout (checked out under TechAPI/).
# NOTE(review): presumably read by app.validate - confirm against TechEngine.
| TECHAPI_DATA_DIR: ${{ github.workspace }}/TechAPI/data | |
| steps: | |
# Three checkouts: TechEngine at the workspace root, the TechAPI commit under
# review in TechAPI/, and TechAPI main in TechAPI-main/ as the diff baseline.
# No later step runs git, so the checkout token is never persisted into
# .git/config; this matters most for the PR-head tree, whose content is
# contributor-controlled.
- name: Checkout TechEngine
  uses: actions/checkout@v4
  with:
    persist-credentials: false
- name: Checkout TechAPI PR head
  uses: actions/checkout@v4
  with:
    repository: GetTechAPI/TechAPI
    ref: ${{ env.TECHAPI_HEAD_SHA }}
    path: TechAPI
    persist-credentials: false
- name: Checkout TechAPI main
  uses: actions/checkout@v4
  with:
    repository: GetTechAPI/TechAPI
    ref: main
    path: TechAPI-main
    persist-credentials: false
# Python 3.12 with setup-python's pip cache enabled.
| - uses: actions/setup-python@v5 | |
| with: | |
| python-version: "3.12" | |
| cache: pip | |
# Editable install of the project checked out at the workspace root.
| - name: Install TechEngine | |
| run: pip install -e . | |
# Runs both validators against the PR-head data, writes their combined output
# to validation.log, and publishes three step outputs:
#   status           - "success" only when both validators exit 0
#   app_status       - exit code of `python -m app.validate`
#   integrity_status - exit code of `python integrity_check.py ... --strict`
- name: Validate TechAPI data
  id: validate
  shell: bash
  run: |
    # Both validators must run even if the first one fails, so errexit is
    # switched off and each exit code is captured by hand.
    set +e

    # A brace group exits with the status of its last command (the validator),
    # so $? is read directly instead of being parsed back out of the log.
    # The log holds output derived from PR data and must not be trusted as a
    # source of status values, nor have lines stripped from it.
    {
      echo "## app.validate"
      python -m app.validate
    } > validation.log 2>&1
    app_status=$?

    {
      echo
      echo "## integrity_check.py --strict"
      python integrity_check.py TechAPI/data --strict
    } >> validation.log 2>&1
    integrity_status=$?

    status="success"
    if [ "$app_status" != "0" ] || [ "$integrity_status" != "0" ]; then
      status="failure"
    fi

    {
      echo "status=$status"
      echo "app_status=$app_status"
      echo "integrity_status=$integrity_status"
    } >> "$GITHUB_OUTPUT"
| - name: Build data quality summary | |
| shell: bash | |
| run: | | |
| python - <<'PY' | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import re | |
| from collections import Counter, defaultdict | |
| from pathlib import Path | |
| from typing import Any | |
| HEAD = Path("TechAPI/data") | |
| BASE = Path("TechAPI-main/data") | |
| CATEGORIES = ("brand", "soc", "smartphone", "gpu", "cpu") | |
| MAX_WARNINGS = 20 | |
def load_json(path: Path) -> dict[str, Any]:
    """Read one JSON record from *path* without ever raising on bad content.

    The file is decoded as UTF-8 with an optional byte-order mark. Because the
    files come from a pull request, they may be unreadable, mis-encoded,
    syntactically invalid, or hold something other than a JSON object. A crash
    here would abort the summary and prevent the PR comment from being built,
    so all of those cases degrade to an empty record instead.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON object, or ``{}`` when the file cannot be read or
        parsed, or when its top-level value is not an object.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8-sig"))
    except (OSError, ValueError, RecursionError):
        # ValueError covers both UnicodeDecodeError and json.JSONDecodeError;
        # RecursionError guards against pathologically nested documents.
        return {}
    return data if isinstance(data, dict) else {}
def rel_jsons(root: Path, category: str) -> dict[str, Path]:
    """Index every ``*.json`` file found beneath ``root / category``.

    Args:
        root: Data directory that contains one sub-directory per category.
        category: Name of the category sub-directory to scan recursively.

    Returns:
        A dict keyed by the file's path relative to *root*, written with
        forward slashes, mapping to the file's ``Path``. Entries are inserted
        in sorted path order. The dict is empty when the category directory
        does not exist.
    """
    category_dir = root / category
    found: dict[str, Path] = {}
    if category_dir.exists():
        for json_path in sorted(category_dir.rglob("*.json")):
            # Normalise separators so keys compare equal across platforms.
            key = str(json_path.relative_to(root)).replace("\\", "/")
            found[key] = json_path
    return found
def digest(path: Path) -> str:
    """Return the SHA-256 hex digest of the raw bytes stored at *path*."""
    hasher = hashlib.sha256()
    hasher.update(path.read_bytes())
    return hasher.hexdigest()
def verified_value(record: dict[str, Any]) -> bool | None:
    """Return the record's ``verified`` flag only when it is a real boolean.

    Args:
        record: Parsed JSON record.

    Returns:
        ``True`` or ``False`` when ``record["verified"]`` is a ``bool``;
        ``None`` when the key is missing or holds any other type.
    """
    flag = record.get("verified")
    if isinstance(flag, bool):
        return flag
    return None
def has_kaggle_source(record: dict[str, Any]) -> bool:
    """Report whether any of the record's source URLs points at kaggle.com.

    The record comes from pull-request data, so ``source_urls`` is not
    guaranteed to be a list: a JSON ``null`` or a number used to raise
    ``TypeError`` during iteration. Anything that is not a list (or tuple) is
    now treated as "no sources".

    Args:
        record: Parsed JSON record.

    Returns:
        ``True`` when at least one string entry of ``source_urls`` contains
        ``kaggle.com`` (case-insensitive); ``False`` otherwise.
    """
    urls = record.get("source_urls")
    if not isinstance(urls, (list, tuple)):
        return False
    return any(
        isinstance(url, str) and "kaggle.com" in url.lower()
        for url in urls
    )
def name_warnings(category: str, rel: str, record: dict[str, Any]) -> list[str]:
    """Collect heuristic warnings about a record's ``name`` field.

    Checks, in order: surrounding whitespace, runs of two spaces, the Unicode
    replacement character, unbalanced parentheses, an immediately repeated
    word (longer than one character), and placeholder tokens such as
    ``unknown`` or ``tbd``.

    Args:
        category: Category label used as the message prefix.
        rel: Relative path of the record, used as the second prefix part.
        record: Parsed JSON record.

    Returns:
        One ``"<category>: <rel>: <warning>"`` string per triggered check, or
        an empty list when ``name`` is missing or not a string.
    """
    name = record.get("name")
    if not isinstance(name, str):
        return []

    warnings: list[str] = []
    if name != name.strip():
        warnings.append("leading/trailing whitespace in name")
    # Two consecutive spaces: testing for a single space would flag every
    # ordinary multi-word name and contradict the message below.
    if "  " in name:
        warnings.append("double spaces in name")
    if "\ufffd" in name:
        warnings.append("replacement character in name")
    if name.count("(") != name.count(")"):
        warnings.append("unbalanced parentheses in name")
    words = re.findall(r"[A-Za-z0-9]+", name.lower())
    # Single-character repeats are ignored by the len(a) > 1 condition.
    if any(a == b and len(a) > 1 for a, b in zip(words, words[1:])):
        warnings.append("repeated adjacent word in name")
    if re.search(r"\b(unknown|unk|n/a|tbd|null)\b", name, re.I):
        warnings.append("placeholder-like token in name")
    return [f"{category}: {rel}: {warning}" for warning in warnings]
def value_warnings(category: str, rel: str, record: dict[str, Any]) -> list[str]:
    """Collect heuristic warnings about implausible numeric or placeholder values.

    CPU records are checked for fewer threads than cores, a boost clock below
    the base clock (GHz fields) and a placeholder-like architecture string.
    GPU records are checked for a boost clock below the base clock (MHz
    fields). Other categories produce no warnings.

    Args:
        category: Category label; selects the checks and prefixes each message.
        rel: Relative path of the record, used as the second prefix part.
        record: Parsed JSON record.

    Returns:
        One ``"<category>: <rel>: <warning>"`` string per triggered check.
    """

    def boost_below_base(base_key: str, boost_key: str) -> bool:
        # Only meaningful when both clocks are numeric and strictly positive.
        low = record.get(base_key)
        high = record.get(boost_key)
        if not isinstance(low, (int, float)) or not isinstance(high, (int, float)):
            return False
        return high > 0 and low > 0 and high < low

    found: list[str] = []
    if category == "cpu":
        cores = record.get("cores")
        threads = record.get("threads")
        if isinstance(cores, int) and isinstance(threads, int) and threads < cores:
            found.append("threads < cores")
        if boost_below_base("base_clock_ghz", "boost_clock_ghz"):
            found.append("boost clock below base clock")
        # NOTE(review): the architecture check is treated as CPU-only,
        # following its position between the CPU and GPU checks - confirm.
        arch = record.get("architecture")
        if isinstance(arch, str) and re.search(r"\b(unknown|n/a|tbd|null)\b", arch, re.I):
            found.append("placeholder-like architecture")
    elif category == "gpu":
        if boost_below_base("base_clock_mhz", "boost_clock_mhz"):
            found.append("boost clock below base clock")
    return [f"{category}: {rel}: {message}" for message in found]
# Builds quality-summary.md in three sections: verification totals for the PR
# head, the file-level delta against the base checkout, and heuristic warnings
# for the records the PR adds.

# --- Section 1: per-category verification totals for the PR head -----------
lines: list[str] = [
    "## Data summary",
    "",
    "| Category | Total | Verified | Unverified | Verified % |",
    "| --- | ---: | ---: | ---: | ---: |",
]
total_all = verified_all = unverified_all = 0
for category in CATEGORIES:
    paths = rel_jsons(HEAD, category)
    verified = unverified = 0
    for path in paths.values():
        value = verified_value(load_json(path))
        # Records without a boolean `verified` flag count toward the total
        # only, so verified + unverified may be smaller than total.
        if value is True:
            verified += 1
        elif value is False:
            unverified += 1
    total = len(paths)
    pct = f"{(verified / total * 100):.1f}%" if total else "0.0%"
    total_all += total
    verified_all += verified
    unverified_all += unverified
    lines.append(f"| {category} | {total} | {verified} | {unverified} | {pct} |")
pct_all = f"{(verified_all / total_all * 100):.1f}%" if total_all else "0.0%"
lines.append(f"| **all** | **{total_all}** | **{verified_all}** | **{unverified_all}** | **{pct_all}** |")
lines.append("")

# --- Section 2: added / modified / deleted files relative to the base ------
lines.append("## PR data delta")
lines.append("")
lines.append("| Category | Added | Modified | Deleted | Added verified | Added unverified | Added Kaggle-sourced |")
lines.append("| --- | ---: | ---: | ---: | ---: | ---: | ---: |")
# Each added record is parsed once here and reused by the heuristic section.
all_added: list[tuple[str, str, dict[str, Any]]] = []
for category in CATEGORIES:
    head = rel_jsons(HEAD, category)
    base = rel_jsons(BASE, category)
    added_keys = sorted(head.keys() - base.keys())
    deleted_count = len(base.keys() - head.keys())
    # A file present on both sides is "modified" when its bytes differ.
    modified_count = sum(
        1 for key in head.keys() & base.keys() if digest(head[key]) != digest(base[key])
    )
    added_verified = added_unverified = added_kaggle = 0
    for key in added_keys:
        record = load_json(head[key])
        all_added.append((category, key, record))
        flag = verified_value(record)
        if flag is True:
            added_verified += 1
        elif flag is False:
            added_unverified += 1
        if has_kaggle_source(record):
            added_kaggle += 1
    lines.append(
        f"| {category} | {len(added_keys)} | {modified_count} | {deleted_count} | "
        f"{added_verified} | {added_unverified} | {added_kaggle} |"
    )
lines.append("")

# --- Section 3: heuristics over the added records --------------------------
lines.append("## Heuristic review")
lines.append("")
warnings: list[str] = []
manufacturer_counter: Counter[str] = Counter()
source_counter: Counter[str] = Counter()
for category, rel, record in all_added:
    manufacturer = record.get("manufacturer") or record.get("brand")
    if isinstance(manufacturer, str):
        manufacturer_counter[manufacturer] += 1
    source_counter["kaggle" if has_kaggle_source(record) else "other"] += 1
    warnings.extend(name_warnings(category, rel, record))
    warnings.extend(value_warnings(category, rel, record))
if manufacturer_counter:
    top = ", ".join(f"{name}: {count}" for name, count in manufacturer_counter.most_common(8))
    lines.append(f"- Added records by manufacturer/brand: {top}")
if source_counter:
    top = ", ".join(f"{name}: {count}" for name, count in source_counter.most_common())
    lines.append(f"- Added records by source class: {top}")
if warnings:
    lines.append(f"- Heuristic warnings: {len(warnings)} total; showing first {min(MAX_WARNINGS, len(warnings))}.")
    lines.append("")
    # Two-space indent so Markdown nests these under the bullet above; a
    # one-space indent renders them as sibling items.
    lines.extend(f"  - {warning}" for warning in warnings[:MAX_WARNINGS])
else:
    lines.append("- Heuristic warnings: none found.")
Path("quality-summary.md").write_text("\n".join(lines) + "\n", encoding="utf-8")
| PY | |
# Assembles comment.md: a hidden marker (used later to find and update the
# bot's previous comment), the run metadata, a PASS/FAIL table, the data
# quality summary and the tail of the validation log.
- name: Build PR comment
  shell: bash
  env:
    # Step outputs are handed over as environment variables instead of being
    # expanded into the script text, so their content can never be parsed as
    # shell syntax.
    VALIDATE_STATUS: ${{ steps.validate.outputs.status }}
    APP_STATUS: ${{ steps.validate.outputs.app_status }}
    INTEGRITY_STATUS: ${{ steps.validate.outputs.integrity_status }}
  run: |
    short_sha="${TECHAPI_HEAD_SHA:0:7}"
    result="PASS"
    if [ "${VALIDATE_STATUS}" != "success" ]; then
      result="FAIL"
    fi
    {
      echo "<!-- techengine-pr-validation -->"
      echo "## TechEngine validation: ${result}"
      echo
      echo "- PR: #${TECHAPI_PR_NUMBER}"
      echo "- Ref: \`${TECHAPI_HEAD_REF:-detached}\`"
      echo "- Commit: \`${short_sha}\`"
      echo "- Requested by: @${REQUESTED_BY}"
      echo "- Run: ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
      echo
      echo "| Check | Result |"
      echo "| --- | --- |"
      echo "| \`python -m app.validate\` | $([ "${APP_STATUS}" = "0" ] && echo PASS || echo FAIL) |"
      echo "| \`python integrity_check.py TechAPI/data --strict\` | $([ "${INTEGRITY_STATUS}" = "0" ] && echo PASS || echo FAIL) |"
      echo
      cat quality-summary.md
      echo
      echo "<details><summary>Validation log</summary>"
      echo
      echo '```text'
      # Only the last 6000 bytes of the log are included in the comment.
      tail -c 6000 validation.log
      echo '```'
      echo
      echo "</details>"
    } > comment.md
# Posts comment.md to the TechAPI PR. If a comment carrying the hidden marker
# already exists, the most recent one is updated in place; otherwise a new
# comment is created. Skipped entirely when no comment token is configured.
- name: Comment on TechAPI PR
  if: env.TECHAPI_COMMENT_TOKEN != ''
  env:
    GH_TOKEN: ${{ env.TECHAPI_COMMENT_TOKEN }}
  shell: bash
  run: |
    set -euo pipefail
    # The PR number comes from the dispatch payload or a manual input and is
    # spliced into API paths below, so anything but a plain integer is
    # rejected up front. The value itself is deliberately not echoed.
    if ! [[ "${TECHAPI_PR_NUMBER}" =~ ^[0-9]+$ ]]; then
      echo "::error::TECHAPI_PR_NUMBER is missing or not a plain integer; refusing to call the API."
      exit 1
    fi
    marker="<!-- techengine-pr-validation -->"
    comment_id="$(gh api "repos/GetTechAPI/TechAPI/issues/${TECHAPI_PR_NUMBER}/comments" --paginate \
      --jq ".[] | select(.body | contains(\"${marker}\")) | .id" | tail -n 1)"
    # --rawfile loads the Markdown verbatim so jq handles all JSON escaping.
    jq -n --rawfile body comment.md '{body: $body}' > comment.json
    if [ -n "$comment_id" ]; then
      gh api "repos/GetTechAPI/TechAPI/issues/comments/${comment_id}" \
        --method PATCH \
        --input comment.json
    else
      gh api "repos/GetTechAPI/TechAPI/issues/${TECHAPI_PR_NUMBER}/comments" \
        --method POST \
        --input comment.json
    fi
# Counterpart of the comment step: when no token is available, surface a
# workflow warning rather than failing.
| - name: Warn when comment token is unset | |
| if: env.TECHAPI_COMMENT_TOKEN == '' | |
| run: echo "::warning::TECHENGINEBOT_TOKEN/TECHAPI_TOKEN is not configured; validation ran but no PR comment was posted." | |
# Last step: turns a non-success validation status into a failed workflow run,
# placed after the comment steps so they are attempted first.
| - name: Fail on validation errors | |
| if: steps.validate.outputs.status != 'success' | |
| run: exit 1 | |