diff --git a/.bandit b/.bandit new file mode 100644 index 00000000..b49d6ebb --- /dev/null +++ b/.bandit @@ -0,0 +1,11 @@ +# Bandit configuration (YAML format, bandit 1.9+) +# https://bandit.readthedocs.io/en/latest/config.html +# +# NOTE: scan targets must be passed via -r on the CLI; the YAML config +# does not support a "targets" key. Edit the workflow's bandit command +# to add new directories. + +# Exclude test directories (test code often has intentional patterns +# that trigger false positives like assert, subprocess in fixtures) +exclude_dirs: + - "*/tests/*" diff --git a/.checkov.yaml b/.checkov.yaml new file mode 100644 index 00000000..0b054b7e --- /dev/null +++ b/.checkov.yaml @@ -0,0 +1,28 @@ +# Checkov configuration +# https://www.checkov.io/2.Basics/CLI%20Command%20Reference.html + +# Scan GitHub Actions workflows and Dockerfiles +framework: + - github_actions + - dockerfile + +# Skip checks that conflict with this repo's patterns. +# +# Repo-wide suppressions go here. For file-level suppressions, use inline +# comments in the source file: +# +# Dockerfile: +# # checkov:skip=CKV_DOCKER_2:healthcheck not needed for build-only image +# FROM python:3.12-slim +# +# GitHub Actions YAML: +# # checkov:skip=CKV_GHA_7:buildspec-override requires user parameters +# - uses: aws-actions/aws-codebuild-run-build@v1 +# +# Multiple skips on one line: +# # checkov:skip=CKV_DOCKER_2,CKV_DOCKER_3:reason for both +skip-check: + # CKV_GHA_7: "The build output cannot be affected by user parameters other + # than the build entry point and the top-level source location" + # — conflicts with inline buildspec-override in codebuild.yml + - CKV_GHA_7 diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 00000000..3e568582 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,6 @@ +{ + "$schema": "https://json.schemastore.org/claude-code-settings.json", + "attribution": { + "pr": "By submitting this pull request, I confirm that you can use, modify, 
copy, and redistribute this contribution, under the terms of the [project license](https://github.com/awslabs/aidlc-workflows/blob/main/LICENSE)." + } +} diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..33f05fd5 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,21 @@ +# [CODEOWNERS](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners#about-code-owners) + +## Default owners for everything in the repo + +* @awslabs/aidlc-admins @awslabs/aidlc-maintainers + +.github/ @awslabs/aidlc-admins +.gitignore @awslabs/aidlc-admins +aidlc-rules/ @awslabs/aidlc-admins @awslabs/aidlc-maintainers @awslabs/aidlc-writers +assets/ @awslabs/aidlc-admins @awslabs/aidlc-maintainers @awslabs/aidlc-writers +CHANGELOG.md @awslabs/aidlc-admins +cliff.toml @awslabs/aidlc-admins +CODE_OF_CONDUCT.md @awslabs/aidlc-admins +CONTRIBUTING.md @awslabs/aidlc-admins +LICENSE @awslabs/aidlc-admins +README.md @awslabs/aidlc-admins +scripts/ @awslabs/aidlc-admins @awslabs/aidlc-maintainers + +## Secure the CODEOWNERS file + +/.github/CODEOWNERS @awslabs/aidlc-admins diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..13316eba --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,108 @@ +name: Bug Report +description: Report a bug or unexpected behavior +title: "[Bug]: " +labels: ["bug"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to report a bug. Please fill out the sections below so we can reproduce and fix the issue. + + - type: textarea + id: description + attributes: + label: Description + description: A clear and concise description of the bug. + placeholder: Describe what went wrong. + validations: + required: true + + - type: textarea + id: steps + attributes: + label: Steps to Reproduce + description: Steps to reproduce the behavior. 
+ placeholder: | + 1. Set up ... + 2. Run ... + 3. Observe ... + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected Behavior + description: What you expected to happen. + validations: + required: true + + - type: textarea + id: actual + attributes: + label: Actual Behavior + description: What actually happened. + validations: + required: true + + - type: input + id: version + attributes: + label: AI-DLC Version + description: Which version of AI-DLC are you using? Or the git commit hash if running from source. + placeholder: e.g., 0.1.5 or abc1234 + validations: + required: true + + - type: dropdown + id: phase + attributes: + label: AI-DLC Phase + description: Which workflow phase were you in when the bug occurred? + options: + - Inception (requirements, user stories, application design) + - Construction (component design, code generation, testing) + - Operations (deployment, monitoring) + - Not phase-specific + validations: + required: true + + - type: dropdown + id: platform + attributes: + label: Platform / IDE + description: Which platform or IDE are you using AI-DLC with? + multiple: true + options: + - Kiro IDE + - Kiro CLI + - Amazon Q Developer IDE Plugin + - Cursor IDE + - Cline + - Claude Code + - GitHub Copilot + - Other + validations: + required: true + + - type: input + id: model + attributes: + label: AI Model + description: Which AI model were you using? + placeholder: e.g., Claude Sonnet 4, GPT-4o, Amazon Nova + + - type: textarea + id: environment + attributes: + label: Environment + description: Any other relevant environment details (OS, IDE version, etc.). + placeholder: | + - OS: macOS 15.x + - IDE: VS Code 1.x + + - type: textarea + id: additional + attributes: + label: Additional Context + description: Add any other context, screenshots, or log output about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..0086358d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: true diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml new file mode 100644 index 00000000..beed500e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/documentation.yml @@ -0,0 +1,46 @@ +name: Documentation Problem +description: Report incorrect, unclear, or missing documentation +title: "[Docs]: " +labels: ["documentation"] +body: + - type: markdown + attributes: + value: | + Help us improve the documentation. Please describe the problem you found or what is missing. + + - type: dropdown + id: type + attributes: + label: Type of Documentation Problem + options: + - Incorrect or outdated information + - Unclear or confusing explanation + - Missing documentation + - Broken link + - Typo or formatting issue + - Other + validations: + required: true + + - type: textarea + id: location + attributes: + label: Location + description: Where is the problem? Provide a file path, URL, or section name. + placeholder: e.g., README.md, "Platform-Specific Setup" section + validations: + required: true + + - type: textarea + id: description + attributes: + label: Description + description: Describe what is wrong or missing. + validations: + required: true + + - type: textarea + id: suggestion + attributes: + label: Suggested Fix + description: If you have a suggestion for how to fix the documentation, describe it here. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..890964b0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,47 @@ +name: Feature Request +description: Suggest a small enhancement or improvement +title: "[Feature]: " +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + Suggest an enhancement or improvement. For larger, design-level proposals, please use the [RFC template](https://github.com/awslabs/aidlc-workflows/issues/new?template=rfc.yml) instead. + + - type: textarea + id: description + attributes: + label: Description + description: A clear and concise description of the feature or improvement you'd like. + validations: + required: true + + - type: textarea + id: use-case + attributes: + label: Use Case + description: What problem does this solve or what workflow does it improve? + validations: + required: true + + - type: dropdown + id: area + attributes: + label: Area + description: What area of AI-DLC does this relate to? + options: + - Inception phase rules + - Construction phase rules + - Operations phase rules + - New extension (e.g., compliance, security) + - Platform support + - Documentation + - Other + validations: + required: true + + - type: textarea + id: additional + attributes: + label: Additional Context + description: Any other context, examples, or references that help explain the request. diff --git a/.github/ISSUE_TEMPLATE/rfc.yml b/.github/ISSUE_TEMPLATE/rfc.yml new file mode 100644 index 00000000..b881ae3c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/rfc.yml @@ -0,0 +1,51 @@ +name: RFC (Request for Comments) +description: Propose a significant change or new feature for discussion +title: "[RFC]: " +labels: ["rfc"] +body: + - type: markdown + attributes: + value: | + Use this template to propose a significant change or new feature. RFCs are meant to gather feedback before implementation begins. 
+ + - type: textarea + id: summary + attributes: + label: Summary + description: A brief one-paragraph summary of the proposal. + validations: + required: true + + - type: textarea + id: motivation + attributes: + label: Motivation + description: Why should this change be made? What problem does it solve or what use case does it enable? + validations: + required: true + + - type: textarea + id: proposal + attributes: + label: Detailed Proposal + description: Describe the proposed change in detail. Include specifics on how it would work. + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives Considered + description: What other approaches did you consider and why were they not chosen? + + - type: textarea + id: drawbacks + attributes: + label: Drawbacks + description: Are there any reasons why this proposal should not be adopted? + + - type: textarea + id: additional + attributes: + label: Additional Context + description: Any other information, mockups, references, or examples that help explain the proposal. diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 00000000..5eb6caa8 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,16 @@ +# Auto-label configuration for actions/labeler +# See https://github.com/actions/labeler#match-object for syntax + +rules: +- changed-files: + - any-glob-to-any-file: 'aidlc-rules/**' + +documentation: +- changed-files: + - all-globs-to-any-file: + - '**/*.md' + - '!aidlc-rules/**' + +github: +- changed-files: + - any-glob-to-any-file: '.github/**' diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..3532b710 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,28 @@ +# Summary + +> replace with your summary... 
+ +## Changes + +> replace with a description of the changes + +## User experience + +> Please share what the user experience looks like before and after this change + +## Checklist + +If your change doesn't seem to apply, please leave them unchecked. + +* [ ] I have reviewed the [contributing guidelines](https://github.com/awslabs/aidlc-workflows/blob/main/CONTRIBUTING.md) +* [ ] I have performed a self-review of this change +* [ ] Changes have been tested +* [ ] Changes are documented + +## Test Plan + +> replace with instructions or a checklist for reviewers to verify + +## Acknowledgment + +By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of the [project license](https://github.com/awslabs/aidlc-workflows/blob/main/LICENSE). diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..18e46ddc --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,26 @@ +name: CI + +on: + pull_request: + branches: [ "main" ] + push: + branches: [ "main" ] + workflow_dispatch: + +permissions: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + markdownlint: + name: Markdown Lint + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 #v6.0.1 + - uses: DavidAnson/markdownlint-cli2-action@ce4853d43830c74c1753b39f3cf40f71c2031eb9 #v23.0.0 + with: + globs: "**/*.md" diff --git a/.github/workflows/codebuild.yml b/.github/workflows/codebuild.yml new file mode 100644 index 00000000..cb9e7503 --- /dev/null +++ b/.github/workflows/codebuild.yml @@ -0,0 +1,519 @@ +name: CodeBuild + +on: + workflow_dispatch: {} + pull_request: + branches: + - main + types: + - labeled + - opened + - ready_for_review + - reopened + - synchronize + - unlabeled + paths: + - 'aidlc-rules/**' + push: + branches: + - main + tags: + - 'v*' 
+ +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true + +env: + CODEBUILD_PROJECT_NAME: ${{ vars.CODEBUILD_PROJECT_NAME || 'codebuild-project' }} + LABEL_REMINDER_MARKER: rules-label-reminder + +permissions: {} + +jobs: + label-reminder: + if: >- + github.event_name == 'pull_request' + && !contains(github.event.pull_request.labels.*.name, 'rules') + + permissions: + pull-requests: write + + runs-on: ubuntu-latest + + steps: + - name: Warn about missing rules label + run: | + echo "::warning::This PR changes aidlc-rules/ but does not have the 'rules' label. Add the label to trigger the CodeBuild evaluation pipeline." + + - name: Comment on PR + if: github.event.pull_request.head.repo.full_name == github.repository + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + REPO: ${{ github.repository }} + MARKER: ${{ env.LABEL_REMINDER_MARKER }} + run: | + EXISTING=$(gh api "repos/$REPO/issues/$PR_NUMBER/comments" \ + --jq ".[] | select(.body | contains(\"$MARKER\")) | .id" \ + | head -1) + if [ -n "$EXISTING" ]; then + echo "Reminder comment already exists ($EXISTING) — skipping" + exit 0 + fi + BODY=" + > **Note:** This PR changes \`aidlc-rules/\` but the \`rules\` label has not been applied. + > + > A maintainer must add the **rules** label to trigger the CodeBuild evaluation pipeline. + > Once labeled, subsequent pushes will re-trigger the build automatically." + gh pr comment "$PR_NUMBER" --repo "$REPO" --body "$BODY" + + - name: Fork PR notice + if: github.event.pull_request.head.repo.full_name != github.repository + run: | + echo "::notice::Skipping PR comment — fork PRs have a read-only GITHUB_TOKEN. The warning annotation above is still visible to maintainers." 
+ + label-cleanup: + if: >- + github.event_name == 'pull_request' + && contains(github.event.pull_request.labels.*.name, 'rules') + && github.event.pull_request.head.repo.full_name == github.repository + + permissions: + issues: write + pull-requests: write + + runs-on: ubuntu-latest + + steps: + - name: Remove label reminder comment + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + REPO: ${{ github.repository }} + MARKER: ${{ env.LABEL_REMINDER_MARKER }} + run: | + COMMENT_ID=$(gh api "repos/$REPO/issues/$PR_NUMBER/comments" \ + --jq ".[] | select(.user.login == \"github-actions[bot]\" and (.body | contains(\"$MARKER\"))) | .id" \ + | head -1) + if [ -z "$COMMENT_ID" ]; then + echo "No label-reminder comment found — nothing to clean up" + exit 0 + fi + if gh api -X DELETE "repos/$REPO/issues/comments/$COMMENT_ID"; then + echo "Removed label-reminder comment ($COMMENT_ID)" + else + echo "::warning::Failed to delete label-reminder comment ($COMMENT_ID) — it may have been removed already" + fi + + build: + # Fork PRs are skipped because they cannot access the repository secrets + # or OIDC credentials needed for AWS CodeBuild. 
+ if: >- + (github.event_name != 'pull_request' + || contains(github.event.pull_request.labels.*.name, 'rules')) + && (github.event_name != 'pull_request' + || github.event.pull_request.head.repo.full_name == github.repository) + environment: codebuild + + permissions: + actions: write + contents: write + id-token: write # Required for OIDC token request to AWS STS + pull-requests: write # Required for posting trend report comments on PRs + + runs-on: ubuntu-latest + + steps: + - name: List caches + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + run: | + gh cache list -R "$REPO" --key "$CODEBUILD_PROJECT_NAME-" --order asc | cat + + - name: Check cache + id: cache-check + uses: actions/cache/restore@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + with: + path: ${{ env.CODEBUILD_PROJECT_NAME }}.zip + key: ${{ env.CODEBUILD_PROJECT_NAME }}-${{ github.ref_name }}-${{ github.sha }} + lookup-only: true + + - name: Configure AWS credentials + # env.ACT is set by the 'act' CLI tool for local testing + if: ${{ !env.ACT && steps.cache-check.outputs.cache-hit != 'true' }} + uses: aws-actions/configure-aws-credentials@8df5847569e6427dd6c4fb1cf565c83acfa8afa7 # v6.0.0 + with: + role-to-assume: ${{ secrets.AWS_CODEBUILD_ROLE_ARN }} + aws-region: ${{ vars.AWS_REGION || 'us-east-1' }} + role-duration-seconds: ${{ vars.ROLE_DURATION_SECONDS || 7200 }} + role-session-name: GitHubActions${{ github.run_id }} + mask-aws-account-id: true + retry-max-attempts: 0 + + - name: Run CodeBuild + if: steps.cache-check.outputs.cache-hit != 'true' + id: codebuild + uses: aws-actions/aws-codebuild-run-build@7e46c3fa1c1f217e26a73712796b1f78938b534b # v1.0.19 + with: + project-name: ${{ env.CODEBUILD_PROJECT_NAME }} + source-version-override: ${{ github.sha }} + buildspec-override: | + version: 0.2 + env: + variables: + GH_TOKEN: ${{ github.token }} + GH_REF_NAME: ${{ github.ref_name }} + GH_HEAD_REF: ${{ github.head_ref }} + GH_EVENT_NAME: ${{ github.event_name }} + 
phases: + install: + commands: + - mkdir -p .codebuild + - touch ./.codebuild/codebuild.out + - dnf install -y 'dnf-command(config-manager)' && dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo || echo "dnf config-manager setup failed" + - dnf install -y gh || echo "dnf install gh failed" + - curl -LsSf https://astral.sh/uv/install.sh | sh || echo "uv failed" + - export PATH=$HOME/.local/bin:$PATH + - git config --global --add safe.directory "/codebuild/output/srcDownload/src" + pre_build: + commands: + - echo "pre_build" + build: + commands: + - DEFAULT_BRANCH=$(gh repo view --json defaultBranchRef --jq '.defaultBranchRef.name' 2>/dev/null || echo "main") + - CURRENT_BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null || echo "${GH_HEAD_REF:-${GH_REF_NAME:-}}") + - CURRENT_TAG=$(git describe --tags --exact-match 2>/dev/null || echo "") + - IS_RELEASE=$([[ -n "$CURRENT_TAG" ]] && echo "true" || echo "false") + - IS_PRE_RELEASE=$([[ "$CURRENT_BRANCH" == "$DEFAULT_BRANCH" ]] && echo "true" || echo "false") + - IS_PRE_MERGE=$([[ -z "$CURRENT_TAG" && "$CURRENT_BRANCH" != "$DEFAULT_BRANCH" ]] && echo "true" || echo "false") + - echo "Branch=$CURRENT_BRANCH Tag=$CURRENT_TAG Release=$IS_RELEASE PreRelease=$IS_PRE_RELEASE PreMerge=$IS_PRE_MERGE" + - | + echo "========================================" + echo " Regression Validation" + echo "========================================" + + EVALUATOR_DIR="$CODEBUILD_SRC_DIR/scripts/aidlc-evaluator" + if [[ ! 
-d "$EVALUATOR_DIR" ]]; then + echo "ERROR: Evaluation framework not found at $EVALUATOR_DIR" + exit 1 + fi + + cd "$EVALUATOR_DIR" + EVALUATOR_DIR="$(pwd -P)" # resolve symlinks for consistent paths + RULES_REF="${CURRENT_TAG:-$CURRENT_BRANCH}" + PR_NUMBER=$(gh pr view "$CURRENT_BRANCH" --repo "${GITHUB_REPOSITORY:-awslabs/aidlc-workflows}" --json number --jq '.number' 2>/dev/null || echo "") + echo "Rules ref: $RULES_REF" + if [[ -n "$PR_NUMBER" ]]; then + echo "PR number: $PR_NUMBER (will label as pr-$PR_NUMBER in trend report)" + fi + + # Install dependencies + uv sync + ./docker/sandbox/build.sh || exit 1 + + # Unit tests (245 across 7 packages, excludes trend-reports) + echo "========================================" + echo " Unit Tests" + echo "========================================" + uv run python run.py test + + # Trend reports unit tests (138 tests, separate package) + echo "========================================" + echo " Trend Reports Unit Tests" + echo "========================================" + uv run pytest packages/trend-reports/tests/ -v + + # Full evaluation with the PR's rules + echo "========================================" + echo " Evaluation (rules-ref=$RULES_REF)" + echo "========================================" + uv run python run.py full --rules-ref "$RULES_REF" + + # Locate the evaluation run directory for trend report input + EVAL_RUN_DIR=$(ls -dt "$EVALUATOR_DIR/runs/"*/*/ 2>/dev/null | head -1) + LOCAL_BUNDLE_ARG="" + if [[ -n "$EVAL_RUN_DIR" ]]; then + echo "Evaluation run folder: $EVAL_RUN_DIR" + + # Patch run-meta.yaml with PR label so trend report classifies it as PR + if [[ -n "$PR_NUMBER" && -f "$EVAL_RUN_DIR/run-meta.yaml" ]]; then + sed -i "s|rules_ref:.*|rules_ref: pr-$PR_NUMBER|" "$EVAL_RUN_DIR/run-meta.yaml" + echo "Patched run-meta.yaml: rules_ref -> pr-$PR_NUMBER" + fi + LOCAL_BUNDLE_ARG="--local-run-dir $EVAL_RUN_DIR" + else + echo "WARNING: No evaluation run folder found -- trend report will not include current 
PR" + fi + + # Trend report across releases + current PR + echo "========================================" + echo " Trend Report" + echo "========================================" + uv run python -m trend_reports trend \ + --baseline test_cases/sci-calc/golden.yaml \ + --format all \ + --output-dir "$CODEBUILD_SRC_DIR/.codebuild/trend-runs" \ + --gate \ + $LOCAL_BUNDLE_ARG + + # Collect artifacts + mkdir -p "$CODEBUILD_SRC_DIR/.codebuild/regression-runs" + cp -r "$EVALUATOR_DIR/runs/"*/* "$CODEBUILD_SRC_DIR/.codebuild/regression-runs/" 2>/dev/null || true + cd "$CODEBUILD_SRC_DIR" + post_build: + commands: + - echo "Build completed with status $CODEBUILD_BUILD_SUCCEEDING" + - cat ./.codebuild/codebuild.out + artifacts: + files: + - '**/*' + base-directory: .codebuild + discard-paths: no + secondary-artifacts: + evaluation: + files: + - '**/contract-test-results.yaml' + - '**/evaluation-config.yaml' + - '**/qualitative-comparison.yaml' + - '**/quality-report.yaml' + - '**/report.yaml' + - '**/report.md' + - '**/report.html' + - '**/run-meta.yaml' + - '**/run-metrics.yaml' + - '**/test-results.yaml' + name: evaluation + discard-paths: no + base-directory: .codebuild/regression-runs + trend: + files: + - '**/*' + name: trend + discard-paths: no + base-directory: .codebuild/trend-runs + + - name: Build ID + if: always() && steps.cache-check.outputs.cache-hit != 'true' + run: echo "CodeBuild Build ID ${{ steps.codebuild.outputs.aws-build-id }}" + + - name: Download CodeBuild artifacts + if: steps.cache-check.outputs.cache-hit != 'true' + run: | + DOWNLOADS="${ACT_CODEBUILD_DIR:-${GITHUB_WORKSPACE}/.codebuild/downloads}" + mkdir -p "$DOWNLOADS" + PRIMARY_ARTIFACT_LOCATION=$(aws codebuild batch-get-builds \ + --ids "${{ steps.codebuild.outputs.aws-build-id }}" \ + --query 'builds[0].artifacts.location' \ + --output text) + aws s3 cp "s3://${PRIMARY_ARTIFACT_LOCATION#arn:aws:s3:::}" "$DOWNLOADS/$CODEBUILD_PROJECT_NAME.zip" + SECONDARY_ARTIFACT_LOCATIONS=$(aws codebuild 
batch-get-builds \ + --ids "${{ steps.codebuild.outputs.aws-build-id }}" \ + --query 'builds[0].secondaryArtifacts[*].[artifactIdentifier, location]' \ + --output json) + echo "$SECONDARY_ARTIFACT_LOCATIONS" | jq -r '.[] | @tsv' | while IFS=$'\t' read -r NAME LOCATION; do + echo "Downloading secondary artifact: $NAME" + aws s3 cp "s3://${LOCATION#arn:aws:s3:::}" "$DOWNLOADS/${NAME}.zip" + done + + - name: List CodeBuild artifacts + if: steps.cache-check.outputs.cache-hit != 'true' + run: | + DOWNLOADS="${ACT_CODEBUILD_DIR:-${GITHUB_WORKSPACE}/.codebuild/downloads}" + ls -alR "$DOWNLOADS" + unzip -l "$DOWNLOADS/$CODEBUILD_PROJECT_NAME.zip" + unzip -l "$DOWNLOADS/evaluation.zip" + unzip -l "$DOWNLOADS/trend.zip" + + - name: Post trend report summary on PR + if: github.event_name == 'pull_request' && steps.cache-check.outputs.cache-hit != 'true' + continue-on-error: true + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + REPO: ${{ github.repository }} + run: | + DOWNLOADS="${ACT_CODEBUILD_DIR:-${GITHUB_WORKSPACE}/.codebuild/downloads}" + STAGING="$DOWNLOADS/trend-staging" + mkdir -p "$STAGING" + + # Extract trend-report.md from trend.zip + unzip -j -o "$DOWNLOADS/trend.zip" "*/trend-report.md" -d "$STAGING" 2>/dev/null || true + + if [[ ! -f "$STAGING/trend-report.md" ]]; then + echo "WARNING: trend-report.md not found in trend.zip — skipping PR comment" + exit 0 + fi + + # Extract just the Executive Summary section (Section A) + SUMMARY=$(sed -n '/^## A\. 
Executive Summary/,/^---$/p' "$STAGING/trend-report.md" | sed '$d') + + if [[ -z "$SUMMARY" ]]; then + echo "WARNING: Could not extract executive summary — skipping PR comment" + exit 0 + fi + + MARKER="<!-- trend-report-summary -->" + BODY="${MARKER} + ${SUMMARY} + + --- + *Full trend report available in the [workflow artifacts](https://github.com/${REPO}/actions/runs/${{ github.run_id }}).*" + + # Update existing comment or create new one + EXISTING=$(gh api "repos/$REPO/issues/$PR_NUMBER/comments" \ + --jq ".[] | select(.body | contains(\"$MARKER\")) | .id" \ + | head -1) + + if [[ -n "$EXISTING" ]]; then + gh api -X PATCH "repos/$REPO/issues/comments/$EXISTING" \ + -f body="$BODY" + echo "Updated existing trend report comment ($EXISTING)" + else + gh pr comment "$PR_NUMBER" --repo "$REPO" --body "$BODY" + echo "Posted trend report comment on PR #$PR_NUMBER" + fi + + - name: Clean old report caches + if: steps.cache-check.outputs.cache-hit != 'true' + env: + GH_TOKEN: ${{ github.token }} + REPO: ${{ github.repository }} + REF_NAME: ${{ github.ref_name }} + run: | + gh cache list -R "$REPO" --key "$CODEBUILD_PROJECT_NAME-$REF_NAME-" --order asc \ + | head -n -3 \ + | cut -f1 \ + | xargs -I {} gh cache delete -R "$REPO" "{}" || true + + - name: Save report to cache + if: steps.cache-check.outputs.cache-hit != 'true' + uses: actions/cache/save@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3 + with: + path: ${{ github.workspace }}/.codebuild/downloads/${{ env.CODEBUILD_PROJECT_NAME }}.zip + key: ${{ env.CODEBUILD_PROJECT_NAME }}-${{ github.ref_name }}-${{ github.sha }} + + - name: Upload CodeBuild primary artifact + # env.ACT is set by the 'act' CLI tool for local testing + if: ${{ !env.ACT }} + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: ${{ env.CODEBUILD_PROJECT_NAME }}.zip + path: ${{ github.workspace }}/.codebuild/downloads/${{ env.CODEBUILD_PROJECT_NAME }}.zip + if-no-files-found: error + archive: false + + - name: Upload Evaluation 
Report + # env.ACT is set by the 'act' CLI tool for local testing + if: ${{ !env.ACT }} + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: evaluation.zip + path: ${{ github.workspace }}/.codebuild/downloads/evaluation.zip + if-no-files-found: error + archive: false + + - name: Upload Trend Report + # env.ACT is set by the 'act' CLI tool for local testing + if: ${{ !env.ACT }} + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: trend.zip + path: ${{ github.workspace }}/.codebuild/downloads/trend.zip + if-no-files-found: error + archive: false + + + - name: Extract Report Bundle from Evaluation + if: steps.cache-check.outputs.cache-hit != 'true' + run: | + DOWNLOADS="${ACT_CODEBUILD_DIR:-${GITHUB_WORKSPACE}/.codebuild/downloads}" + BUNDLE_DIR="$DOWNLOADS/report-bundle-staging" + mkdir -p "$BUNDLE_DIR" + + YAML_FILES=( + run-meta.yaml + run-metrics.yaml + test-results.yaml + contract-test-results.yaml + quality-report.yaml + qualitative-comparison.yaml + ) + for f in "${YAML_FILES[@]}"; do + unzip -j -o "$DOWNLOADS/evaluation.zip" "*/$f" -d "$BUNDLE_DIR" 2>/dev/null || true + done + + if [[ -f "$BUNDLE_DIR/run-meta.yaml" ]]; then + (cd "$BUNDLE_DIR" && zip -j "$DOWNLOADS/report-bundle.zip" "${YAML_FILES[@]}" 2>/dev/null) || true + echo "Created report-bundle.zip from evaluation.zip contents" + else + echo "WARNING: run-meta.yaml not found in evaluation.zip — report bundle will be empty" + fi + + - name: Upload Report Bundle + # env.ACT is set by the 'act' CLI tool for local testing + if: ${{ !env.ACT }} + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: >- + ${{ github.event_name == 'pull_request' + && format('report-pr-{0}', github.event.pull_request.number) + || github.ref == 'refs/heads/main' && 'report-main' + || format('report-head') }} + path: ${{ github.workspace }}/.codebuild/downloads/report-bundle.zip + 
if-no-files-found: warn + archive: false + + - name: Upload artifacts to release + if: startsWith(github.ref, 'refs/tags/v') + env: + GH_TOKEN: ${{ github.token }} + TAG: ${{ github.ref_name }} + REPO: ${{ github.repository }} + run: | + DOWNLOADS="${GITHUB_WORKSPACE}/.codebuild/downloads" + ARTIFACTS=( + "$DOWNLOADS/$CODEBUILD_PROJECT_NAME.zip" + "$DOWNLOADS/evaluation.zip" + "$DOWNLOADS/trend.zip" + ) + # Rename report bundle to match trend report fetcher pattern (report*.zip) + if [[ -f "$DOWNLOADS/report-bundle.zip" ]]; then + cp "$DOWNLOADS/report-bundle.zip" "$DOWNLOADS/report-${TAG}.zip" + ARTIFACTS+=("$DOWNLOADS/report-${TAG}.zip") + fi + + # Wait for release to exist (release.yml typically finishes in ~30s, + # CodeBuild takes minutes — this is a safety net) + RELEASE_EXISTS=false + for i in $(seq 1 30); do + if gh release view "$TAG" --repo "$REPO" --json isDraft,tagName &>/dev/null; then + RELEASE_EXISTS=true + break + fi + echo "Waiting for release $TAG (attempt $i/30)..." + sleep 10 + done + + if [[ "$RELEASE_EXISTS" == "true" ]]; then + # Release exists (draft or published) — upload/replace artifacts + IS_DRAFT=$(gh release view "$TAG" --repo "$REPO" --json isDraft --jq '.isDraft') + if [[ "$IS_DRAFT" == "true" ]]; then + echo "Draft release $TAG found — uploading artifacts" + else + echo "Published release $TAG found — attempting to replace artifacts" + fi + gh release upload "$TAG" "${ARTIFACTS[@]}" --repo "$REPO" --clobber || { + echo "WARNING: Failed to upload artifacts to release $TAG (release may be immutable)" + echo "Artifacts are still available as workflow artifacts above" + } + else + # No release exists — create a draft with artifacts + echo "No release found for $TAG — creating draft release with artifacts" + gh release create "$TAG" "${ARTIFACTS[@]}" \ + --repo "$REPO" \ + --draft \ + --title "AI-DLC Workflow ${TAG#v}" \ + --notes "Build artifacts from CodeBuild. Rules zip pending from release workflow." 
+ fi diff --git a/.github/workflows/pull-request-lint.yml b/.github/workflows/pull-request-lint.yml new file mode 100644 index 00000000..0eb5f57a --- /dev/null +++ b/.github/workflows/pull-request-lint.yml @@ -0,0 +1,186 @@ +name: Pull Request Validation + +on: + pull_request_target: + branches: [ "main" ] + types: + - edited + - labeled + - opened + - ready_for_review + - reopened + - synchronize + - unlabeled + merge_group: + types: + - checks_requested + +permissions: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true + +env: + DO_NOT_MERGE_LABEL: ${{ vars.DO_NOT_MERGE_LABEL || 'do-not-merge' }} + HALT_MERGES: ${{ vars.HALT_MERGES || '0' }} + +jobs: + get-pr-info: + permissions: + contents: read + pull-requests: read + # id-token: write + runs-on: ubuntu-latest + outputs: + pr_number: ${{ steps.get-pr.outputs.pr-number }} + pr_labels: ${{ steps.get-pr.outputs.pr-labels }} + env: + GH_TOKEN: ${{ github.token }} + PR_LABELS_JSON: ${{ toJson(github.event.pull_request.labels.*.name) }} + steps: + - name: Get PR info + id: get-pr + run: | + if [ "${{ github.event_name }}" == "merge_group" ]; then + PR_NUMBER=$(echo "${{ github.ref }}" | grep -oP '(?<=/pr-)\d+' || echo "") + PR_LABELS=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER | jq -c '[.labels[].name] // []') + echo "::group::Getting Information" + gh api repos/${{ github.repository }}/pulls/$PR_NUMBER + echo $PR_LABELS + echo "::endgroup::" + elif [ "${{ github.event_name }}" == "pull_request" -o "${{ github.event_name }}" == "pull_request_target" ]; then + PR_NUMBER="${{ github.event.pull_request.number }}" + PR_LABELS=$(echo "$PR_LABELS_JSON" | jq -c '.') + fi + echo "::group::Debug Output Values" + echo "PR_NUMBER: $PR_NUMBER" + echo "PR_LABELS: $PR_LABELS" + echo "::endgroup::" + echo "pr-number=$PR_NUMBER" >> $GITHUB_OUTPUT + echo "pr-labels=$PR_LABELS" >> $GITHUB_OUTPUT + + check-merge-status: + name: Check Merge Status + 
runs-on: ubuntu-latest + needs: get-pr-info + permissions: + pull-requests: read + if: always() + env: + GH_TOKEN: ${{ github.token }} + steps: + - run: | + PR_NUMBER="${{ needs.get-pr-info.outputs.pr_number }}" + # Default to 0 (allow all) if not set + if [ -z "$HALT_MERGES" ]; then + HALT_MERGES=0 + fi + echo "::debug::HALT_MERGES value: $HALT_MERGES" + echo "::debug::This PR number: $PR_NUMBER" + echo "::group::Open Release Pull Requests" + gh pr list --state "open" --repo "${{ github.repository }}" --json "number,headRefName" + OPEN_RELEASES=$(gh pr list --state "open" --repo "${{ github.repository }}" --json "number,headRefName" | \ + jq '[.[] | select(.headRefName | startswith("release/"))]') + echo $OPEN_RELEASES + echo "::endgroup::" + echo $OPEN_RELEASES | jq --exit-status '[.[] | select(.number != '$PR_NUMBER')] | length == 0' && \ + echo "No other open release pull requests" || \ + (echo "::warning::⚠️ Merges are rejected while there are open release pull requests" && exit 1) + if [ "$HALT_MERGES" = "0" ]; then + echo "✅ All merges are allowed (HALT_MERGES=0)" + exit 0 + elif [ "$HALT_MERGES" = "$PR_NUMBER" ]; then + echo "✅ This PR #$PR_NUMBER is explicitly allowed" + exit 0 + else + echo "::debug::🛑 Merges are blocked. HALT_MERGES is set to $HALT_MERGES" + if [ "$HALT_MERGES" -lt 0 ]; then + echo "::error::🛑 All merges are blocked" + else + echo "::warning::⚠️ Only PR #$HALT_MERGES is allowed to merge" + fi + exit 1 + fi + + fail-by-label: + name: Fail by Label + runs-on: ubuntu-latest + needs: get-pr-info + if: always() + steps: + - run: | + echo "::group::Debug Output Values" + echo "PR_LABELS: ${{ needs.get-pr-info.outputs.pr_labels }}" + echo "::endgroup::" + - name: When PR has the "${{ env.DO_NOT_MERGE_LABEL }}" label + id: pr-has-label + if: contains(needs.get-pr-info.outputs.pr_labels, env.DO_NOT_MERGE_LABEL) + run: | + echo "::error::❌ The label \"${{ env.DO_NOT_MERGE_LABEL }}\" is used to prevent merging." 
+ exit 1 + - name: When PR does not have the "${{ env.DO_NOT_MERGE_LABEL }}" label + id: pr-missing-label + if: ${{ !contains(needs.get-pr-info.outputs.pr_labels, env.DO_NOT_MERGE_LABEL) }} + run: | + echo "✅ The label \"${{ env.DO_NOT_MERGE_LABEL }}\" is absent" + exit 0 + + validate: + name: Validate PR title + runs-on: ubuntu-latest + permissions: + pull-requests: read + if: (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') + steps: + - uses: amannn/action-semantic-pull-request@48f256284bd46cdaab1048c3721360e808335d50 #v6.1.1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + types: |- + fix + feat + build + chore + ci + docs + style + refactor + perf + test + requireScope: false + + auto-label: + name: Auto-label + runs-on: ubuntu-latest + permissions: + contents: read + issues: write + pull-requests: write + if: github.event_name == 'pull_request_target' + steps: + - uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6.0.1 + with: + sync-labels: true + + contributorStatement: + name: Require Contributor Statement + runs-on: ubuntu-latest + permissions: + pull-requests: read + env: + PR_BODY: ${{ github.event.pull_request.body }} + EXPECTED: By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of the [project license](https://github.com/${{ github.repository }}/blob/main/LICENSE). + HELP: Contributor statement missing from PR description. 
Please include the following text in the PR description + if: (github.event_name == 'pull_request' || github.event_name == 'pull_request_target') && !(github.event.pull_request.user.login == 'aidlc-workflows' || github.event.pull_request.user.login == 'dependabot[bot]' || github.event.pull_request.user.login == 'github-actions' || github.event.pull_request.user.login == 'github-actions[bot]') + steps: + - uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd #v8.0.0 + with: + script: |- + const actual = process.env.PR_BODY.replace(/\r?\n/g, "\n"); + const expected = process.env.EXPECTED.replace(/\r?\n/g, "\n"); + if (!actual.includes(expected)) { + console.log("%j", actual); + console.log("%j", expected); + core.setFailed(`${process.env.HELP}: ${expected}`); + } diff --git a/.github/workflows/release-pr.yml b/.github/workflows/release-pr.yml new file mode 100644 index 00000000..191e3dbe --- /dev/null +++ b/.github/workflows/release-pr.yml @@ -0,0 +1,175 @@ +# Release PR +# +# Creates a PR with an updated CHANGELOG.md for a new release. +# The changelog is generated from conventional commits using git-cliff. +# +# When the PR is merged, tag-on-merge.yml automatically tags the merge commit, +# which triggers release.yml (draft release) and codebuild.yml (build artifacts). +# +# Usage: +# 1. Run this workflow via workflow_dispatch (optionally specify a version) +# 2. Review and merge the resulting PR +# 3. The tag is created automatically — review and publish the draft release + +name: Release PR + +on: + workflow_dispatch: + inputs: + version: + description: 'Release version (e.g., 0.2.0). Leave empty to auto-determine from conventional commits.' 
+ required: false + type: string + +permissions: {} + +jobs: + release-pr: + name: Create Release PR + permissions: + contents: write + pull-requests: write + issues: write + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + fetch-depth: 0 + + - name: Install git-cliff + uses: orhun/git-cliff-action@e16f179f0be49ecdfe63753837f20b9531642772 # v4.7.0 + with: + config: cliff.toml + args: --version + env: + OUTPUT: /dev/null + + - name: Determine version + id: version + env: + INPUT_VERSION: ${{ inputs.version }} + run: | + if [[ -n "$INPUT_VERSION" ]]; then + # Strip leading v if present for validation + VERSION="${INPUT_VERSION#v}" + if [[ ! "$VERSION" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then + echo "ERROR: Version '$INPUT_VERSION' is not valid semver (expected: MAJOR.MINOR.PATCH, e.g. 0.2.0)" + exit 1 + fi + else + VERSION=$(git-cliff --bumped-version 2>/dev/null || echo "") + if [[ -z "$VERSION" ]]; then + # Fall back to patch bump from latest tag + LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "") + if [[ -n "$LATEST_TAG" ]]; then + LATEST="${LATEST_TAG#v}" + MAJOR="${LATEST%%.*}" + REST="${LATEST#*.}" + MINOR="${REST%%.*}" + PATCH="${REST#*.}" + PATCH=$((PATCH + 1)) + VERSION="${MAJOR}.${MINOR}.${PATCH}" + echo "WARNING: No conventional commits detected — falling back to patch bump: $VERSION" + else + echo "::error::No conventional commits and no existing tags — nothing to release" + exit 1 + fi + fi + fi + # Strip leading v if present + VERSION="${VERSION#v}" + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "tag=v$VERSION" >> "$GITHUB_OUTPUT" + echo "Determined version: $VERSION (tag: v$VERSION)" + + - name: Check tag does not exist + env: + TAG: ${{ steps.version.outputs.tag }} + run: | + if git rev-parse "refs/tags/$TAG" &>/dev/null; then + echo "ERROR: Tag $TAG already exists" + exit 1 + fi + + - name: Generate changelog + uses: 
orhun/git-cliff-action@e16f179f0be49ecdfe63753837f20b9531642772 # v4.7.0 + with: + config: cliff.toml + args: --tag ${{ steps.version.outputs.tag }} + env: + OUTPUT: CHANGELOG.md + GITHUB_REPO: ${{ github.repository }} + + - name: Create release PR + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VERSION: ${{ steps.version.outputs.version }} + TAG: ${{ steps.version.outputs.tag }} + run: | + BRANCH="release/$TAG" + + # Check if branch already exists (local or remote) + if git ls-remote --exit-code --heads origin "$BRANCH" &>/dev/null; then + echo "::error::Branch '$BRANCH' already exists. A release PR may already be open — close it and delete the branch to re-run." + exit 1 + fi + + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + + echo "$VERSION" > aidlc-rules/VERSION + git add CHANGELOG.md aidlc-rules/VERSION + if git diff --cached --quiet CHANGELOG.md; then + echo "::error::No releasable commits since last tag – CHANGELOG.md is already up to date. Push new conventional commits or adjust the requested version before rerunning this workflow." + exit 1 + fi + + git checkout -b "$BRANCH" + git commit -m "docs: update changelog for $TAG" + git push origin "$BRANCH" + + label_args=() + for LABEL in "release" "rules"; do + if gh label list --search "$LABEL" --json name --jq '.[].name' | grep -qx "$LABEL"; then + label_args+=("--label" "$LABEL") + fi + done + + # Draft PR because the github-actions[bot] does not trigger a pull_request_target workflow + gh pr create \ + --title "docs: update changelog for $TAG" \ + --draft \ + --body "$(cat < [!WARNING] + > All other pull requests are blocked until merged or closed + + This pull request is for the $TAG release. + + ## Checklist (in order) + + 1. [ ] Mark the pull request "Ready for review" and label with "codebuild" to trigger required workflows + 2. 
[ ] Inspect the CHANGELOG.md and "Approve" or "Reject" the pending [CodeBuild](https://github.com/awslabs/aidlc-workflows/actions/workflows/codebuild.yml) GitHub Action + 3. [ ] Evaluate the artifacts + 4. [ ] Review the pull request (if approved set the "Merge when ready") + + ## Post Merge + + * [ ] Verify $TAG tag + * [ ] Approve "Approve" or "Reject" the pending [CodeBuild](https://github.com/awslabs/aidlc-workflows/actions/workflows/codebuild.yml) GitHub Action + * [ ] Review the drafted release artifacts for completion + * [ ] Publish the release + + > [!CAUTION] + > Simply closing this will block a subsequent $TAG release, so delete the branch or reopen the pull request if necessary + + **When merged**, the merge commit will be automatically tagged as \`$TAG\`, which triggers: + - \`release.yml\` — creates a draft GitHub Release with the rules zip + - \`codebuild.yml\` — runs CodeBuild and attaches build artifacts to the draft + + After both workflows complete, review and publish the draft release. + EOF + )" \ + "${label_args[@]}" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..309adc66 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,75 @@ +# Release Pipeline +# +# This workflow handles versioning and distribution of the AI-DLC methodology. +# It triggers when a version tag is pushed (by tag-on-merge.yml or manually) and: +# +# 1. Creates distribution artifact: +# - ai-dlc-rules-vX.X.X.zip (Rules format for Amazon Q, Kiro, etc.) +# 2. Creates a draft GitHub Release with artifact attached +# +# The release is created as a draft so that build artifacts from CodeBuild +# (codebuild.yml) can be attached before a human reviews and publishes it. +# +# Normal flow: +# 1. release-pr.yml creates a PR with CHANGELOG update (manual dispatch) +# 2. Human reviews and merges the release PR +# 3. 
tag-on-merge.yml auto-tags the merge commit → triggers this workflow + +name: Release + +on: + workflow_dispatch: {} + push: + tags: + - 'v*' + +permissions: {} + +jobs: + release: + name: Create Release + permissions: + contents: write + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + fetch-depth: 0 + + - name: Extract version + id: version + run: | + if [[ "$GITHUB_REF" != refs/tags/v* ]]; then + echo "::warning::Skipping release — this workflow must run on a v* tag (got $GITHUB_REF). Use workflow_dispatch from a tag, not a branch." + echo "skip=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + VERSION=${GITHUB_REF#refs/tags/v} + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT" + + - name: Create release artifact + if: steps.version.outputs.skip != 'true' + run: | + VERSION="${{ steps.version.outputs.version }}" + zip -r "ai-dlc-rules-v${VERSION}.zip" aidlc-rules/ + + - name: Create GitHub Release + if: steps.version.outputs.skip != 'true' + uses: softprops/action-gh-release@a06a81a03ee405af7f2048a818ed3f03bbf83c7b # v2.5.0 + with: + draft: true + name: "AI-DLC Workflow v${{ steps.version.outputs.version }}" + body: | + Release v${{ steps.version.outputs.version }} + + See [CHANGELOG.md](CHANGELOG.md) for details. + + ## Downloads + - **ai-dlc-rules** - Rules format (Amazon Q, Kiro, etc.) 
+ files: | + ai-dlc-rules-v${{ steps.version.outputs.version }}.zip + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/security-scanners.yml b/.github/workflows/security-scanners.yml new file mode 100644 index 00000000..e6d95a66 --- /dev/null +++ b/.github/workflows/security-scanners.yml @@ -0,0 +1,350 @@ +name: Security Scanners + +on: + schedule: + # Daily at 03:47 UTC (random time to avoid GitHub Actions load spikes) + - cron: '47 3 * * *' + workflow_dispatch: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + gitleaks: + permissions: + actions: read + contents: read + security-events: write + runs-on: ubuntu-latest + env: + GITLEAKS_VERSION: "8.30.1" + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + - name: Install gitleaks + run: | + curl -sSfL "https://github.com/gitleaks/gitleaks/releases/download/v${GITLEAKS_VERSION}/gitleaks_${GITLEAKS_VERSION}_linux_x64.tar.gz" | tar -xz + sudo mv gitleaks /usr/local/bin/ + gitleaks --version + - name: Run gitleaks (full history) + id: gitleaks + run: | + ARGS="" + [ -f .gitleaks.toml ] && ARGS="$ARGS --config=.gitleaks.toml" + [ -f .gitleaks-baseline.json ] && ARGS="$ARGS --baseline-path=.gitleaks-baseline.json" + set +e + gitleaks git $ARGS --report-path=gitleaks-report_sarif.json --report-format=sarif . + GITLEAKS_EXIT=$? 
+ set -e + echo "exit_code=$GITLEAKS_EXIT" >> "$GITHUB_OUTPUT" + exit 0 + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + if: always() + with: + name: gitleaks.sarif + path: gitleaks-report_sarif.json + if-no-files-found: error + - uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + continue-on-error: true + with: + sarif_file: gitleaks-report_sarif.json + - if: steps.gitleaks.outputs.exit_code != '0' + env: + SCANNER_EXIT: ${{ steps.gitleaks.outputs.exit_code }} + run: | + echo "::error::gitleaks found secrets" + exit "$SCANNER_EXIT" + + semgrep: + permissions: + actions: read + contents: read + security-events: write + runs-on: ubuntu-latest + env: + SEMGREP_VERSION: "1.157.0" + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + - run: | + echo "semgrep==$SEMGREP_VERSION" > requirements.txt + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.x' + cache: 'pip' + - run: | + pip install -r requirements.txt + rm requirements.txt + - name: Run semgrep + env: + BASELINE_SHA: ${{ github.event.pull_request.base.sha || github.event.merge_group.base_sha }} + run: | + BASELINE_ARGS="" + if [ -n "$BASELINE_SHA" ]; then + BASELINE_ARGS="--baseline-commit $BASELINE_SHA" + fi + set +e + semgrep scan --oss-only --verbose --metrics=off --config=r/all \ + --max-log-list-entries=0 \ + --sarif-output semgrep-report_sarif.json $BASELINE_ARGS + SEMGREP_EXIT=$? 
+ set -e + exit 0 + - name: Fix SARIF for GitHub compatibility + continue-on-error: true + run: | + # Fix 1: Convert text security-severity to numeric (CVSS 0.0-10.0) + sed -i \ + -e 's/"security-severity":"Info"/"security-severity":"0.0"/gI' \ + -e 's/"security-severity":"Low"/"security-severity":"2.0"/gI' \ + -e 's/"security-severity":"Medium"/"security-severity":"5.0"/gI' \ + -e 's/"security-severity":"High"/"security-severity":"7.0"/gI' \ + -e 's/"security-severity":"Critical"/"security-severity":"9.0"/gI' \ + semgrep-report_sarif.json + + # Fix 2: Truncate rule IDs exceeding 255 characters + jq ' + (.runs[0].tool.driver.rules // []) as $rules | + ([$rules[] | select((.id | length) > 255) | + {key: .id, value: (.id[0:247] + "-" + (.id | @base64 | gsub("[+/]"; "_") | .[0:7]))} + ] | from_entries) as $id_map | + .runs[0].tool.driver.rules = [ + $rules[] | + if (.id | length) > 255 then + .id = $id_map[.id] | + .name = $id_map[.name] // .name + else . end + ] | + .runs[0].results = [ + (.runs[0].results // [])[] | + if (.ruleId | length) > 255 then + .ruleId = $id_map[.ruleId] + else . 
end + ] + ' semgrep-report_sarif.json > semgrep.sarif.tmp.json && mv semgrep.sarif.tmp.json semgrep-report_sarif.json + - name: Check for ERROR severity findings + id: semgrep + run: | + # Fail only if ERROR severity findings exist (level=error in SARIF) + HIGH_COUNT=$(jq '[.runs[0].results[] | select(.level == "error")] | length' semgrep-report_sarif.json 2>/dev/null || echo 0) + if [ "$HIGH_COUNT" -gt 0 ]; then + echo "exit_code=1" >> "$GITHUB_OUTPUT" + else + echo "exit_code=0" >> "$GITHUB_OUTPUT" + fi + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: semgrep.sarif + path: semgrep-report_sarif.json + if-no-files-found: error + - uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + continue-on-error: true + with: + sarif_file: semgrep-report_sarif.json + - if: steps.semgrep.outputs.exit_code != '0' + env: + SCANNER_EXIT: ${{ steps.semgrep.outputs.exit_code }} + run: | + echo "::error::semgrep found new security issues" + exit "$SCANNER_EXIT" + + grype: + permissions: + actions: read + contents: read + security-events: write + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - run: | + curl -sSfL https://raw.githubusercontent.com/anchore/grype/dee8de483dfba5b4e0bc0aa8e4ab2ce52137e490/install.sh | sh -s -- -b /usr/local/bin v0.110.0 + grype --version + - name: Run grype + id: grype + run: | + set +e + grype --config .grype.yaml --output sarif . 
| tee grype-report_sarif.json + GRYPE_EXIT=${PIPESTATUS[0]} + set -e + echo "exit_code=$GRYPE_EXIT" >> "$GITHUB_OUTPUT" + exit 0 + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + if: always() + with: + name: grype.sarif + path: grype-report_sarif.json + if-no-files-found: error + - uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + continue-on-error: true + with: + sarif_file: grype-report_sarif.json + - if: steps.grype.outputs.exit_code != '0' + env: + SCANNER_EXIT: ${{ steps.grype.outputs.exit_code }} + run: | + echo "::error::grype found vulnerabilities" + exit "$SCANNER_EXIT" + + bandit: + permissions: + actions: read + contents: read + security-events: write + runs-on: ubuntu-latest + env: + BANDIT_VERSION: "1.9.4" + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - run: | + echo "bandit[sarif]==$BANDIT_VERSION" > requirements.txt + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.x' + cache: 'pip' + - name: Run bandit + id: bandit + run: | + pip install -r requirements.txt + rm requirements.txt + set +e + bandit -c .bandit -r scripts/aidlc-evaluator -f sarif -o bandit-report_sarif.json + BANDIT_EXIT=$? 
+ set -e + # Fail only if HIGH severity findings exist (level=error in SARIF) + HIGH_COUNT=$(jq '[.runs[0].results[] | select(.level == "error")] | length' bandit-report_sarif.json 2>/dev/null || echo 0) + if [ "$HIGH_COUNT" -gt 0 ]; then + echo "exit_code=1" >> "$GITHUB_OUTPUT" + else + echo "exit_code=0" >> "$GITHUB_OUTPUT" + fi + exit 0 + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + if: always() + with: + name: bandit.sarif + path: bandit-report_sarif.json + if-no-files-found: error + - uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + continue-on-error: true + with: + sarif_file: bandit-report_sarif.json + - if: steps.bandit.outputs.exit_code != '0' + env: + SCANNER_EXIT: ${{ steps.bandit.outputs.exit_code }} + run: | + echo "::error::bandit found security issues" + exit "$SCANNER_EXIT" + + checkov: + permissions: + actions: read + contents: read + security-events: write + runs-on: ubuntu-latest + env: + CHECKOV_VERSION: "3.2.513" + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - run: | + echo "checkov==$CHECKOV_VERSION" > requirements.txt + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: '3.x' + cache: 'pip' + - name: Run checkov + run: | + pip install -r requirements.txt + rm requirements.txt + set +e + checkov -d . --output sarif --output-file-path . + CHECKOV_EXIT=$? 
+ mv results_sarif.sarif checkov-report_sarif.json || true + set -e + exit 0 + - name: Check for ERROR severity findings + id: checkov + run: | + # Fail only if ERROR severity findings exist (level=error in SARIF) + HIGH_COUNT=$(jq '[.runs[0].results[] | select(.level == "error")] | length' checkov-report_sarif.json 2>/dev/null || echo 0) + if [ "$HIGH_COUNT" -gt 0 ]; then + echo "exit_code=1" >> "$GITHUB_OUTPUT" + else + echo "exit_code=0" >> "$GITHUB_OUTPUT" + fi + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + if: always() + with: + name: checkov.sarif + path: checkov-report_sarif.json + if-no-files-found: error + - uses: github/codeql-action/upload-sarif@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + continue-on-error: true + with: + sarif_file: checkov-report_sarif.json + - if: steps.checkov.outputs.exit_code != '0' + env: + SCANNER_EXIT: ${{ steps.checkov.outputs.exit_code }} + run: | + echo "::error::checkov found IaC issues" + exit "$SCANNER_EXIT" + + clamav: + permissions: + actions: read + contents: read + runs-on: ubuntu-latest + services: + clamav: + image: clamav/clamav@sha256:bf876a415b7ff77b9305b1de087e6d16833d170931581b01404e8761cb0dc87c + ports: + - 127.0.0.1:3310:3310 + options: >- + --health-cmd "/usr/local/bin/clamdcheck.sh" + --health-interval 10s + --health-timeout 5s + --health-retries 10 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Wait for ClamAV service + run: timeout 300 bash -c 'until echo > /dev/tcp/localhost/3310; do sleep 5; done' 2>/dev/null + - name: Install clamdscan client + run: | + sudo apt-get update || true + sudo rm -f /var/lib/man-db/auto-update + sudo apt-get install -y --no-install-recommends clamdscan + sudo mkdir -p /etc/clamav + cat << EOF | sudo tee /etc/clamav/clamd.conf + TCPSocket 3310 + TCPAddr 127.0.0.1 + EOF + clamdscan --version + - name: Run ClamAV scan + id: clamav + run: | + set +e + clamdscan --verbose 
--log=clamdscan.txt --stream --fdpass --multiscan . + CLAMAV_EXIT=$? + set -e + echo "exit_code=$CLAMAV_EXIT" >> "$GITHUB_OUTPUT" + exit 0 + - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + if: always() + with: + name: clamdscan.txt + path: clamdscan.txt + if-no-files-found: error + - if: steps.clamav.outputs.exit_code != '0' + env: + SCANNER_EXIT: ${{ steps.clamav.outputs.exit_code }} + run: | + echo "::error::clamav detected malware" + exit "$SCANNER_EXIT" diff --git a/.github/workflows/tag-on-merge.yml b/.github/workflows/tag-on-merge.yml new file mode 100644 index 00000000..adbd01b6 --- /dev/null +++ b/.github/workflows/tag-on-merge.yml @@ -0,0 +1,98 @@ +# Tag Release +# +# Automatically creates a version tag when a release PR is merged. +# Triggered by pull_request close events; only runs when: +# - The PR was actually merged (not just closed) +# - The branch name starts with "release/v" +# +# After tagging, dispatches release.yml first and waits for it to create the +# draft release, then dispatches codebuild.yml. This ensures the draft exists +# before build artifacts are uploaded. Uses workflow_dispatch (not tag push) +# because tags created with GITHUB_TOKEN don't trigger other workflows. 
+ + name: Tag Release + + on: + pull_request: + types: [closed] + + permissions: {} + + jobs: + tag: + name: Create Release Tag + permissions: + contents: write + actions: write + if: >- + github.event.pull_request.merged == true && + startsWith(github.event.pull_request.head.ref, 'release/v') + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + fetch-depth: 1 + - name: Create tag + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: ${{ github.event.pull_request.head.ref }} + SHA: ${{ github.event.pull_request.merge_commit_sha }} + REPO: ${{ github.repository }} + run: | + TAG="${BRANCH#release/}" + echo "Creating tag $TAG on merge commit $SHA" + + # Verify tag doesn't already exist + if gh api "repos/$REPO/git/refs/tags/$TAG" &>/dev/null; then + echo "ERROR: Tag $TAG already exists" + exit 1 + fi + + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + git tag -a "$TAG" "$SHA" -m "Release $TAG" + git push origin "$TAG" + + echo "Tag $TAG created on $SHA" + + - name: Dispatch release workflow and wait + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: ${{ github.event.pull_request.head.ref }} + REPO: ${{ github.repository }} + run: | + TAG="${BRANCH#release/}" + echo "Dispatching release.yml on ref $TAG" + gh workflow run release.yml --ref "$TAG" --repo "$REPO" + + # Wait for the run to appear, then watch it to completion + echo "Waiting for release workflow run to appear..." + sleep 5 + for i in $(seq 1 12); do + RUN_ID=$(gh run list --workflow=release.yml --repo "$REPO" --json databaseId,headBranch,event,status --jq '[.[] | select(.event=="workflow_dispatch")] | first | .databaseId // empty' 2>/dev/null || echo "") + if [[ -n "$RUN_ID" ]]; then + break + fi + echo "Waiting for run to appear (attempt $i/12)..." 
+ sleep 5 + done + + if [[ -z "$RUN_ID" ]]; then + echo "WARNING: Could not find release workflow run — dispatching codebuild anyway" + else + echo "Watching release workflow run $RUN_ID" + gh run watch "$RUN_ID" --repo "$REPO" --exit-status || { + echo "WARNING: Release workflow did not succeed (run $RUN_ID) — dispatching codebuild anyway" + } + fi + + - name: Dispatch codebuild workflow + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: ${{ github.event.pull_request.head.ref }} + REPO: ${{ github.repository }} + run: | + TAG="${BRANCH#release/}" + echo "Dispatching codebuild.yml on ref $TAG" + gh workflow run codebuild.yml --ref "$TAG" --repo "$REPO" diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..286d2384 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +.DS_Store +.amazonq/** +.claude/** +!.claude/settings.json +.codebuild/** +.worktrees/** +.vscode/** +.env +buildspec.yml +codebuild_build.sh diff --git a/.gitleaks-baseline.json b/.gitleaks-baseline.json new file mode 100644 index 00000000..be79ffe7 --- /dev/null +++ b/.gitleaks-baseline.json @@ -0,0 +1,254 @@ +[ + { + "RuleID": "jwt", + "Description": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", + "StartLine": 29, + "EndLine": 29, + "StartColumn": 40, + "EndColumn": 195, + "Match": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c\"", + "Secret": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "aaca23db0bb62347189d2305536ad1c1d2e991f3", + "Link": 
"https://github.com/awslabs/aidlc-workflows/blob/aaca23db0bb62347189d2305536ad1c1d2e991f3/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L29", + "Entropy": 5.4440703, + "Author": "Jeff Harman", + "Email": "109810187+harmjeff@users.noreply.github.com", + "Date": "2026-03-19T17:06:40Z", + "Message": "feat: add AIDLC Evaluation \u0026 Reporting Framework (#115)\n\n* feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\n\n* test: fix cross-platform compatibility in test_run_command.py\n\nReplace shell-specific commands with Python equivalents to ensure tests\npass on all platforms (Windows/Mac/Linux) when using shell=False:\n\n- Replace `echo 'content' \u003e file` with Python pathlib file writing\n- Replace shell builtin `exit N` with Python `sys.exit(N)`\n- Replace `echo 'msg' \u003e\u00262` with Python `sys.stderr.write()`\n- Update command-not-found test to handle both OSError and exit code 127\n\nAll 245 tests now pass successfully on Windows.\n\nCo-Authored-By: Claude Sonnet 4.5 \u003cnoreply@anthropic.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Remove region profile\n\n* More profile updates\n\n* More profile updates\n\n* Update scripts/aidlc-evaluator/pyproject.toml\n\nCo-authored-by: Scott 
Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Make docker builder script executable\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust 
\u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n---------\n\nCo-authored-by: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e", + "Tags": [], + "Fingerprint": "aaca23db0bb62347189d2305536ad1c1d2e991f3:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:jwt:29" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 36, + "EndLine": 36, + "StartColumn": 18, + "EndColumn": 67, + "Match": "GITHUB_TOKEN=ghp_1234567890abcdefghijklmnopqrstuv\"", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": 
"scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "aaca23db0bb62347189d2305536ad1c1d2e991f3", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/aaca23db0bb62347189d2305536ad1c1d2e991f3/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L36", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "109810187+harmjeff@users.noreply.github.com", + "Date": "2026-03-19T17:06:40Z", + "Message": "feat: add AIDLC Evaluation \u0026 Reporting Framework (#115)\n\n* feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\n\n* test: fix cross-platform compatibility in test_run_command.py\n\nReplace shell-specific commands with Python equivalents to ensure tests\npass on all platforms (Windows/Mac/Linux) when using shell=False:\n\n- Replace `echo 'content' \u003e file` with Python pathlib file writing\n- Replace shell builtin `exit N` with Python `sys.exit(N)`\n- Replace `echo 'msg' \u003e\u00262` with Python `sys.stderr.write()`\n- Update command-not-found test to handle both OSError and exit code 127\n\nAll 245 tests now pass successfully on Windows.\n\nCo-Authored-By: Claude Sonnet 4.5 \u003cnoreply@anthropic.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust 
\u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Remove region profile\n\n* More profile updates\n\n* More profile updates\n\n* Update scripts/aidlc-evaluator/pyproject.toml\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Make docker builder script executable\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust 
\u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n---------\n\nCo-authored-by: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e", + "Tags": [], + "Fingerprint": "aaca23db0bb62347189d2305536ad1c1d2e991f3:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:36" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 63, + "EndLine": 63, + 
"StartColumn": 18, + "EndColumn": 58, + "Match": "api_key=a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4\"", + "Secret": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "aaca23db0bb62347189d2305536ad1c1d2e991f3", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/aaca23db0bb62347189d2305536ad1c1d2e991f3/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L63", + "Entropy": 3.561278, + "Author": "Jeff Harman", + "Email": "109810187+harmjeff@users.noreply.github.com", + "Date": "2026-03-19T17:06:40Z", + "Message": "feat: add AIDLC Evaluation \u0026 Reporting Framework (#115)\n\n* feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\n\n* test: fix cross-platform compatibility in test_run_command.py\n\nReplace shell-specific commands with Python equivalents to ensure tests\npass on all platforms (Windows/Mac/Linux) when using shell=False:\n\n- Replace `echo 'content' \u003e file` with Python pathlib file writing\n- Replace shell builtin `exit N` with Python `sys.exit(N)`\n- Replace `echo 'msg' \u003e\u00262` with Python `sys.stderr.write()`\n- Update command-not-found test to handle both OSError and exit code 127\n\nAll 245 tests now pass successfully on Windows.\n\nCo-Authored-By: Claude Sonnet 4.5 \u003cnoreply@anthropic.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update 
scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Remove region profile\n\n* More profile updates\n\n* More profile updates\n\n* Update scripts/aidlc-evaluator/pyproject.toml\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Make docker builder script executable\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update 
scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n---------\n\nCo-authored-by: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e", + "Tags": [], + "Fingerprint": "aaca23db0bb62347189d2305536ad1c1d2e991f3:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:63" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic 
API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 73, + "EndLine": 73, + "StartColumn": 10, + "EndColumn": 51, + "Match": "TOKEN=ghp_1234567890abcdefghijklmnopqrstuv", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "aaca23db0bb62347189d2305536ad1c1d2e991f3", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/aaca23db0bb62347189d2305536ad1c1d2e991f3/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L73", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "109810187+harmjeff@users.noreply.github.com", + "Date": "2026-03-19T17:06:40Z", + "Message": "feat: add AIDLC Evaluation \u0026 Reporting Framework (#115)\n\n* feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\n\n* test: fix cross-platform compatibility in test_run_command.py\n\nReplace shell-specific commands with Python equivalents to ensure tests\npass on all platforms (Windows/Mac/Linux) when using shell=False:\n\n- Replace `echo 'content' \u003e file` with Python pathlib file writing\n- Replace shell builtin `exit N` with Python `sys.exit(N)`\n- Replace `echo 'msg' \u003e\u00262` with Python `sys.stderr.write()`\n- Update command-not-found test to handle both OSError and exit code 127\n\nAll 245 tests now pass successfully on Windows.\n\nCo-Authored-By: Claude Sonnet 4.5 \u003cnoreply@anthropic.com\u003e\n\n* Update 
scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Remove region profile\n\n* More profile updates\n\n* More profile updates\n\n* Update scripts/aidlc-evaluator/pyproject.toml\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Make docker builder script executable\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update 
scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n---------\n\nCo-authored-by: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e", + "Tags": [], + "Fingerprint": 
"aaca23db0bb62347189d2305536ad1c1d2e991f3:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:73" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 107, + "EndLine": 107, + "StartColumn": 15, + "EndColumn": 60, + "Match": "token\": \"ghp_1234567890abcdefghijklmnopqrstuv\"", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "aaca23db0bb62347189d2305536ad1c1d2e991f3", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/aaca23db0bb62347189d2305536ad1c1d2e991f3/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L107", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "109810187+harmjeff@users.noreply.github.com", + "Date": "2026-03-19T17:06:40Z", + "Message": "feat: add AIDLC Evaluation \u0026 Reporting Framework (#115)\n\n* feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\n\n* test: fix cross-platform compatibility in test_run_command.py\n\nReplace shell-specific commands with Python equivalents to ensure tests\npass on all platforms (Windows/Mac/Linux) when using shell=False:\n\n- Replace `echo 'content' \u003e file` with Python pathlib file writing\n- Replace shell builtin `exit N` with Python `sys.exit(N)`\n- Replace `echo 'msg' \u003e\u00262` with Python `sys.stderr.write()`\n- Update command-not-found test to handle both OSError and exit code 
127\n\nAll 245 tests now pass successfully on Windows.\n\nCo-Authored-By: Claude Sonnet 4.5 \u003cnoreply@anthropic.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Remove region profile\n\n* More profile updates\n\n* More profile updates\n\n* Update scripts/aidlc-evaluator/pyproject.toml\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Make docker builder script executable\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder 
fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n---------\n\nCo-authored-by: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\nCo-authored-by: Scott 
Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e", + "Tags": [], + "Fingerprint": "aaca23db0bb62347189d2305536ad1c1d2e991f3:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:107" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 119, + "EndLine": 119, + "StartColumn": 15, + "EndColumn": 60, + "Match": "token\": \"ghp_1234567890abcdefghijklmnopqrstuv\"", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "aaca23db0bb62347189d2305536ad1c1d2e991f3", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/aaca23db0bb62347189d2305536ad1c1d2e991f3/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L119", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "109810187+harmjeff@users.noreply.github.com", + "Date": "2026-03-19T17:06:40Z", + "Message": "feat: add AIDLC Evaluation \u0026 Reporting Framework (#115)\n\n* feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\n\n* test: fix cross-platform compatibility in test_run_command.py\n\nReplace shell-specific commands with Python equivalents to ensure tests\npass on all platforms (Windows/Mac/Linux) when using shell=False:\n\n- Replace `echo 'content' \u003e file` with Python pathlib file writing\n- Replace shell builtin `exit N` with Python `sys.exit(N)`\n- Replace `echo 'msg' \u003e\u00262` 
with Python `sys.stderr.write()`\n- Update command-not-found test to handle both OSError and exit code 127\n\nAll 245 tests now pass successfully on Windows.\n\nCo-Authored-By: Claude Sonnet 4.5 \u003cnoreply@anthropic.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Remove region profile\n\n* More profile updates\n\n* More profile updates\n\n* Update scripts/aidlc-evaluator/pyproject.toml\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Make docker builder script executable\n\n* Update scripts/aidlc-evaluator/README.md\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* 
Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/shared/src/shared/sandbox.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n* Update scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py\n\nCodebuilder fixes\n\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e\n\n---------\n\nCo-authored-by: 
Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e\nCo-authored-by: Scott Schreckengaust \u003cscottschreckengaust@users.noreply.github.com\u003e", + "Tags": [], + "Fingerprint": "aaca23db0bb62347189d2305536ad1c1d2e991f3:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:119" + }, + { + "RuleID": "jwt", + "Description": "Uncovered a JSON Web Token, which may lead to unauthorized access to web applications and sensitive user data.", + "StartLine": 29, + "EndLine": 29, + "StartColumn": 40, + "EndColumn": 195, + "Match": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c\"", + "Secret": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/bdecf64bd99dbd80b700dde83d23eeb5b65b848c/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L29", + "Entropy": 5.4440703, + "Author": "Jeff Harman", + "Email": "harmjeff@amazon.com", + "Date": "2026-03-12T20:01:35Z", + "Message": "feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e", + "Tags": [], + "Fingerprint": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:jwt:29" + }, + { + 
"RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 36, + "EndLine": 36, + "StartColumn": 18, + "EndColumn": 67, + "Match": "GITHUB_TOKEN=ghp_1234567890abcdefghijklmnopqrstuv\"", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/bdecf64bd99dbd80b700dde83d23eeb5b65b848c/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L36", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "harmjeff@amazon.com", + "Date": "2026-03-12T20:01:35Z", + "Message": "feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e", + "Tags": [], + "Fingerprint": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:36" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 63, + "EndLine": 63, + "StartColumn": 18, + "EndColumn": 58, + "Match": "api_key=a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4\"", + "Secret": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c", + "Link": 
"https://github.com/awslabs/aidlc-workflows/blob/bdecf64bd99dbd80b700dde83d23eeb5b65b848c/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L63", + "Entropy": 3.561278, + "Author": "Jeff Harman", + "Email": "harmjeff@amazon.com", + "Date": "2026-03-12T20:01:35Z", + "Message": "feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e", + "Tags": [], + "Fingerprint": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:63" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 73, + "EndLine": 73, + "StartColumn": 10, + "EndColumn": 51, + "Match": "TOKEN=ghp_1234567890abcdefghijklmnopqrstuv", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/bdecf64bd99dbd80b700dde83d23eeb5b65b848c/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L73", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "harmjeff@amazon.com", + "Date": "2026-03-12T20:01:35Z", + "Message": "feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes 
pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e", + "Tags": [], + "Fingerprint": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:73" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + "StartLine": 107, + "EndLine": 107, + "StartColumn": 15, + "EndColumn": 60, + "Match": "token\": \"ghp_1234567890abcdefghijklmnopqrstuv\"", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/bdecf64bd99dbd80b700dde83d23eeb5b65b848c/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L107", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "harmjeff@amazon.com", + "Date": "2026-03-12T20:01:35Z", + "Message": "feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e", + "Tags": [], + "Fingerprint": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:107" + }, + { + "RuleID": "generic-api-key", + "Description": "Detected a Generic API Key, potentially exposing access to various services and sensitive operations.", + 
"StartLine": 119, + "EndLine": 119, + "StartColumn": 15, + "EndColumn": 60, + "Match": "token\": \"ghp_1234567890abcdefghijklmnopqrstuv\"", + "Secret": "ghp_1234567890abcdefghijklmnopqrstuv", + "File": "scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py", + "SymlinkFile": "", + "Commit": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c", + "Link": "https://github.com/awslabs/aidlc-workflows/blob/bdecf64bd99dbd80b700dde83d23eeb5b65b848c/scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py#L119", + "Entropy": 5.003258, + "Author": "Jeff Harman", + "Email": "harmjeff@amazon.com", + "Date": "2026-03-12T20:01:35Z", + "Message": "feat: add aidlc-evaluator framework\n\nEvaluation and reporting framework for validating AI-DLC workflow changes.\nIncludes execution, qualitative/quantitative scoring, contract testing,\nreporting packages, and CLI/IDE harness adapters.\n\nAlso fixes pytest import-mode collision for same-named test files across\npackages, and documents known Windows test_run_command.py failures.\n\nCo-Authored-By: Claude Opus 4.6 \u003cnoreply@anthropic.com\u003e", + "Tags": [], + "Fingerprint": "bdecf64bd99dbd80b700dde83d23eeb5b65b848c:scripts/aidlc-evaluator/packages/shared/tests/test_credential_scrubber.py:generic-api-key:119" + } +] diff --git a/.gitleaks.toml b/.gitleaks.toml new file mode 100644 index 00000000..b9f4b516 --- /dev/null +++ b/.gitleaks.toml @@ -0,0 +1,15 @@ +# Gitleaks configuration +# https://github.com/gitleaks/gitleaks#configuration + +title = "aidlc-workflows gitleaks config" + +[extend] +# Extend the default ruleset (don't replace it) +useDefault = true + +[allowlist] +description = "Global allowlist" +paths = [ + '''\.gitleaks-baseline\.json$''', + '''uv\.lock$''', +] diff --git a/.grype.yaml b/.grype.yaml new file mode 100644 index 00000000..1b6511e1 --- /dev/null +++ b/.grype.yaml @@ -0,0 +1,22 @@ +# Grype configuration +# https://github.com/anchore/grype#configuration + +# Only fail on high or 
critical vulnerabilities +fail-on-severity: high + +# Ignore specific CVEs that have been reviewed and accepted. +# +# Grype is an SCA scanner (dependencies, not source lines), so there are no +# inline source-code comments. All suppressions go here. +# +# To suppress a finding, add an entry with the CVE and a reason: +# - vulnerability: CVE-YYYY-NNNNN +# reason: "explanation of why this is acceptable" +# +# You can also scope a suppression to a specific package: +# - vulnerability: CVE-YYYY-NNNNN +# package: +# name: "package-name" +# version: "1.2.3" +# reason: "only affects feature X which we don't use" +ignore: [] diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml new file mode 100644 index 00000000..8e55bcc4 --- /dev/null +++ b/.markdownlint-cli2.yaml @@ -0,0 +1,39 @@ +# markdownlint-cli2 configuration +# https://github.com/DavidAnson/markdownlint-cli2 +# Run: npx markdownlint-cli2 "**/*.md" +# Fix: npx markdownlint-cli2 --fix "**/*.md" + +config: + # ============================================================ + # PERMANENTLY DISABLED — conflict with project documentation style + # ============================================================ + + # Line-length — long URLs, tables, code examples, ASCII diagrams + MD013: false + + # Inline HTML — tags for screenshots/badges in README + MD033: false + + # Duplicate headings — section names repeat across platform guides + MD024: false + + # Emphasis as heading — bold text used as sub-labels in lists + MD036: false + + # ============================================================ + # STYLE SETTINGS + # ============================================================ + + # Tables must use aligned column style (pipes vertically aligned) + MD060: + style: "aligned" + +# Ignore generated/vendored/test fixture files +ignores: + - "node_modules/**" + - ".claude/**" + - "scripts/aidlc-evaluator/test_cases/**" + # CHANGELOG.md is auto-generated by git-cliff (cliff.toml controls its format). 
+ # git-cliff postprocessors run per-body so inter-body spacing and trailing + # whitespace cannot be fully controlled via template alone. + - "CHANGELOG.md" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..b06400d1 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,5 @@ +repos: + - repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.22.0 + hooks: + - id: markdownlint-cli2 diff --git a/.semgrepignore b/.semgrepignore new file mode 100644 index 00000000..8b3c1b97 --- /dev/null +++ b/.semgrepignore @@ -0,0 +1,20 @@ +# Semgrep ignore patterns +# https://semgrep.dev/docs/ignoring-files-folders-code + +# Lock files — auto-generated by uv; large (~2k lines) and findings are +# not actionable since dependency versions are managed via pyproject.toml. +uv.lock + +# Test fixtures and golden files — contain sample AIDLC documents and +# expected outputs for evaluator tests; not production code. +scripts/aidlc-evaluator/test_cases/ + +# CodeBuild build specification files — scanned separately as part of the +# CI pipeline's own review process. +.codebuild/ + +# Gitleaks baseline — records pre-existing known findings so gitleaks can +# detect only *new* secrets. NOTE: the baseline file includes matched text +# from findings (the "Secret" and "Match" fields), so treat it as sensitive. +# Current entries are all synthetic test credentials from test_credential_scrubber.py. +.gitleaks-baseline.json diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..97f42cb3 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,122 @@ +# Changelog + +All notable changes to this project will be documented in this file. 
+ +## [Unreleased] + +## [0.1.7] - 2026-04-02 + +### Bug Fixes + +- add required environmental github token (#137) +- Add security extension disclaimer (#134) +- refactor error handling and PR creation in release workflow (#140) +- address PR #140 review feedback for release workflow (#141) +- remove retention-days limit from CodeBuild workflow artifacts (#149) +- skip PR comment steps for fork PRs with read-only GITHUB_TOKEN (#154) +- correct GitHub API path for deleting label-reminder comment (#157) +- remove report-bundle CodeBuild secondary artifact and add --local-run-dir support (#162) +- use PR head branch for rules-ref instead of merge ref (#168) +- write aidlc-rules/VERSION in release PR to trigger CodeBuild (#169) + +### CI/CD + +- add markdownlint infrastructure (config, CI workflow, pre-commit) +- fix MD041 in CODE_OF_CONDUCT.md, re-enable rule + +### Documentation + +- add developer's guide for running CodeBuild locally (#94) +- add working-with-aidlc interaction guide and writing-inputs documents (#121) +- comprehensive documentation review and remediation (#113) + +- enforce MD060 aligned table style, fix 1645 violations + +## [0.1.6] - 2026-03-05 + +### Bug Fixes + +- codebuild cache and download fix (#93) +- correct copy-paste error in error-handling.md (#96) + +### Features + +- add code owners (#112) +- changelog-first release flow with build artifacts on draft releases (#125) +- add AIDLC Evaluation & Reporting Framework (#115) +- update pull request linting conditions (#131) +- add cross-release trend reporting package (#136) +- align CodeBuild workflow with current evaluator CLI and add trend report pipeline (#147) +- gate CodeBuild on 'codebuild' label + aidlc-rules paths (#150) +- auto-label PRs touching aidlc-rules/ with codebuild label (#158) + +### Miscellaneous + +- bump pyjwt in /scripts/aidlc-evaluator (#129) +- bump pillow in /scripts/aidlc-evaluator (#130) +- bump requests in /scripts/aidlc-evaluator (#146) +- bump cryptography in 
/scripts/aidlc-evaluator (#148) +- bump pygments in /scripts/aidlc-evaluator (#151) +- bump aiohttp in /scripts/aidlc-evaluator (#163) + +## [0.1.5] - 2026-02-24 + +### Features + +- add CodeBuild workflow (#92) + +### Miscellaneous + +- add templates for github issues (#97) + +## [0.1.4] - 2026-02-24 + +### Bug Fixes + +- correct GitHub Copilot instructions and Kiro CLI rule-details path resolution (#82, #84) (#87) + +## [0.1.3] - 2026-02-11 + +### Bug Fixes + +- require actual system time for audit timestamps (#56) + +### Documentation + +- clarify ZIP download location and consolidate notes (#70) + +## [0.1.2] - 2026-02-08 + +### Bug Fixes + +- typo in core-workflow.md +- rename rule and move to bottom of Critical Rules section + +### Documentation + +- update README to direct users to GitHub Releases (#61) +- add Windows CMD setup instructions and ZIP note (#68) + +### Features + +- add test automation friendly code generation rules +- add frontend design coverage in Construction phase + +## [0.1.1] - 2026-01-22 + +### Features + +- adding AIDLC skill to work with IDEs such as Claude, OpenCode and others +- addin +- add leo file + +### Miscellaneous + +- removing wrong files +- removing wrong files + +## [0.1.0] - 2026-01-22 + +### Features + +- add Kiro CLI support and multi-platform architecture diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 5b627cfa..240d2275 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,4 +1,5 @@ -## Code of Conduct +# Code of Conduct + This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. +opensource-codeofconduct@amazon.com with any additional questions or comments. 
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c4b6a1c5..b63c11b2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,58 +1,68 @@ # Contributing Guidelines -Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional -documentation, we greatly value feedback and contributions from our community. +Thank you for your interest in contributing to AI-DLC. Whether it's a bug report, new rule, correction, or documentation improvement, we value feedback and contributions from the community. -Please read through this document before submitting any issues or pull requests to ensure we have all the necessary -information to effectively respond to your bug report or contribution. +Please read through this document before submitting any issues or pull requests. +## Tenets -## Reporting Bugs/Feature Requests +Before contributing, familiarize yourself with our [tenets](README.md#tenets). -We welcome you to use the GitHub issue tracker to report bugs or suggest features. +## Contributing Rules -When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already -reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: +AI-DLC rules live in `aidlc-rules/aws-aidlc-rule-details/`. When contributing: -* A reproducible test case or series of steps -* The version of our code being used -* Any modifications you've made relevant to the bug -* Anything unusual about your environment or deployment +- **Be reproducible**: Changes should be consistently reproducible either via test case or a series of steps +- **Single source of truth**: Don't duplicate content. If guidance applies to multiple stages, put it in `common/` and reference it. +- **Keep it agnostic**: The core methodology shouldn't assume specific IDEs, agents, or models. Tool-specific files are generated from the source. 
+### Rule Structure -## Contributing via Pull Requests -Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: +Rules are organized by phase: + +- `common/` - Shared guidance across all phases +- `inception/` - Planning and architecture rules +- `construction/` - Design and implementation rules +- `operations/` - Deployment and monitoring rules + +### Testing Changes + +Test your rule changes with at least one supported platform (Amazon Q Developer, Kiro, or other tools) before submitting. Describe what you tested in your PR. -1. You are working against the latest source on the *main* branch. -2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. -3. You open an issue to discuss any significant work - we would hate for your time to be wasted. +## Reporting Bugs/Feature Requests + +Use GitHub issues to report bugs or suggest features. Before filing, check existing issues to avoid duplicates. -To send us a pull request, please: +Include: -1. Fork the repository. -2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. -3. Ensure local tests pass. -4. Commit to your fork using clear commit messages. -5. Send us a pull request, answering any default questions in the pull request interface. -6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. +- Which rule or stage is affected +- Expected vs actual behavior +- The platform/model you tested with -GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and -[creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 
+## Contributing via Pull Requests +Before sending a pull request: -## Finding contributions to work on -Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. +1. Work against the latest `main` branch +2. Check existing open and recently merged PRs +3. Open an issue first for significant changes +To submit: + +1. Fork the repository +2. Make your changes (keep them focused) +3. Use clear commit messages following [conventional commits](https://www.conventionalcommits.org/) (e.g., `feat:`, `fix:`, `docs:`) +4. Submit the PR and respond to feedback ## Code of Conduct + This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. +For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. -## Security issue notifications -If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. +## Security Issue Notifications +If you discover a potential security issue, notify AWS/Amazon Security via the [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do not create a public GitHub issue. ## Licensing diff --git a/README.md b/README.md index 04fca610..bb3d12ea 100644 --- a/README.md +++ b/README.md @@ -1,100 +1,754 @@ # AI-DLC (AI-Driven Development Life Cycle) +> [!IMPORTANT] +> Generative AI can make mistakes. 
You should consider reviewing all output and costs generated by your chosen AI model and agentic coding assistant. See [AWS Responsible AI Policy](https://aws.amazon.com/ai/responsible-ai/policy/). + + AI-DLC is an intelligent software development workflow that adapts to your needs, maintains quality standards, and keeps you in control of the process. For learning more about AI-DLC Methodology, read this [blog](https://aws.amazon.com/blogs/devops/ai-driven-development-life-cycle/) and the [Method Definition Paper](https://prod.d13rzhkk8cj2z0.amplifyapp.com/) referred in it. +## Table of Contents + +- [Quick Start](#quick-start) +- [Platform-Specific Setup](#platform-specific-setup) +- [Usage](#usage) +- [Three-Phase Adaptive Workflow](#three-phase-adaptive-workflow) +- [Key Features](#key-features) +- [Extensions](#extensions) +- [Tenets](#tenets) +- [Prerequisites](#prerequisites) +- [Troubleshooting](#troubleshooting) +- [Version Control Recommendations](#version-control-recommendations) +- [Additional Resources](#additional-resources) +- [Security](#security) +- [License](#license) + +--- + ## Quick Start -### Installation +1. Download the latest release zip from the [Releases page](../../releases/latest) to a folder **outside** your project directory (e.g., `~/Downloads`). +2. Extract the zip. It contains an `aidlc-rules/` folder with two subdirectories: + - `aws-aidlc-rules/` — the core AI-DLC workflow rules + - `aws-aidlc-rule-details/` — detailed rules conditionally referenced by the core rules +3. Follow the setup instructions for your coding agent and platform below. + +--- + +## Platform-Specific Setup + +- [Kiro](#kiro) +- [Amazon Q Developer IDE Plugin](#amazon-q-developer-ide-pluginextension) +- [Cursor IDE](#cursor-ide) +- [Cline](#cline) +- [Claude Code](#claude-code) +- [GitHub Copilot](#github-copilot) +- [Other Agents](#other-agents) + +--- -Set up the AI-DLC rule files as part of your [supported platform](#prerequisites). 
+### Kiro -#### Kiro CLI +AI-DLC uses [Kiro Steering Files](https://kiro.dev/docs/cli/steering/) within your project workspace. -AI-DLC uses [Kiro Steering Files](https://kiro.dev/docs/cli/steering/) within your project workspace to implement its intelligent workflow. To activate AI-DLC in your project, copy the rules to your project's workspace under the `/.kiro/steering` folder. +The commands below assume you extracted the zip to your `Downloads` folder. If you used a different location, replace `Downloads` with your actual folder path. + +On macOS/Linux: ```bash -git clone -cd ../my-project # assuming your project is located under the same parent folder as the cloned repo -mkdir -p .kiro/steering && cp -R ../aidlc-workflows/aidlc-rules .kiro/steering +mkdir -p .kiro/steering +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rules .kiro/steering/ +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details .kiro/ ``` -To confirm that the AI-DLC rules are correctly loaded in your Kiro CLI, follow these steps: +On Windows (PowerShell): + +```powershell +New-Item -ItemType Directory -Force -Path ".kiro\steering" +Copy-Item -Recurse "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules" ".kiro\steering\" +Copy-Item -Recurse "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details" ".kiro\" +``` -1. Start Kiro CLI: `kiro-cli` +On Windows (CMD): -2. Check your context contents: `/context show` +```cmd +mkdir .kiro\steering +xcopy %USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules .kiro\steering\aws-aidlc-rules\ /E /I +xcopy %USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details .kiro\aws-aidlc-rule-details\ /E /I +``` + +Your project should look like: + +```text +/ + ├── .kiro/ + │ ├── steering/ + │ │ ├── aws-aidlc-rules/ + │ ├── aws-aidlc-rule-details/ +``` -3. Verify that you see all entries for `.kiro/steering/aidlc-rules` in the displayed list of rules. 
+To verify the rules are loaded: -If you do not see them, please check the directory where you previously issued the `cp` command. Ensure that `aidlc-rules` folder was successfully copied to the correct location. The `.kiro` directory must sit directly below the project root. +#### Verify in Kiro IDE -![AI-DLC Rules in Kiro CLI](./assets/images/kiro-cli-aidlc-rules-loaded.png?raw=true "AI-DLC Rules in Kiro CLI") +Open the steering files panel and confirm you see an entry for `core-workflow` under `Workspace` as shown in the screenshot below. -#### Amazon Q Developer IDE Plugin/Extension +AI-DLC Rules in Kiro IDE -AI-DLC uses [Amazon Q Rules](https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/context-project-rules.html) to implement its intelligent workflow. To activate AI-DLC in your project, copy the rules to your project's workspace under the `/.amazonq` folder. +We use Kiro IDE in Vibe mode to run the AI-DLC workflow. This ensures that AI-DLC workflow guides the development workflow in Kiro. At times, Kiro may nudge you to switch to spec mode. Select `No` to such prompts to stay in Vibe mode. + +Staying in Kiro Vibe mode + +#### Verify in Kiro CLI + +Run `kiro-cli`, then `/context show`, and confirm entries for `.kiro/steering/aws-aidlc-rules`. + +AI-DLC Rules in Kiro CLI + +--- + +### Amazon Q Developer IDE Plugin/Extension + +AI-DLC uses [Amazon Q Rules](https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/context-project-rules.html) within your project workspace. + +The commands below assume you extracted the zip to your `Downloads` folder. If you used a different location, replace `Downloads` with your actual folder path. 
+ +On macOS/Linux: ```bash -git clone -cd my-project # assuming your project is located under the same parent folder as the cloned repo -mkdir -p .amazonq/rules && cp -R ../aidlc-workflows/aidlc-rules .amazonq/rules +mkdir -p .amazonq/rules +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rules .amazonq/rules/ +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details .amazonq/ ``` -To confirm that the Amazon Q Rules are correctly loaded in your IDE, follow these steps: +On Windows (PowerShell): -1. In the Amazon Q Chat window, locate the `Rules` button in the lower right corner and click on it. +```powershell +New-Item -ItemType Directory -Force -Path ".amazonq\rules" +Copy-Item -Recurse "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules" ".amazonq\rules\" +Copy-Item -Recurse "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details" ".amazonq\" +``` -2. Verify that you see entries for `.amazonq/rules/aidlc-rules` in the displayed list of rules. +On Windows (CMD): -If you do not see the `aidlc-rules` rules loaded, please check the directory where you previously issued the `cp` command. Ensure that `aidlc-rules` folder was successfully copied to the correct location. The `.amazonq` directory must sit directly below the project root. +```cmd +mkdir .amazonq\rules +xcopy %USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules .amazonq\rules\aws-aidlc-rules\ /E /I +xcopy %USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details .amazonq\aws-aidlc-rule-details\ /E /I +``` + +Your project should look like: + +```text +/ + ├── .amazonq/ + │ ├── rules/ + │ │ ├── aws-aidlc-rules/ + │ ├── aws-aidlc-rule-details/ +``` + +To verify the rules are loaded: + +1. In the Amazon Q Chat window, click the `Rules` button in the lower right corner. +2. Confirm you see entries for `.amazonq/rules/aws-aidlc-rules`. + +AI-DLC Rules in Q Developer IDE plugin + +--- + +### Cursor IDE + +AI-DLC uses [Cursor Rules](https://cursor.com/docs/context/rules) to implement its intelligent workflow. 
+ +The commands below assume you extracted the zip to your `Downloads` folder. If you used a different location, replace `Downloads` with your actual folder path. + +#### Option 1: Project Rules (Recommended) + +**Unix/Linux/macOS:** + +```bash +mkdir -p .cursor/rules + +cat > .cursor/rules/ai-dlc-workflow.mdc << 'EOF' +--- +description: "AI-DLC (AI-Driven Development Life Cycle) adaptive workflow for software development" +alwaysApply: true +--- + +EOF +cat ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md >> .cursor/rules/ai-dlc-workflow.mdc + +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` + +**Windows PowerShell:** + +```powershell +New-Item -ItemType Directory -Force -Path ".cursor\rules" + +$frontmatter = @" +--- +description: "AI-DLC (AI-Driven Development Life Cycle) adaptive workflow for software development" +alwaysApply: true +--- + +"@ +$frontmatter | Out-File -FilePath ".cursor\rules\ai-dlc-workflow.mdc" -Encoding utf8 + +Get-Content "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" | Add-Content ".cursor\rules\ai-dlc-workflow.mdc" + +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +mkdir .cursor\rules + +( +echo --- +echo description: "AI-DLC (AI-Driven Development Life Cycle) adaptive workflow for software development" +echo alwaysApply: true +echo --- +echo. 
+) > .cursor\rules\ai-dlc-workflow.mdc + +type "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" >> .cursor\rules\ai-dlc-workflow.mdc + +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` -![AI-DLC Rules in Q Developer IDE](./assets/images/q-ide-aidlc-rules-loaded.png?raw=true "AI-DLC Rules in Q Developer") +#### Option 2: AGENTS.md (Simple Alternative) -### Usage +**Unix/Linux/macOS:** + +```bash +cp ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md ./AGENTS.md +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` + +**Windows PowerShell:** + +```powershell +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".\AGENTS.md" +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +copy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".\AGENTS.md" +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` + +**Verify Setup:** + +1. Open **Cursor Settings → Rules, Commands** +2. Under **Project Rules**, you should see `ai-dlc-workflow` listed +3. For `AGENTS.md`, it will be automatically detected and applied + +![AI-DLC Rules in Cursor](./assets/images/cursor-ide-aidlc-rules-loaded.png?raw=true "AI-DLC Rules in Cursor") + +**Directory Structure (Option 1):** + +```text +/ +├── .cursor/ +│ └── rules/ +│ └── ai-dlc-workflow.mdc +└── .aidlc-rule-details/ + ├── common/ + ├── inception/ + ├── construction/ + ├── extensions/ + └── operations/ +``` + +--- + +### Cline + +AI-DLC uses Cline Rules to implement its intelligent workflow. + +The commands below assume you extracted the zip to your `Downloads` folder. 
If you used a different location, replace `Downloads` with your actual folder path. + +#### Option 1: .clinerules Directory (Recommended) + +**Unix/Linux/macOS:** + +```bash +mkdir -p .clinerules +cp ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md .clinerules/ +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` + +**Windows PowerShell:** + +```powershell +New-Item -ItemType Directory -Force -Path ".clinerules" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".clinerules\" +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +mkdir .clinerules +copy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".clinerules\" +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` + +#### Option 2: AGENTS.md (Alternative) + +**Unix/Linux/macOS:** + +```bash +cp ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md ./AGENTS.md +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` + +**Windows PowerShell:** + +```powershell +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".\AGENTS.md" +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +copy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".\AGENTS.md" +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` + +**Verify Setup:** + +1. In Cline's chat interface, look for the Rules popover under the chat input field +2. 
Verify that `core-workflow.md` is listed and active +3. You can toggle the rule file on/off as needed + +![AI-DLC Rules in Cline](./assets/images/cline-ide-aidlc-rules-loaded.png?raw=true "AI-DLC Rules in Cline") + +**Directory Structure (Option 1):** + +```text +/ +├── .clinerules/ +│ └── core-workflow.md +└── .aidlc-rule-details/ + ├── common/ + ├── inception/ + ├── construction/ + ├── extensions/ + └── operations/ +``` + +--- + +### Claude Code + +AI-DLC uses Claude Code's project memory file (`CLAUDE.md`) to implement its intelligent workflow. + +The commands below assume you extracted the zip to your `Downloads` folder. If you used a different location, replace `Downloads` with your actual folder path. + +#### Option 1: Project Root (Recommended) + +**Unix/Linux/macOS:** + +```bash +cp ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md ./CLAUDE.md +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` -1. Start any software development project by stating your intent in the chat (Amazon Q IDE Extension or in Q CLI). AI-DLC automatically activates and guides you from there. -2. Answer structured questions that AI-DLC asks you -3. Carefully review every plan that AI generates. Provide your oversight and validation. -4. Review the execution plan to see which stages will run -5. Carefully review the artifacts and approve each stage to maintain control -6. 
All the artifacts will be generated in the `aidlc-docs/` directory +**Windows PowerShell:** + +```powershell +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".\CLAUDE.md" +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +copy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".\CLAUDE.md" +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` + +#### Option 2: .claude Directory + +**Unix/Linux/macOS:** + +```bash +mkdir -p .claude +cp ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md .claude/CLAUDE.md +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` + +**Windows PowerShell:** + +```powershell +New-Item -ItemType Directory -Force -Path ".claude" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".claude\CLAUDE.md" +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +mkdir .claude +copy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".claude\CLAUDE.md" +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` + +**Verify Setup:** + +1. Start Claude Code in your project directory (CLI: `claude` or VS Code extension) +2. Use the `/config` command to view current configuration +3. Ask Claude: "What instructions are currently active in this project?" 
+ +**Directory Structure (Option 1):** + +```text +/ +├── CLAUDE.md +└── .aidlc-rule-details/ + ├── common/ + ├── inception/ + ├── construction/ + ├── extensions/ + └── operations/ +``` + +--- + +### GitHub Copilot + +AI-DLC uses [GitHub Copilot custom instructions](https://code.visualstudio.com/docs/copilot/customization/custom-instructions) to implement its intelligent workflow. The `.github/copilot-instructions.md` file is automatically detected and applied to all chat requests in the workspace. + +The commands below assume you extracted the zip to your `Downloads` folder. If you used a different location, replace `Downloads` with your actual folder path. + +**Unix/Linux/macOS:** + +```bash +mkdir -p .github +cp ~/Downloads/aidlc-rules/aws-aidlc-rules/core-workflow.md .github/copilot-instructions.md +mkdir -p .aidlc-rule-details +cp -R ~/Downloads/aidlc-rules/aws-aidlc-rule-details/* .aidlc-rule-details/ +``` + +**Windows PowerShell:** + +```powershell +New-Item -ItemType Directory -Force -Path ".github" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".github\copilot-instructions.md" +New-Item -ItemType Directory -Force -Path ".aidlc-rule-details" +Copy-Item "$env:USERPROFILE\Downloads\aidlc-rules\aws-aidlc-rule-details\*" ".aidlc-rule-details\" -Recurse +``` + +**Windows CMD:** + +```cmd +mkdir .github +copy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rules\core-workflow.md" ".github\copilot-instructions.md" +mkdir .aidlc-rule-details +xcopy "%USERPROFILE%\Downloads\aidlc-rules\aws-aidlc-rule-details" ".aidlc-rule-details\" /E /I +``` + +**Verify Setup:** + +1. Open VS Code with your project folder +2. Open the Copilot Chat panel (Cmd/Ctrl+Shift+I) +3. Select **Configure Chat** (gear icon) > **Chat Instructions** and verify that `copilot-instructions` is listed +4. 
Alternatively, type `/instructions` in the chat input to view active instructions + +**Directory Structure:** + +```text +/ +├── .github/ +│ └── copilot-instructions.md +└── .aidlc-rule-details/ + ├── common/ + ├── inception/ + ├── construction/ + ├── extensions/ + └── operations/ +``` + +--- + +### Other Agents + +AI-DLC works with any coding agent that supports project-level rules or steering files. The general approach: + +1. Place `aws-aidlc-rules/` wherever your agent reads project rules from (consult your agent's documentation). +2. Place `aws-aidlc-rule-details/` at a sibling level so the rules can reference it. + +If your agent has no convention for rules files, place both folders at your project root and point the agent to `aws-aidlc-rules/` as its rules directory. + +--- + +## Usage + +1. Start any software development project by stating your intent starting with the phrase **"Using AI-DLC, ..."** in the chat +2. AI-DLC workflow automatically activates and guides you from there +3. Answer structured questions that AI-DLC asks you +4. Carefully review every plan that AI generates. Provide your oversight and validation +5. Review the execution plan to see which stages will run +6. Carefully review the artifacts and approve each stage to maintain control +7. 
All the artifacts will be generated in the `aidlc-docs/` directory + +--- ## Three-Phase Adaptive Workflow AI-DLC follows a structured three-phase approach that adapts to your project's complexity: -- **🔵 INCEPTION PHASE**: Determines **WHAT** to build and **WHY** - - Requirements analysis and validation - - User story creation (when applicable) - - Application Design and creating units of work for parallel development - - Risk assessment and complexity evaluation +### 🔵 INCEPTION PHASE + +Determines **WHAT** to build and **WHY** + +- Requirements analysis and validation +- User story creation (when applicable) +- Application Design and creating units of work for parallel development +- Risk assessment and complexity evaluation + +### 🟢 CONSTRUCTION PHASE -- **🟢 CONSTRUCTION PHASE**: Determines **HOW** to build it - - Detailed component design - - Code generation and implementation - - Build configuration and testing strategies - - Quality assurance and validation +Determines **HOW** to build it -- **🟡 OPERATIONS PHASE**: Deployment and monitoring (future) - - Deployment automation and infrastructure - - Monitoring and observability setup - - Production readiness validation +- Detailed component design +- Code generation and implementation +- Build configuration and testing strategies +- Quality assurance and validation + +### 🟡 OPERATIONS PHASE + +Deployment and monitoring (future) + +- Deployment automation and infrastructure +- Monitoring and observability setup +- Production readiness validation + +--- ## Key Features -- **Adaptive Intelligence**: Only executes stages that add value to your specific request -- **Context-Aware**: Analyzes existing codebase and complexity requirements -- **Risk-Based**: Complex changes get comprehensive treatment, simple changes stay efficient -- **Question-Driven**: Structured multiple-choice questions in files, not chat -- **Always in Control**: Review execution plans and approve each phase +| Feature | Description | +| 
------------------------- | --------------------------------------------------------------------------------------------------------- | +| **Adaptive Intelligence** | Only executes stages that add value to your specific request | +| **Context-Aware** | Analyzes existing codebase and complexity requirements | +| **Risk-Based** | Complex changes get comprehensive treatment, simple changes stay efficient | +| **Question-Driven** | Structured multiple-choice questions in files, not chat | +| **Always in Control** | Review execution plans and approve each phase | +| **Extensible** | Layer custom rules e.g. security, compliance, and organization-specific rules on top of the core workflow | + +--- + +## Extensions + +AI-DLC supports an extension system that lets you layer additional rules on top of the core workflow. Extensions are markdown files organized under `aws-aidlc-rule-details/extensions/` and grouped by category (e.g., `security/`, `testing/`). + +### How Extensions Work + +Each extension consists of two files placed in the same directory: + +- A **rules file** (e.g., `security-baseline.md`) containing the extension's rules. +- An **opt-in file** (e.g., `security-baseline.opt-in.md`) containing a structured multiple-choice question presented to the user during Requirements Analysis. + +At workflow start, AI-DLC scans the `extensions/` directory and loads only `*.opt-in.md` files. During Requirements Analysis, it presents each opt-in prompt to the user. When the user opts in, the corresponding rules file is loaded (derived by naming convention: strip `.opt-in.md`, append `.md`). When the user opts out, the rules file is never loaded. Extensions without a matching `*.opt-in.md` file are always enforced. + +Once enabled, extension rules are blocking constraints — at each stage, the model verifies compliance before allowing the stage to proceed. 
+ +### Built-in Extensions + +The `extensions/` directory ships with the following (new extensions may be added over time): + +```text +aws-aidlc-rule-details/ +└── extensions/ + ├── security/ # Extension category + │ └── baseline/ + │ ├── security-baseline.md # Baseline security rules + │ └── security-baseline.opt-in.md # Opt-in prompt + └── testing/ # Extension category + └── property-based/ + ├── property-based-testing.md # Property-based testing rules + └── property-based-testing.opt-in.md # Opt-in prompt +``` + +> [!IMPORTANT] +> The security extension rules are provided as a directional reference for building effective security rules within AI-DLC workflows. Each organization should build, customize, and thoroughly test their own security rules before deploying in production workflows. + +### Adding Your Own Extensions + +You can extend an existing category or create an entirely new one. + +1. Create a directory under `extensions/` (e.g., `security/compliance/` or `performance/baseline/`). +2. Add a **rules file** (e.g., `compliance.md`). Follow the same structure as `security-baseline.md`: + - Define each rule as a heading in the format `## Rule <PREFIX>-<NN>: <Title>` where `<PREFIX>` is a short category identifier and `<NN>` is a sequential number (e.g., `COMPLIANCE-01`, `COMPLIANCE-02`). These IDs are referenced in audit logs and compliance summaries, so they must be unique across all loaded extensions. + - Include a **Rule** section describing the requirement. + - Include a **Verification** section with concrete checks the model should evaluate. +3. Add a matching **opt-in file** using the naming convention `<name>.opt-in.md` (e.g., `compliance.opt-in.md`). See `security-baseline.opt-in.md` for the expected format. Omitting this file means the extension is always enforced with no user opt-out. +4. Rules are blocking by default — if verification criteria are not met, the stage cannot proceed until the finding is resolved.
+ +--- + +## Tenets + +These are our core principles to guide our decision making. + +- **No duplication**. The source of truth lives in one place. If we add support for new tools or formats that require specific files, we generate them from the source rather than maintaining separate copies. + +- **Methodology first**. AI-DLC is fundamentally a methodology, not a tool. Users shouldn't need to install anything to get started. That said, we're open to convenience tooling (scripts, CLIs) down the road if it helps users adopt or extend the methodology. + +- **Reproducible**. Rules should be clear enough that different models produce similar outcomes. We know models behave differently, but the methodology should minimize variance through explicit guidance. + +- **Agnostic**. The methodology works with any IDE, agent, or model. We don't tie ourselves to specific tools or vendors. + +- **Human in the loop**. Critical decisions require explicit user confirmation. The agent proposes, the human approves. 
+ +--- ## Prerequisites Have one of our supported platforms/tools for Assisted AI Coding installed: -- [Kiro CLI](https://kiro.dev/cli/) -- [Amazon Q Developer IDE plugin](https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/q-in-IDE.html) -- [Kiro IDE](https://kiro.dev/) (coming soon) +| Platform | Installation Link | +| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Kiro | [Install](https://kiro.dev/) | +| Kiro CLI | [Install](https://kiro.dev/cli/) | +| Amazon Q Developer IDE Plugin | [Install](https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/q-in-IDE.html) | +| Cursor IDE | [Install](https://cursor.com/) | +| Cline VS Code Extension | [Install](https://marketplace.visualstudio.com/items?itemName=saoudrizwan.claude-dev) | +| Claude Code CLI | [Install](https://github.com/anthropics/claude-code) | +| GitHub Copilot | [Install](https://marketplace.visualstudio.com/items?itemName=GitHub.copilot) + [Chat](https://marketplace.visualstudio.com/items?itemName=GitHub.copilot-chat) | + +--- + +## Troubleshooting + +### General Issues + +| Problem | Solution | +| ---------------------------- | ----------------------------------------------------------- | +| Rules not loading | Check file exists in the correct location for your platform | +| File encoding issues | Ensure files are UTF-8 encoded | +| Rules not applied in session | Start a new chat session after file changes | +| Rule details not loading | Verify `.aidlc-rule-details/` exists with subdirectories | + +### Platform-Specific Issues + +#### Kiro + +- Use `/context show` in Kiro CLI to verify rules are loaded +- Check `.kiro/steering/` directory structure +- Note: Kiro uses `aws-aidlc-rule-details` (not `.aidlc-rule-details/`) under the `.kiro/` directory + +#### Amazon Q Developer + +- Check `.amazonq/rules/` directory structure +- Verify rules are 
listed in the Amazon Q Chat Rules panel +- Note: Amazon Q uses `aws-aidlc-rule-details` (not `.aidlc-rule-details/`) under the `.amazonq/` directory + +#### Cursor + +- For "Apply Intelligently", ensure a description is defined in frontmatter +- Check **Cursor Settings → Rules** to ensure the rule is enabled +- If rule is too large (>500 lines), split into multiple focused rules + +#### Cline + +- Check the Rules popover under the chat input field +- Toggle rule files on/off as needed using the popover UI + +#### Claude Code + +- Use `/config` command to view current configuration +- Ask "What instructions are currently active in this project?" + +#### GitHub Copilot + +- Select **Configure Chat** (gear icon) > **Chat Instructions** to verify instructions are loaded +- Type `/instructions` in the chat input to view active instruction files +- Check that `.github/copilot-instructions.md` exists in your workspace root + +### File Path Issues on Windows + +- Use forward slashes `/` in file paths within markdown files +- Windows paths with backslashes may not work correctly + +--- + +## Version Control Recommendations + +**Commit to repository:** + +```gitignore +# These should be version controlled +CLAUDE.md +AGENTS.md +.amazonq/rules/ +.amazonq/aws-aidlc-rule-details/ +.kiro/steering/ +.kiro/aws-aidlc-rule-details/ +.cursor/rules/ +.clinerules/ +.github/copilot-instructions.md +.aidlc-rule-details/ +``` + +**Optional - Add to `.gitignore` (if needed):** + +```gitignore +# Local-only settings +.claude/settings.local.json +``` + +--- + +## Generated aidlc-docs/ Reference + +For the complete reference of all documentation artifacts generated by the AI-DLC workflow, see [docs/GENERATED_DOCS_REFERENCE.md](docs/GENERATED_DOCS_REFERENCE.md). 
+ +--- + +## Additional Resources + +<!-- TODO: Replace this Amplify URL with a permanent/stable URL when available --> +| Resource | Link | +| --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| AI-DLC Method Definition Paper | [Paper](https://prod.d13rzhkk8cj2z0.amplifyapp.com/) | +| AI-DLC Methodology Blog | [AWS Blog](https://aws.amazon.com/blogs/devops/ai-driven-development-life-cycle/) | +| AI-DLC Open-source Launch Blog | [AWS Blog](https://aws.amazon.com/blogs/devops/open-sourcing-adaptive-workflows-for-ai-driven-development-life-cycle-ai-dlc/) | +| AI-DLC Example Walkthrough Blog | [AWS Blog](https://aws.amazon.com/blogs/devops/building-with-ai-dlc-using-amazon-q-developer/) | +| Amazon Q Developer Documentation | [Docs](https://docs.aws.amazon.com/amazonq/latest/qdeveloper-ug/q-in-IDE.html) | +| Kiro CLI Documentation | [Docs](https://kiro.dev/docs/cli/steering/) | +| Cursor Rules Documentation | [Docs](https://cursor.com/docs/context/rules) | +| Claude Code Documentation | [GitHub](https://github.com/anthropics/claude-code) | +| GitHub Copilot Documentation | [Docs](https://docs.github.com/en/copilot) | +| Working with AI-DLC (interaction patterns and tips) | [docs/WORKING-WITH-AIDLC.md](docs/WORKING-WITH-AIDLC.md) | +| Contributing Guidelines | [CONTRIBUTING.md](CONTRIBUTING.md) | +| Code of Conduct | [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md) | + +--- ## Security @@ -102,4 +756,4 @@ See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more inform ## License -This library is licensed under the MIT-0 License. See the LICENSE file. +This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE) file. 
diff --git a/aidlc-rules/.markdownlint-cli2.yaml b/aidlc-rules/.markdownlint-cli2.yaml new file mode 100644 index 00000000..9de9ed38 --- /dev/null +++ b/aidlc-rules/.markdownlint-cli2.yaml @@ -0,0 +1,30 @@ +# markdownlint overrides for aidlc-rules/ (LLM prompt files) +# These rules are temporarily disabled pending incremental remediation. +# Each rule is re-enabled in a separate PR after fixing its violations. +# See PR #159 for the full remediation plan ordered by LLM prompt impact. + +config: + # --- Tier 1: Zero prompt impact (invisible whitespace) --- + MD009: false # no-trailing-spaces (9 violations) + MD012: false # no-multiple-blanks (2 violations) + MD047: false # single-trailing-newline (4 violations) + + # --- Tier 2: Zero prompt impact (formatting normalization) --- + MD029: false # ol-prefix (1 violation) + MD026: false # no-trailing-punctuation (17 violations) + + # --- Tier 3: Zero prompt impact (blank line structure) --- + MD022: false # blanks-around-headings (274 violations) + MD032: false # blanks-around-lists (426 violations) + MD031: false # blanks-around-fences (31 violations) + + # --- Tier 4: Minimal prompt impact (table/URL formatting) --- + MD060: false # table-column-style (34 violations) + + # --- Tier 5: Low prompt impact (structural cosmetic) --- + MD046: false # code-block-style (8 violations) + MD007: false # ul-indent (34 violations) + + # --- Tier 6: Needs review (adds content or changes structure) --- + MD040: false # fenced-code-language (18 violations) + MD025: false # single-title (13 violations) diff --git a/aidlc-rules/VERSION b/aidlc-rules/VERSION new file mode 100644 index 00000000..11808190 --- /dev/null +++ b/aidlc-rules/VERSION @@ -0,0 +1 @@ +0.1.7 diff --git a/aidlc-rules/aws-aidlc-rule-details/common/ascii-diagram-standards.md b/aidlc-rules/aws-aidlc-rule-details/common/ascii-diagram-standards.md new file mode 100644 index 00000000..498ab223 --- /dev/null +++ 
b/aidlc-rules/aws-aidlc-rule-details/common/ascii-diagram-standards.md @@ -0,0 +1,116 @@ +# ASCII Diagram Standards + +## MANDATORY: Use Basic ASCII Only + +**CRITICAL**: ALWAYS use basic ASCII characters for diagrams (maximum compatibility). + +### ✅ ALLOWED: `+` `-` `|` `^` `v` `<` `>` and alphanumeric text + +### ❌ FORBIDDEN: Unicode box-drawing characters +- NO: `┌` `─` `│` `└` `┐` `┘` `├` `┤` `┬` `┴` `┼` `▼` `▲` `►` `◄` +- Reason: Inconsistent rendering across fonts/platforms + +## Standard ASCII Diagram Patterns + +### CRITICAL: Character Width Rule +**Every line in a box MUST have EXACTLY the same character count (including spaces)** + +✅ CORRECT (all lines = 67 chars): +``` ++---------------------------------------------------------------+ +| Component Name | +| Description text here | ++---------------------------------------------------------------+ +``` + +❌ WRONG (inconsistent widths): +``` ++---------------------------------------------------------------+ +| Component Name | +| Description text here | ++---------------------------------------------------------------+ +``` + +### Box Pattern +``` ++-----------------------------------------------------+ +| | +| Calculator Application | +| | +| Provides basic arithmetic operations for users | +| through a web-based interface | +| | ++-----------------------------------------------------+ +``` + +### Nested Boxes +``` ++-------------------------------------------------------+ +| Web Server (PHP Runtime) | +| +-------------------------------------------------+ | +| | index.php (Monolithic Application) | | +| | +-------------------------------------------+ | | +| | | HTML Template (View Layer) | | | +| | | - Form rendering | | | +| | | - Result display | | | +| | +-------------------------------------------+ | | +| +-------------------------------------------------+ | ++-------------------------------------------------------+ +``` + +### Arrows and Connections +``` ++----------+ +| Source | ++----------+ + | 
+ | HTTP POST + v ++----------+ +| Target | ++----------+ +``` + +### Horizontal Flow +``` ++-------+ +-------+ +-------+ +| Step1 | --> | Step2 | --> | Step3 | ++-------+ +-------+ +-------+ +``` + +### Vertical Flow with Labels +``` +User Action Flow: + | + v ++----------+ +| Input | ++----------+ + | + | validates + v ++----------+ +| Process | ++----------+ + | + | returns + v ++----------+ +| Output | ++----------+ +``` + +## Validation + +Before creating diagrams: +- [ ] Basic ASCII only: `+` `-` `|` `^` `v` `<` `>` +- [ ] No Unicode box-drawing +- [ ] Spaces (not tabs) for alignment +- [ ] Corners use `+` +- [ ] **ALL box lines same character width** (count characters including spaces) +- [ ] Test: Verify corners align vertically in monospace font + +## Alternative + +For complex diagrams, use Mermaid (see `content-validation.md`) + diff --git a/aidlc-rules/aws-aidlc-rule-details/common/content-validation.md b/aidlc-rules/aws-aidlc-rule-details/common/content-validation.md index 8a6b4633..f80c3042 100644 --- a/aidlc-rules/aws-aidlc-rule-details/common/content-validation.md +++ b/aidlc-rules/aws-aidlc-rule-details/common/content-validation.md @@ -4,6 +4,20 @@ **CRITICAL**: All generated content MUST be validated before writing to files to prevent parsing errors. +## ASCII Diagram Standards + +**CRITICAL**: Before creating ANY file with ASCII diagrams: + +1. **LOAD** `common/ascii-diagram-standards.md` +2. **VALIDATE** each diagram: + - Count characters per line (all lines MUST be same width) + - Use ONLY: `+` `-` `|` `^` `v` `<` `>` and spaces + - NO Unicode box-drawing characters + - Spaces only (NO tabs) +3. **TEST** alignment by verifying box corners align vertically + +**See `common/ascii-diagram-standards.md` for patterns and validation checklist.** + ## Mermaid Diagram Validation ### Required Validation Steps @@ -61,4 +75,4 @@ Phase 1: INCEPTION 1. **Log the error**: Record what failed validation 2. 
**Use fallback content**: Switch to text-based alternative 3. **Continue workflow**: Don't block on content validation failures -4. **Inform user**: Mention simplified content was used due to parsing constraints \ No newline at end of file +4. **Inform user**: Mention simplified content was used due to parsing constraints diff --git a/aidlc-rules/aws-aidlc-rule-details/common/error-handling.md b/aidlc-rules/aws-aidlc-rule-details/common/error-handling.md index 18d18903..5b7ac1e5 100644 --- a/aidlc-rules/aws-aidlc-rule-details/common/error-handling.md +++ b/aidlc-rules/aws-aidlc-rule-details/common/error-handling.md @@ -16,12 +16,12 @@ - Invalid user input that cannot be processed - System errors preventing file operations -**High**: Phase cannot complete as planned +**High**: Stage cannot complete as planned - Incomplete answers to required questions - Contradictory user responses -- Missing dependencies from prior phases +- Missing dependencies from prior stages -**Medium**: Phase can continue with workarounds +**Medium**: Stage can continue with workarounds - Optional artifacts missing - Non-critical validation failures - Partial completion possible @@ -31,9 +31,9 @@ - Optional information missing - Non-blocking warnings -## Phase-Specific Error Handling +## Stage-Specific Error Handling -### Context Assessment Errors +### Workspace Detection Errors **Error**: Cannot read workspace files - **Cause**: Permission issues, missing directories @@ -45,12 +45,12 @@ - **Solution**: Ask user if they want to start fresh or attempt recovery - **Recovery**: Create backup, start new state file -**Error**: Cannot determine required phases +**Error**: Cannot determine required stages - **Cause**: Insufficient information from user - **Solution**: Ask clarifying questions about intent and scope - **Workaround**: Default to comprehensive execution plan -### Requirements Assessment Errors +### Requirements Analysis Errors **Error**: User provides contradictory requirements - 
**Cause**: Unclear understanding, changing needs @@ -67,11 +67,11 @@ - **Solution**: Highlight unanswered questions, provide examples - **Do Not Proceed**: Until all required questions are answered -### Story Development Errors +### User Stories Errors **Error**: Cannot map requirements to stories - **Cause**: Requirements too vague, missing functional details -- **Solution**: Return to Requirements Assessment for clarification +- **Solution**: Return to Requirements Analysis for clarification - **Workaround**: Create stories based on available information, mark as incomplete **Error**: User provides ambiguous story planning answers @@ -130,11 +130,11 @@ - **Solution**: Clearly mark as **HUMAN TASK**, provide instructions - **Wait**: For user confirmation before proceeding -### Code Planning Errors +### Code Generation Planning Errors **Error**: Code generation plan is incomplete - **Cause**: Missing design artifacts, unclear requirements -- **Solution**: Return to Design phase to complete artifacts +- **Solution**: Return to Design stage to complete artifacts - **Recovery**: Generate plan with available information, mark gaps **Error**: Unit dependencies not satisfied @@ -142,7 +142,7 @@ - **Solution**: Reorder generation sequence to respect dependencies - **Workaround**: Generate with stub dependencies, integrate later -### Code Generation Errors +### Code Generation Errors (Part 2: Code Generation) **Error**: Cannot generate code for a step - **Cause**: Insufficient design information, unclear requirements @@ -173,12 +173,12 @@ ## Recovery Procedures -### Partial Phase Completion +### Partial Stage Completion -**Scenario**: Phase was interrupted mid-execution +**Scenario**: Stage was interrupted mid-execution **Recovery Steps**: -1. Load the phase plan file +1. Load the stage plan file 2. Identify last completed step (last [x] checkbox) 3. Resume from next uncompleted step 4. 
Verify all prior steps are actually complete @@ -190,43 +190,43 @@ **Recovery Steps**: 1. Create backup: `aidlc-state.md.backup` -2. Ask user which phase they're actually on +2. Ask user which stage they're actually on 3. Regenerate state file from scratch -4. Mark completed phases based on existing artifacts -5. Resume from current phase +4. Mark completed stages based on existing artifacts +5. Resume from current stage ### Missing Artifacts -**Scenario**: Required artifacts from prior phase are missing +**Scenario**: Required artifacts from prior stage are missing **Recovery Steps**: 1. Identify which artifacts are missing 2. Determine if they can be regenerated -3. If yes: Return to that phase, regenerate artifacts +3. If yes: Return to that stage, regenerate artifacts 4. If no: Ask user to provide information manually 5. Document the gap in `audit.md` -### User Wants to Restart Phase +### User Wants to Restart Stage -**Scenario**: User is unhappy with phase results and wants to redo +**Scenario**: User is unhappy with stage results and wants to redo **Recovery Steps**: 1. Confirm user wants to restart (data will be lost) 2. Archive existing artifacts: `{artifact}.backup` -3. Reset phase status in `aidlc-state.md` -4. Clear phase checkboxes in plan files -5. Re-execute phase from beginning +3. Reset stage status in `aidlc-state.md` +4. Clear stage checkboxes in plan files +5. Re-execute stage from beginning -### User Wants to Skip Phase +### User Wants to Skip Stage -**Scenario**: User wants to skip a phase that was planned +**Scenario**: User wants to skip a stage that was planned **Recovery Steps**: 1. Confirm user understands implications 2. Document skip reason in `audit.md` -3. Mark phase as "SKIPPED" in `aidlc-state.md` -4. Proceed to next phase -5. Note: May cause issues in later phases if dependencies missing +3. Mark stage as "SKIPPED" in `aidlc-state.md` +4. Proceed to next stage +5. 
Note: May cause issues in later stages if dependencies missing ## Escalation Guidelines @@ -247,11 +247,20 @@ ### When to Suggest Starting Over **Consider Fresh Start If**: -- Multiple phases have errors +- Multiple stages have errors - State file is severely corrupted +- User requirements have changed significantly +- Architectural decision needs to be reversed - User cannot provide missing information - Artifacts are inconsistent across phases +**Before Starting Over**: +1. Archive all existing work +2. Document lessons learned +3. Identify what to preserve +4. Get user confirmation +5. Create new execution plan + ## Session Resumption Errors ### Missing Artifacts During Resumption @@ -324,23 +333,14 @@ 3. **Fail fast**: Stop immediately if critical artifacts are missing 4. **Communicate clearly**: Tell user exactly what's missing and why it's needed 5. **Offer options**: Regenerate, provide manually, or start fresh -6. **Document recovery**: Log all recovery actions in audit.md State file is severely corrupted -- User requirements have changed significantly -- Architectural decision needs to be reversed - -**Before Starting Over**: -1. Archive all existing work -2. Document lessons learned -3. Identify what to preserve -4. Get user confirmation -5. Create new execution plan +6. 
**Document recovery**: Log all recovery actions in audit.md ## Logging Requirements ### Error Logging Format ```markdown -## Error - [Phase Name] +## Error - [Stage Name] **Timestamp**: [ISO timestamp] **Error Type**: [Critical/High/Medium/Low] **Description**: [What went wrong] @@ -354,7 +354,7 @@ ### Recovery Logging Format ```markdown -## Recovery - [Phase Name] +## Recovery - [Stage Name] **Timestamp**: [ISO timestamp] **Issue**: [What needed recovery] **Recovery Steps**: [What was done] diff --git a/aidlc-rules/aws-aidlc-rule-details/common/process-overview.md b/aidlc-rules/aws-aidlc-rule-details/common/process-overview.md index be3f5921..ff4e332c 100644 --- a/aidlc-rules/aws-aidlc-rule-details/common/process-overview.md +++ b/aidlc-rules/aws-aidlc-rule-details/common/process-overview.md @@ -2,14 +2,14 @@ **Purpose**: Technical reference for AI model and developers to understand complete workflow structure. -**Note**: Similar content exists in core-workflow.md (user welcome message) and README.md (documentation). This duplication is INTENTIONAL - each file serves a different purpose: +**Note**: Similar content exists in welcome-message.md (user welcome message) and README.md (documentation). 
This duplication is INTENTIONAL - each file serves a different purpose: - **This file**: Detailed technical reference with Mermaid diagram for AI model context loading -- **core-workflow.md**: User-facing welcome message with ASCII diagram +- **welcome-message.md**: User-facing welcome message with ASCII diagram - **README.md**: Human-readable documentation for repository ## The Three-Phase Lifecycle: • **INCEPTION PHASE**: Planning and architecture (Workspace Detection + conditional phases + Workflow Planning) -• **CONSTRUCTION PHASE**: Design, implementation, build and test (per-unit design + Code Planning/Generation + Build & Test) +• **CONSTRUCTION PHASE**: Design, implementation, build and test (per-unit design + Code Generation + Build & Test) • **OPERATIONS PHASE**: Placeholder for future deployment and monitoring workflows ## The Adaptive Workflow: @@ -84,27 +84,29 @@ flowchart TD BT -.-> OPS BT --> End(["Complete"]) - style WD fill:#90EE90,stroke:#2d5016,stroke-width:3px - style RA fill:#90EE90,stroke:#2d5016,stroke-width:3px - style WP fill:#90EE90,stroke:#2d5016,stroke-width:3px + style WD fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff + style RA fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff + style WP fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff - style CG fill:#90EE90,stroke:#2d5016,stroke-width:3px - style BT fill:#90EE90,stroke:#2d5016,stroke-width:3px - style OPS fill:#f0f0f0,stroke:#999999,stroke-width:2px,stroke-dasharray: 5 5 - style RE fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style Stories fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style AppDesign fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 + style CG fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff + style BT fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff + style OPS fill:#BDBDBD,stroke:#424242,stroke-width:2px,stroke-dasharray: 5 5,color:#000 + style RE 
fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style Stories fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style AppDesign fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 - style UnitsG fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style FD fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style NFRA fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style NFRD fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style ID fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5 - style INCEPTION fill:#E3F2FD,stroke:#1976D2,stroke-width:3px - style CONSTRUCTION fill:#E8F5E9,stroke:#388E3C,stroke-width:3px - style OPERATIONS fill:#FFF9C4,stroke:#F57C00,stroke-width:3px - style Start fill:#E6E6FA,stroke:#4B0082,stroke-width:2px - style End fill:#E6E6FA,stroke:#4B0082,stroke-width:2px + style UnitsG fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style FD fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style NFRA fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style NFRD fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style ID fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000 + style INCEPTION fill:#BBDEFB,stroke:#1565C0,stroke-width:3px,color:#000 + style CONSTRUCTION fill:#C8E6C9,stroke:#2E7D32,stroke-width:3px,color:#000 + style OPERATIONS fill:#FFF59D,stroke:#F57F17,stroke-width:3px,color:#000 + style Start fill:#CE93D8,stroke:#6A1B9A,stroke-width:3px,color:#000 + style End fill:#CE93D8,stroke:#6A1B9A,stroke-width:3px,color:#000 + + linkStyle default stroke:#333,stroke-width:2px ``` **Stage Descriptions:** diff --git a/aidlc-rules/aws-aidlc-rule-details/common/terminology.md
b/aidlc-rules/aws-aidlc-rule-details/common/terminology.md index 5193a751..cbcb4140 100644 --- a/aidlc-rules/aws-aidlc-rule-details/common/terminology.md +++ b/aidlc-rules/aws-aidlc-rule-details/common/terminology.md @@ -10,13 +10,13 @@ - 🟡 **OPERATIONS PHASE** - Deployment & Monitoring (future expansion) **Stage**: An individual workflow activity within a phase -- Examples: Context Assessment stage, Requirements Assessment stage, Code Planning stage +- Examples: Context Assessment stage, Requirements Assessment stage, Code Generation stage - Each stage has specific prerequisites, steps, and outputs - Stages can be ALWAYS-EXECUTE or CONDITIONAL **Usage Examples**: - ✅ "The CONSTRUCTION phase contains 7 stages" -- ✅ "The Code Planning stage is always executed" +- ✅ "The Code Generation stage is always executed" - ✅ "We're in the INCEPTION phase, executing the Requirements Assessment stage" - ❌ "The Requirements Assessment phase" (should be "stage") - ❌ "The CONSTRUCTION stage" (should be "phase") @@ -49,8 +49,7 @@ - NFR Requirements (CONDITIONAL, per-unit) - NFR Design (CONDITIONAL, per-unit) - Infrastructure Design (CONDITIONAL, per-unit) -- Code Planning (ALWAYS) -- Code Generation (ALWAYS) +- Code Generation (ALWAYS) — includes Part 1: Planning and Part 2: Generation - Build and Test (ALWAYS) **Outputs**: Design artifacts, NFR implementations, code, tests @@ -73,8 +72,7 @@ - **Workspace Detection**: Initial analysis of workspace state and project type - **Requirements Analysis**: Gathering requirements (depth varies based on complexity) - **Workflow Planning**: Creating execution plan for which phases to run -- **Code Planning**: Creating detailed implementation plans for code generation -- **Code Generation**: Generating actual code based on plans and prior artifacts +- **Code Generation**: Single stage with two parts — Part 1 (Planning) creates detailed implementation plans, Part 2 (Generation) generates actual code based on plans and prior artifacts - **Build 
and Test**: Building all units and executing comprehensive testing ### Conditional Stages @@ -122,7 +120,7 @@ A reusable building block within a service or module. Components are classes, fu ### When to Use Each Term **Unit of Work**: -- During Units Planning and Units Generation phases +- During Units Planning and Units Generation stages - When discussing system decomposition - In planning documents and discussions - Example: "How should we decompose this into units of work?" @@ -156,7 +154,7 @@ Examples: - Units Planning → Units Generation - Unit Design Planning → Unit Design Generation - NFR Planning → NFR Generation -- Code Planning → Code Generation +- Code Generation Part 1 (Planning) → Code Generation Part 2 (Generation) ### Depth Levels - **Minimal**: Quick, focused execution for simple changes diff --git a/aidlc-rules/aws-aidlc-rule-details/common/welcome-message.md b/aidlc-rules/aws-aidlc-rule-details/common/welcome-message.md new file mode 100644 index 00000000..c8ef7a73 --- /dev/null +++ b/aidlc-rules/aws-aidlc-rule-details/common/welcome-message.md @@ -0,0 +1,109 @@ +# AI-DLC Welcome Message + +**Purpose**: This file contains the user-facing welcome message that should be displayed ONCE at the start of any AI-DLC workflow. + +--- + +# 👋 Welcome to AI-DLC (AI-Driven Development Life Cycle)! 👋 + +I'll guide you through an adaptive software development workflow that intelligently tailors itself to your specific needs. + +## What is AI-DLC? + +AI-DLC is a structured yet flexible software development process that adapts to your project's needs. 
Think of it as having an experienced software architect who: + +- **Analyzes your requirements** and asks clarifying questions when needed +- **Plans the optimal approach** based on complexity and risk +- **Skips unnecessary steps** for simple changes while providing comprehensive coverage for complex projects +- **Documents everything** so you have a complete record of decisions and rationale +- **Guides you through each phase** with clear checkpoints and approval gates + +## The Three-Phase Lifecycle + +``` + User Request + | + v + +---------------------------------------+ + | INCEPTION PHASE | + | Planning & Application Design | + +---------------------------------------+ + | * Workspace Detection (ALWAYS) | + | * Reverse Engineering (COND) | + | * Requirements Analysis (ALWAYS) | + | * User Stories (CONDITIONAL) | + | * Workflow Planning (ALWAYS) | + | * Application Design (CONDITIONAL) | + | * Units Generation (CONDITIONAL) | + +---------------------------------------+ + | + v + +---------------------------------------+ + | CONSTRUCTION PHASE | + | Design, Implementation & Test | + +---------------------------------------+ + | * Per-Unit Loop (for each unit): | + | - Functional Design (COND) | + | - NFR Requirements Assess (COND) | + | - NFR Design (COND) | + | - Infrastructure Design (COND) | + | - Code Generation (ALWAYS) | + | * Build and Test (ALWAYS) | + +---------------------------------------+ + | + v + +---------------------------------------+ + | OPERATIONS PHASE | + | Placeholder for Future | + +---------------------------------------+ + | * Operations (PLACEHOLDER) | + +---------------------------------------+ + | + v + Complete +``` + +### Phase Breakdown: + +**INCEPTION PHASE** - *Planning & Application Design* +- **Purpose**: Determines WHAT to build and WHY +- **Activities**: Understanding requirements, analyzing existing code (if any), planning the approach +- **Output**: Clear requirements, execution plan, decisions on the number of units of 
work for parallel development +- **Your Role**: Answer questions, review plans, approve direction + +**CONSTRUCTION PHASE** - *Detailed Design, Implementation & Test* +- **Purpose**: Determines HOW to build it +- **Activities**: Detailed design (when needed), code generation, comprehensive testing +- **Output**: Working code, tests, build instructions +- **Your Role**: Review designs, approve implementation plans, validate results + +**OPERATIONS PHASE** - *Deployment & Monitoring (Future)* +- **Purpose**: How to DEPLOY and RUN it +- **Status**: Placeholder for future deployment and monitoring workflows +- **Current State**: Build and test activities handled in CONSTRUCTION phase + +## Key Principles: + +- ⚡ **Fully Adaptive**: Each stage independently evaluated based on your needs +- 🎯 **Efficient**: Simple changes execute only essential stages +- 📋 **Comprehensive**: Complex changes get full treatment with all safeguards +- 🔍 **Transparent**: You see and approve the execution plan before work begins +- 📝 **Documented**: Complete audit trail of all decisions and changes +- 🎛️ **User Control**: You can request stages be included or excluded + +## What Happens Next: + +1. **I'll analyze your workspace** to understand if this is a new or existing project +2. **I'll gather requirements** and ask clarifying questions if needed +3. **I'll create an execution plan** showing which stages I propose to run and why +4. **You'll review and approve** the plan (or request changes) +5. **We'll execute the plan** with checkpoints at each major stage +6. **You'll get working code** with complete documentation and tests + +The AI-DLC process adapts to: +- 📋 Your intent clarity and complexity +- 🔍 Existing codebase state +- 🎯 Scope and impact of changes +- ⚡ Risk and quality requirements + +Let's begin! 
diff --git a/aidlc-rules/aws-aidlc-rule-details/common/workflow-changes.md b/aidlc-rules/aws-aidlc-rule-details/common/workflow-changes.md index cc7d33fd..fb62e94c 100644 --- a/aidlc-rules/aws-aidlc-rule-details/common/workflow-changes.md +++ b/aidlc-rules/aws-aidlc-rule-details/common/workflow-changes.md @@ -1,37 +1,37 @@ -# Mid-Workflow Changes and Phase Management +# Mid-Workflow Changes and Stage Management ## Overview -Users may request changes to the execution plan or phase execution during the workflow. This document provides guidance on handling these requests safely and effectively. +Users may request changes to the execution plan or stage execution during the workflow. This document provides guidance on handling these requests safely and effectively. --- ## Types of Mid-Workflow Changes -### 1. Adding a Skipped Phase +### 1. Adding a Skipped Stage -**Scenario**: User wants to add a phase that was originally skipped +**Scenario**: User wants to add a stage that was originally skipped **Example**: "Actually, I want to add user stories even though we skipped that stage" **Handling**: 1. **Confirm Request**: "You want to add User Stories stage. This will create user stories and personas. Confirm?" -2. **Check Dependencies**: Verify all prerequisite phases are complete -3. **Update Execution Plan**: Add phase to `execution-plan.md` with rationale -4. **Update State**: Mark phase as "PENDING" in `aidlc-state.md` -5. **Execute Phase**: Follow normal phase execution process +2. **Check Dependencies**: Verify all prerequisite stages are complete +3. **Update Execution Plan**: Add stage to `execution-plan.md` with rationale +4. **Update State**: Mark stage as "PENDING" in `aidlc-state.md` +5. **Execute Stage**: Follow normal stage execution process 6. 
**Log Change**: Document in `audit.md` with timestamp and reason **Considerations**: -- May need to update later phases that could benefit from new artifacts +- May need to update later stages that could benefit from new artifacts - Existing artifacts may need revision to incorporate new information - Timeline will be extended --- -### 2. Skipping a Planned Phase +### 2. Skipping a Planned Stage -**Scenario**: User wants to skip a phase that was planned to execute +**Scenario**: User wants to skip a stage that was planned to execute **Example**: "Let's skip the NFR Design stage for now" @@ -39,13 +39,13 @@ Users may request changes to the execution plan or phase execution during the wo 1. **Confirm Request**: "You want to skip NFR Design. This means no NFR patterns or logical components will be incorporated. Confirm?" 2. **Warn About Impact**: Explain what will be missing and potential consequences 3. **Get Explicit Confirmation**: User must explicitly confirm understanding of impact -4. **Update Execution Plan**: Mark phase as "SKIPPED" with reason -5. **Update State**: Mark phase as "SKIPPED" in `aidlc-state.md` -6. **Adjust Later Phases**: Note that later phases may need manual setup +4. **Update Execution Plan**: Mark stage as "SKIPPED" with reason +5. **Update State**: Mark stage as "SKIPPED" in `aidlc-state.md` +6. **Adjust Later Stages**: Note that later stages may need manual setup 7. **Log Change**: Document in `audit.md` with timestamp and reason **Considerations**: -- Later phases may fail or require manual intervention +- Later stages may fail or require manual intervention - User accepts responsibility for missing artifacts - Can be added back later if needed @@ -85,7 +85,7 @@ Users may request changes to the execution plan or phase execution during the wo **Handling**: 1. **Assess Impact**: Identify all stages that depend on the stage to be restarted -2. 
**Warn User**: "Restarting Application Design will require redoing: Units Planning, Units Generation, per-unit design (all units), Code Planning, Code Generation. Confirm?" +2. **Warn User**: "Restarting Application Design will require redoing: Units Planning, Units Generation, per-unit design (all units), Code Generation. Confirm?" 3. **Get Explicit Confirmation**: User must understand full impact 4. **If Confirmed**: - Archive all affected artifacts @@ -134,7 +134,7 @@ Users may request changes to the execution plan or phase execution during the wo 2. **Update Checkboxes**: Mark all completed steps with [x] 3. **Update State**: Ensure `aidlc-state.md` reflects current status 4. **Log Pause**: Document pause point in `audit.md` -5. **Provide Resume Instructions**: "When you return, I'll detect your existing project and offer to continue from: [current phase, current step]" +5. **Provide Resume Instructions**: "When you return, I'll detect your existing project and offer to continue from: [current stage, current step]" **On Resume**: 1. **Detect Existing Project**: Check for `aidlc-state.md` @@ -229,21 +229,21 @@ Users may request changes to the execution plan or phase execution during the wo ``` User requests change | - ├─ Is it current phase? - | ├─ Yes: Can modify or restart current phase + ├─ Is it current stage? + | ├─ Yes: Can modify or restart current stage | └─ No: Go to next question | - ├─ Is it a completed phase? - | ├─ Yes: Assess impact on dependent phases + ├─ Is it a completed stage? + | ├─ Yes: Assess impact on dependent stages | | ├─ Low impact: Modify and update dependents - | | └─ High impact: Recommend restart from that phase + | | └─ High impact: Recommend restart from that stage | └─ No: Go to next question | - ├─ Is it adding a skipped phase? + ├─ Is it adding a skipped stage? | ├─ Yes: Check prerequisites, add to plan, execute | └─ No: Go to next question | - ├─ Is it skipping a planned phase? + ├─ Is it skipping a planned stage? 
| ├─ Yes: Warn about impact, get confirmation, skip | └─ No: Go to next question | @@ -259,7 +259,7 @@ User requests change ### Change Request Log Format ```markdown -## Change Request - [Phase Name] +## Change Request - [Stage Name] **Timestamp**: [ISO timestamp] **Request**: [What user wants to change] **Current State**: [Where we are in workflow] diff --git a/aidlc-rules/aws-aidlc-rule-details/construction/build-and-test.md b/aidlc-rules/aws-aidlc-rule-details/construction/build-and-test.md index 85f5951c..92dd9656 100644 --- a/aidlc-rules/aws-aidlc-rule-details/construction/build-and-test.md +++ b/aidlc-rules/aws-aidlc-rule-details/construction/build-and-test.md @@ -306,37 +306,43 @@ Update `aidlc-docs/aidlc-state.md`: ## Step 9: Present Results to User -Present comprehensive message: +Present completion message in this structure: + 1. **Completion Announcement** (mandatory): Always start with this: +```markdown +# 🔨 Build and Test Complete ``` -"🔨 Build and Test Complete! -**Build Status**: [Success/Failed] + 2. **AI Summary** (optional): Provide structured bullet-point summary of build and test results + - Format: "Build and test has completed with the following results:" + - List build status and artifacts + - List test results by category (unit, integration, performance, etc.) + - List generated instruction files + - DO NOT include workflow instructions ("please review", "let me know", "proceed to next phase", "before we proceed") + - Keep factual and content-focused + 3. **Formatted Workflow Message** (mandatory): Always end with this exact format: -**Test Results**: -✅ Unit Tests: [X] passed -✅ Integration Tests: [X] scenarios passed -✅ Performance Tests: [Status] -✅ Additional Tests: [Status] +```markdown +> **📋 <u>**REVIEW REQUIRED:**</u>** +> Please examine the build and test summary at: `aidlc-docs/construction/build-and-test/build-and-test-summary.md` -**Generated Files**: -1. ✅ build-instructions.md -2. ✅ unit-test-instructions.md -3. 
✅ integration-test-instructions.md -4. ✅ performance-test-instructions.md (if applicable) -5. ✅ [additional test files as needed] -6. ✅ build-and-test-summary.md -Review the summary in aidlc-docs/construction/build-and-test/build-and-test-summary.md -**Ready to proceed to Operations stage for deployment planning?"" +> **🚀 <u>**WHAT'S NEXT?**</u>** +> +> **You may:** +> +> 🔧 **Request Changes** - Ask for modifications to the build and test instructions based on your review +> ✅ **Approve & Continue** - Approve build and test results and proceed to **Operations** + +--- ``` --- ## Step 10: Log Interaction -**MANDATORY**: Log the phase completion in `aidlc-docs/audit.md`: +**MANDATORY**: Log the stage completion in `aidlc-docs/audit.md`: ```markdown ## Build and Test Stage diff --git a/aidlc-rules/aws-aidlc-rule-details/construction/code-generation.md b/aidlc-rules/aws-aidlc-rule-details/construction/code-generation.md index 492f21ca..bfb4cdf7 100644 --- a/aidlc-rules/aws-aidlc-rule-details/construction/code-generation.md +++ b/aidlc-rules/aws-aidlc-rule-details/construction/code-generation.md @@ -5,6 +5,8 @@ This stage generates code for each unit of work through two integrated parts: - **Part 1 - Planning**: Create detailed code generation plan with explicit steps - **Part 2 - Generation**: Execute approved plan to generate code, tests, and artifacts +**Note**: For brownfield projects, "generate" means modify existing files when appropriate, not create duplicates. 
+ ## Prerequisites - Unit Design Generation must be complete for the unit - NFR Implementation (if executed) must be complete for the unit @@ -22,7 +24,12 @@ This stage generates code for each unit of work through two integrated parts: - [ ] Validate unit is ready for code generation ## Step 2: Create Detailed Unit Code Generation Plan +- [ ] Read workspace root and project type from `aidlc-docs/aidlc-state.md` +- [ ] Determine code location (see Critical Rules for structure patterns) +- [ ] **Brownfield only**: Review reverse engineering code-structure.md for existing files to modify +- [ ] Document exact paths (never aidlc-docs/) - [ ] Create explicit steps for unit generation: + - Project Structure Setup (greenfield only) - Business Logic Generation - Business Logic Unit Testing - Business Logic Summary @@ -32,11 +39,14 @@ This stage generates code for each unit of work through two integrated parts: - Repository Layer Generation - Repository Layer Unit Testing - Repository Layer Summary - - Database Migration Scripts Generation (if data models exist) + - Frontend Components Generation (if applicable) + - Frontend Components Unit Testing (if applicable) + - Frontend Components Summary (if applicable) + - Database Migration Scripts (if data models exist) - Documentation Generation (API docs, README updates) - Deployment Artifacts Generation - [ ] Number each step sequentially -- [ ] Include story mapping references for this unit +- [ ] Include story mapping references - [ ] Add checkboxes [ ] for each step ## Step 3: Include Unit Generation Context @@ -77,7 +87,7 @@ This stage generates code for each unit of work through two integrated parts: - [ ] Mark the approval status clearly ## Step 9: Update Progress -- [ ] Mark Code Planning complete in `aidlc-state.md` +- [ ] Mark Code Generation Part 1 (Planning) complete in `aidlc-state.md` - [ ] Update the "Current Status" section - [ ] Prepare for transition to Code Generation @@ -91,15 +101,23 @@ This stage generates 
code for each unit of work through two integrated parts: - [ ] Load the context for that step (unit, dependencies, stories) ## Step 11: Execute Current Step -- [ ] Perform exactly what the current step describes -- [ ] Generate code, tests, or documentation as specified -- [ ] Follow the unit's story requirements -- [ ] Respect dependencies and interfaces defined in the plan +- [ ] Verify target directory from plan (never aidlc-docs/) +- [ ] **Brownfield only**: Check if target file exists +- [ ] Generate exactly what the current step describes: + - **If file exists**: Modify it in-place (never create `ClassName_modified.java`, `ClassName_new.java`, etc.) + - **If file doesn't exist**: Create new file +- [ ] Write to correct locations: + - **Application Code**: Workspace root per project structure + - **Documentation**: `aidlc-docs/construction/{unit-name}/code/` (markdown only) + - **Build/Config Files**: Workspace root +- [ ] Follow unit story requirements +- [ ] Respect dependencies and interfaces ## Step 12: Update Progress - [ ] Mark the completed step as [x] in the unit code generation plan - [ ] Mark associated unit stories as [x] when their generation is finished - [ ] Update `aidlc-docs/aidlc-state.md` current status +- [ ] **Brownfield only**: Verify no duplicate files created (e.g., no `ClassName_modified.java` alongside `ClassName.java`) - [ ] Save all generated artifacts ## Step 13: Continue or Complete Generation @@ -114,18 +132,18 @@ This stage generates code for each unit of work through two integrated parts: # 💻 Code Generation Complete - [unit-name] ``` - 2. 
**AI Summary** (optional): Provide structured bullet-point summary of code generation - - Format: "Code generation has created [description]:" - - List key code artifacts generated (bullet points) - - List test coverage and documentation created - - Mention deployment artifacts and configuration files - - DO NOT include workflow instructions ("please review", "let me know", "proceed to next phase", "before we proceed") - - Keep factual and content-focused + 2. **AI Summary** (optional): Provide structured bullet-point summary + - **Brownfield**: Distinguish modified vs created files (e.g., "• Modified: `src/services/user-service.ts`", "• Created: `src/services/auth-service.ts`") + - **Greenfield**: List created files with paths (e.g., "• Created: `src/services/user-service.ts`") + - List tests, documentation, deployment artifacts with paths + - Keep factual, no workflow instructions 3. **Formatted Workflow Message** (mandatory): Always end with this exact format: ```markdown > **📋 <u>**REVIEW REQUIRED:**</u>** -> Please examine the generated code at: `aidlc-docs/construction/[unit-name]/code/` +> Please examine the generated code at: +> - **Application Code**: `[actual-workspace-path]` +> - **Documentation**: `aidlc-docs/construction/[unit-name]/code/` @@ -153,6 +171,23 @@ This stage generates code for each unit of work through two integrated parts: ## Critical Rules +### Code Location Rules +- **Application code**: Workspace root only (NEVER aidlc-docs/) +- **Documentation**: aidlc-docs/ only (markdown summaries) +- **Read workspace root** from aidlc-state.md before generating code + +**Structure patterns by project type**: +- **Brownfield**: Use existing structure (e.g., `src/main/java/`, `lib/`, `pkg/`) +- **Greenfield single unit**: `src/`, `tests/`, `config/` in workspace root +- **Greenfield multi-unit (microservices)**: `{unit-name}/src/`, `{unit-name}/tests/` +- **Greenfield multi-unit (monolith)**: `src/{unit-name}/`, `tests/{unit-name}/` + +### Brownfield 
File Modification Rules +- Check if file exists before generating +- If exists: Modify in-place (never create copies like `ClassName_modified.java`) +- If doesn't exist: Create new file +- Verify no duplicate files after generation (Step 12) + ### Planning Phase Rules - Create explicit, numbered steps for all generation activities - Include story traceability in the plan @@ -166,6 +201,13 @@ This stage generates code for each unit of work through two integrated parts: - **STORY TRACEABILITY**: Mark unit stories [x] when functionality is implemented - **RESPECT DEPENDENCIES**: Only implement when unit dependencies are satisfied +### Automation-Friendly Code Rules +When generating UI code (web, mobile, desktop), ensure elements are automation-friendly: +- Add `data-testid` attributes to interactive elements (buttons, inputs, links, forms) +- Use consistent naming: `{component}-{element-role}` (e.g., `login-form-submit-button`, `user-list-search-input`) +- Avoid dynamic or auto-generated IDs that change between renders +- Keep `data-testid` values stable across code changes (only change when element purpose changes) + ## Completion Criteria - Complete unit code generation plan created and approved - All steps in unit code generation plan marked [x] diff --git a/aidlc-rules/aws-aidlc-rule-details/construction/functional-design.md b/aidlc-rules/aws-aidlc-rule-details/construction/functional-design.md index 18b997c0..70a6d62d 100644 --- a/aidlc-rules/aws-aidlc-rule-details/construction/functional-design.md +++ b/aidlc-rules/aws-aidlc-rule-details/construction/functional-design.md @@ -50,6 +50,7 @@ Design detailed business logic for the unit, technology-agnostic and focused pur - **Integration Points** - Ask about external system interactions, APIs, and data exchange - **Error Handling** - Ask about error scenarios, validation failures, and exception handling - **Business Scenarios** - Ask about edge cases, alternative flows, and complex business situations +- **Frontend
Components** (if applicable) - Ask about UI component structure, user interactions, state management, and form handling ### Step 4: Store Plan - Save as `aidlc-docs/construction/plans/{unit-name}-functional-design-plan.md` @@ -67,6 +68,12 @@ Design detailed business logic for the unit, technology-agnostic and focused pur - Create `aidlc-docs/construction/{unit-name}/functional-design/business-logic-model.md` - Create `aidlc-docs/construction/{unit-name}/functional-design/business-rules.md` - Create `aidlc-docs/construction/{unit-name}/functional-design/domain-entities.md` +- If unit includes frontend/UI: Create `aidlc-docs/construction/{unit-name}/functional-design/frontend-components.md` + - Component hierarchy and structure + - Props and state definitions for each component + - User interaction flows + - Form validation rules + - API integration points (which backend endpoints each component uses) ### Step 7: Present Completion Message - Present completion message in this structure: diff --git a/aidlc-rules/aws-aidlc-rule-details/construction/infrastructure-design.md b/aidlc-rules/aws-aidlc-rule-details/construction/infrastructure-design.md index 8ff9b274..2cc89b71 100644 --- a/aidlc-rules/aws-aidlc-rule-details/construction/infrastructure-design.md +++ b/aidlc-rules/aws-aidlc-rule-details/construction/infrastructure-design.md @@ -21,20 +21,25 @@ Map logical software components to actual infrastructure choices for deployment - Each step should have a checkbox [] ### Step 3: Generate Context-Appropriate Questions -**DIRECTIVE**: Analyze the functional and NFR design to generate ONLY questions relevant to THIS specific unit's infrastructure needs. Use the categories below as inspiration, NOT as a mandatory checklist. Skip entire categories if not applicable. +**DIRECTIVE**: Thoroughly analyze the functional and NFR design to identify ALL areas where clarification would improve infrastructure decisions. 
Be proactive in asking questions to ensure comprehensive infrastructure coverage. + +**CRITICAL**: Default to asking questions when there is ANY ambiguity or missing detail that could affect infrastructure quality. It's better to ask too many questions than to make incorrect infrastructure assumptions. + +**MANDATORY**: Evaluate ALL of the following categories by asking targeted questions about each. For each category, determine applicability based on evidence from the functional and NFR design artifacts -- do not skip categories without explicit justification: - EMBED questions using [Answer]: tag format -- Focus on ambiguities and missing information specific to this unit -- Generate questions only where user input is needed for infrastructure decisions - -**Example question categories** (adapt as needed): -- **Deployment Environment** - Only if cloud provider or environment setup is unclear -- **Compute Infrastructure** - Only if compute service choice needs clarification -- **Storage Infrastructure** - Only if database or storage selection is ambiguous -- **Messaging Infrastructure** - Only if messaging/queuing services need specification -- **Networking Infrastructure** - Only if load balancing or API gateway approach is unclear -- **Monitoring Infrastructure** - Only if observability tooling needs clarification -- **Shared Infrastructure** - Only if infrastructure sharing strategy is ambiguous +- Focus on ANY ambiguities, missing information, or areas needing clarification +- Generate questions wherever user input would improve infrastructure decisions +- **When in doubt, ask the question** - overconfidence leads to poor infrastructure choices + +**Question categories to evaluate** (consider ALL categories): +- **Deployment Environment** - Ask about cloud provider preferences, environment setup, and deployment targets +- **Compute Infrastructure** - Ask about compute service choices, sizing, and scaling requirements +- **Storage Infrastructure** - Ask about 
database selection, storage patterns, and data lifecycle needs +- **Messaging Infrastructure** - Ask about messaging/queuing services, event-driven patterns, and async processing +- **Networking Infrastructure** - Ask about load balancing, API gateway approach, and network topology +- **Monitoring Infrastructure** - Ask about observability tooling, alerting strategy, and logging requirements +- **Shared Infrastructure** - Ask about infrastructure sharing strategy, multi-tenancy, and resource isolation ### Step 4: Store Plan - Save as `aidlc-docs/construction/plans/{unit-name}-infrastructure-design-plan.md` diff --git a/aidlc-rules/aws-aidlc-rule-details/construction/nfr-design.md b/aidlc-rules/aws-aidlc-rule-details/construction/nfr-design.md index 92b950b1..d4f00196 100644 --- a/aidlc-rules/aws-aidlc-rule-details/construction/nfr-design.md +++ b/aidlc-rules/aws-aidlc-rule-details/construction/nfr-design.md @@ -20,18 +20,23 @@ Incorporate NFR requirements into unit design using patterns and logical compone - Each step should have a checkbox [] ### Step 3: Generate Context-Appropriate Questions -**DIRECTIVE**: Analyze the NFR requirements to generate ONLY questions relevant to THIS specific unit's NFR design. Use the categories below as inspiration, NOT as a mandatory checklist. Skip entire categories if not applicable. +**DIRECTIVE**: Thoroughly analyze the NFR requirements to identify ALL areas where clarification would improve NFR design quality. Be proactive in asking questions to ensure comprehensive non-functional design coverage. + +**CRITICAL**: Default to asking questions when there is ANY ambiguity or missing detail that could affect NFR design quality. It's better to ask too many questions than to make incorrect assumptions about non-functional patterns. + +**MANDATORY**: Evaluate ALL of the following categories by asking targeted questions about each. 
For each category, determine applicability based on evidence from the NFR requirements -- do not skip categories without explicit justification: - EMBED questions using [Answer]: tag format -- Focus on ambiguities and missing information specific to this unit -- Generate questions only where user input is needed for pattern and component decisions - -**Example question categories** (adapt as needed): -- **Resilience Patterns** - Only if fault tolerance approach needs clarification -- **Scalability Patterns** - Only if scaling mechanisms are unclear -- **Performance Patterns** - Only if performance optimization strategy is ambiguous -- **Security Patterns** - Only if security implementation approach needs input -- **Logical Components** - Only if infrastructure components (queues, caches, etc.) need clarification +- Focus on ANY ambiguities, missing information, or areas needing clarification +- Generate questions wherever user input would improve pattern and component decisions +- **When in doubt, ask the question** - overconfidence leads to poor non-functional designs + +**Question categories to evaluate** (consider ALL categories): +- **Resilience Patterns** - Ask about fault tolerance approach, retry strategies, and failure recovery expectations +- **Scalability Patterns** - Ask about scaling mechanisms, load boundaries, and growth projections +- **Performance Patterns** - Ask about optimization strategy, latency targets, and throughput requirements +- **Security Patterns** - Ask about security implementation approach, threat model, and compliance constraints +- **Logical Components** - Ask about infrastructure components (queues, caches, circuit breakers, etc.) 
and their integration patterns ### Step 4: Store Plan - Save as `aidlc-docs/construction/plans/{unit-name}-nfr-design-plan.md` diff --git a/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.md b/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.md new file mode 100644 index 00000000..85914f26 --- /dev/null +++ b/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.md @@ -0,0 +1,307 @@ +# Baseline Security Rules + +## Overview +These security rules are MANDATORY cross-cutting constraints that apply across all AI-DLC phases. They are not optional guidance — they are hard constraints that stages MUST enforce when generating questions, producing design artifacts, generating code, and presenting completion messages. + +**Enforcement**: At each applicable stage, the model MUST verify compliance with these rules before presenting the stage completion message to the user. + +### Blocking Security Finding Behavior +A **blocking security finding** means: +1. The finding MUST be listed in the stage completion message under a "Security Findings" section with the SECURITY rule ID and description +2. The stage MUST NOT present the "Continue to Next Stage" option until all blocking findings are resolved +3. The model MUST present only the "Request Changes" option with a clear explanation of what needs to change +4. The finding MUST be logged in `aidlc-docs/audit.md` with the SECURITY rule ID, description, and stage context + +If a SECURITY rule is not applicable to the current project (e.g., SECURITY-01 when no data stores exist), mark it as **N/A** in the compliance summary — this is not a blocking finding. + +### Default Enforcement +All rules in this document are **blocking** by default. If any rule's verification criteria are not met, it is a blocking security finding — follow the blocking finding behavior defined above. 
+ +### Verification Criteria Format +Verification items in this document are plain bullet points describing compliance checks. They are distinct from the `- [ ]` / `- [x]` progress-tracking checkboxes used in stage plan files. Each item should be evaluated as compliant or non-compliant during review. + +--- + +## Rule SECURITY-01: Encryption at Rest and in Transit + +**Rule**: Every data persistence store (databases, object storage, file systems, caches, or any equivalent) MUST have: +- Encryption at rest enabled using a managed key service or customer-managed keys +- Encryption in transit enforced (TLS 1.2+ for all data movement in and out of the store) + +**Verification**: +- No storage resource is defined without an encryption configuration block +- No database connection string uses an unencrypted protocol +- Object storage enforces encryption at rest and rejects non-TLS requests via policy +- Database instances have storage encryption enabled and enforce TLS connections + +--- + +## Rule SECURITY-02: Access Logging on Network Intermediaries + +**Rule**: Every network-facing intermediary that handles external traffic MUST have access logging enabled. 
This includes: +- Load balancers → access logs to a persistent store +- API gateways → execution logging and access logging to a centralized log service +- CDN distributions → standard logging or real-time logs + +**Verification**: +- No load balancer resource is defined without access logging enabled +- No API gateway stage is defined without access logging configured +- No CDN distribution is defined without logging configuration + +--- + +## Rule SECURITY-03: Application-Level Logging + +**Rule**: Every deployed application component MUST include structured logging infrastructure: +- A logging framework MUST be configured +- Log output MUST be directed to a centralized log service +- Logs MUST include: timestamp, correlation/request ID, log level, and message +- Sensitive data (passwords, tokens, PII) MUST NOT appear in log output + +**Verification**: +- Every service/function entry point includes a configured logger +- No ad-hoc logging statements used as the primary logging mechanism in production code +- Log configuration routes output to a centralized log service +- No secrets, tokens, or PII are logged + +--- + +## Rule SECURITY-04: HTTP Security Headers for Web Applications + +**Rule**: The following HTTP response headers MUST be set on all HTML-serving endpoints: + +| Header | Required Value | +|---|---| +| `Content-Security-Policy` | Define a restrictive policy (at minimum: `default-src 'self'`) | +| `Strict-Transport-Security` | `max-age=31536000; includeSubDomains` | +| `X-Content-Type-Options` | `nosniff` | +| `X-Frame-Options` | `DENY` (or `SAMEORIGIN` if framing is required) | +| `Referrer-Policy` | `strict-origin-when-cross-origin` | + +**Note**: `X-XSS-Protection` is deprecated in modern browsers. Use `Content-Security-Policy` instead. 
+ +**Verification**: +- Middleware or response interceptor sets all required headers +- CSP policy does not use `unsafe-inline` or `unsafe-eval` without documented justification +- HSTS max-age is at least 31536000 (1 year) + +--- + +## Rule SECURITY-05: Input Validation on All API Parameters + +**Rule**: Every API endpoint (REST, GraphQL, gRPC, WebSocket) MUST validate all input parameters before processing. Validation MUST include: +- **Type checking**: Reject unexpected types +- **Length/size bounds**: Enforce maximum lengths on strings, maximum sizes on arrays and payloads +- **Format validation**: Use allowlists (regex or schema) for structured inputs (emails, dates, IDs) +- **Sanitization**: Escape or reject HTML/script content in user-supplied strings to prevent XSS +- **Injection prevention**: Use parameterized queries for all database operations (never string concatenation) + +**Verification**: +- Every API handler uses a validation library or schema +- No raw user input is concatenated into SQL, NoSQL, or OS commands +- String inputs have explicit max-length constraints +- Request body size limits are configured at the framework or gateway level + +--- + +## Rule SECURITY-06: Least-Privilege Access Policies + +**Rule**: Every identity and access management policy, role, or permission boundary MUST follow least privilege: +- Use specific resource identifiers — NEVER use wildcard resources unless the API does not support resource-level permissions (document the exception) +- Use specific actions — NEVER use wildcard actions +- Scope conditions where possible +- Separate read and write permissions into distinct policy statements + +**Verification**: +- No policy contains wildcard actions or wildcard resources without a documented exception +- No service role has broader permissions than what the service actually calls +- Inline policies are avoided in favor of managed policies where possible +- Every role has a trust policy scoped to the specific service or 
account + +--- + +## Rule SECURITY-07: Restrictive Network Configuration + +**Rule**: All network configurations (security groups, network ACLs, route tables) MUST follow deny-by-default: +- Firewall rules: Only open specific ports required by the application +- No inbound rule with source `0.0.0.0/0` except for public-facing load balancers on ports 80/443 +- No outbound rule with `0.0.0.0/0` on all ports unless explicitly justified +- Private subnets MUST NOT have direct internet gateway routes +- Use private endpoints for cloud service access where available + +**Verification**: +- No firewall rule allows inbound `0.0.0.0/0` on any port other than 80/443 on a public load balancer +- Database and application firewall rules restrict source to specific CIDR blocks or security group references +- Private subnets route through a NAT gateway (not an internet gateway) +- Private endpoints are used for high-traffic cloud service calls + +--- + +## Rule SECURITY-08: Application-Level Access Control + +**Rule**: Every application endpoint that accesses or mutates a resource MUST enforce authorization checks at the application layer: +- **Deny by default**: All routes/endpoints MUST require authentication unless explicitly marked as public +- **Object-level authorization**: Every request that references a resource by ID MUST verify the requesting user/principal owns or has permission to access that resource (prevent IDOR) +- **Function-level authorization**: Administrative or privileged operations MUST check the caller's role/permissions server-side — never rely on client-side hiding +- **CORS policy**: Cross-origin resource sharing MUST be restricted to explicitly allowed origins — never use `Access-Control-Allow-Origin: *` on authenticated endpoints +- **Token validation**: JWTs or session tokens MUST be validated server-side on every request (signature, expiration, audience, issuer) + +**Verification**: +- Every controller/handler has an authorization middleware or guard 
applied +- No endpoint returns data for a resource ID without verifying the caller's ownership or permission +- Admin/privileged routes have explicit role checks enforced server-side +- CORS configuration does not use wildcard origins on authenticated endpoints +- Token validation occurs server-side on every request (not just at login) + +--- + +## Rule SECURITY-09: Security Hardening and Misconfiguration Prevention + +**Rule**: All deployed components MUST follow a hardening baseline: +- **No default credentials**: Default usernames/passwords MUST be changed or disabled before deployment +- **Minimal installation**: Remove or disable unused features, frameworks, sample applications, and documentation endpoints +- **Error handling**: Production error responses MUST NOT expose stack traces, internal paths, framework versions, or database details to end users +- **Directory listing**: Web servers MUST disable directory listing +- **Cloud storage**: Cloud object storage MUST block public access unless explicitly required and documented +- **Patch management**: Runtime environments, frameworks, and OS images MUST use current, supported versions + +**Verification**: +- No default credentials exist in configuration files, environment variables, or IaC templates +- Error responses in production return generic messages (no stack traces or internal details) +- Cloud object storage has public access blocked unless a documented exception exists +- No sample/demo applications or default pages are deployed +- Framework and runtime versions are current and supported + + +--- + +## Rule SECURITY-10: Software Supply Chain Security + +**Rule**: Every project MUST manage its software supply chain: +- **Dependency pinning**: All dependencies MUST use exact versions or lock files +- **Vulnerability scanning**: A dependency vulnerability scanner MUST be configured +- **No unused dependencies**: Remove packages that are not actively used +- **Trusted sources only**: Dependencies MUST be 
pulled from official registries or verified private registries — no unvetted third-party sources +- **SBOM**: Projects MUST generate a Software Bill of Materials for production deployments +- **CI/CD integrity**: Build pipelines MUST use pinned tool versions and verified base images — no `latest` tags in production Dockerfiles or CI configurations + +**Verification**: +- A lock file exists and is committed to version control +- A dependency vulnerability scanning step is included in CI/CD or documented in build instructions +- No unused or abandoned dependencies are included +- Dockerfiles and CI configs do not use `latest` or unpinned image tags for production +- Dependencies are sourced from official or verified registries + +--- + +## Rule SECURITY-11: Secure Design Principles + +**Rule**: Application design MUST incorporate security from the start: +- **Separation of concerns**: Security-critical logic (authentication, authorization, payment processing) MUST be isolated in dedicated modules — not scattered across the codebase +- **Defense in depth**: No single control should be the sole line of defense — layer controls (validation + authorization + encryption) +- **Rate limiting**: Public-facing endpoints MUST implement rate limiting or throttling to prevent abuse +- **Business logic abuse**: Design MUST consider misuse cases — not just happy-path scenarios + +**Verification**: +- Security-critical logic is encapsulated in dedicated modules or services +- Rate limiting is configured on public-facing APIs +- Design documentation addresses at least one misuse/abuse scenario + +--- + +## Rule SECURITY-12: Authentication and Credential Management + +**Rule**: Every application with user authentication MUST implement: +- **Password policy**: Minimum 8 characters, check against breached password lists +- **Credential storage**: Passwords MUST be hashed using adaptive algorithms — never weak or non-adaptive hashing +- **Multi-factor authentication**: MFA MUST be 
supported for administrative accounts and SHOULD be available for all users +- **Session management**: Sessions MUST have server-side expiration, be invalidated on logout, and use secure/httpOnly/sameSite cookie attributes +- **Brute-force protection**: Login endpoints MUST implement account lockout, progressive delays, or CAPTCHA after repeated failures +- **No hardcoded credentials**: No passwords, API keys, or secrets in source code or IaC templates — use a secrets manager + +**Verification**: +- Password hashing uses adaptive algorithms (not weak or non-adaptive hashing) +- Session cookies set `Secure`, `HttpOnly`, and `SameSite` attributes +- Login endpoints have brute-force protection (lockout, delay, or CAPTCHA) +- No hardcoded credentials in source code or configuration files +- MFA is supported for admin accounts +- Sessions are invalidated on logout and have a defined expiration + +--- + +## Rule SECURITY-13: Software and Data Integrity Verification + +**Rule**: Systems MUST verify the integrity of software and data: +- **Deserialization safety**: Untrusted data MUST NOT be deserialized without validation — use safe deserialization libraries or allowlists of permitted types +- **Artifact integrity**: Downloaded dependencies, plugins, and updates MUST be verified via checksums or digital signatures +- **CI/CD pipeline security**: Build pipelines MUST restrict who can modify pipeline definitions — separate duties between code authors and deployment approvers +- **CDN and external resources**: Scripts or resources loaded from external CDNs MUST use Subresource Integrity (SRI) hashes +- **Data integrity**: Critical data modifications MUST be auditable (who changed what, when) + +**Verification**: +- No unsafe deserialization of untrusted input +- External scripts include SRI integrity attributes when loaded from CDNs +- CI/CD pipeline definitions are access-controlled and changes are auditable +- Critical data changes are logged with actor, timestamp, and 
before/after values + +--- + +## Rule SECURITY-14: Alerting and Monitoring + +**Rule**: In addition to logging (SECURITY-02, SECURITY-03), systems MUST include: +- **Security event alerting**: Alerts MUST be configured for high-value security events: repeated authentication failures, privilege escalation attempts, access from unusual locations, and authorization failures +- **Log integrity**: Logs MUST be stored in append-only or tamper-evident storage — application code MUST NOT be able to delete or modify its own audit logs +- **Log retention**: Logs MUST be retained for a minimum period appropriate to the application's compliance requirements (default: 90 days minimum) +- **Monitoring dashboards**: A monitoring dashboard or alarm configuration MUST be defined for key operational and security metrics + +**Verification**: +- Alerting is configured for authentication failures and authorization violations +- Application log groups have retention policies set (minimum 90 days) +- Application roles do not have permission to delete their own log groups/streams +- Security-relevant events (login failures, access denied, privilege changes) generate alerts + +--- + +## Rule SECURITY-15: Exception Handling and Fail-Safe Defaults + +**Rule**: Every application MUST handle exceptional conditions safely: +- **Catch and handle**: All external calls (database, API, file I/O) MUST have explicit error handling — no unhandled promise rejections or uncaught exceptions in production +- **Fail closed**: On error, the system MUST deny access or halt the operation — never fail open +- **Resource cleanup**: Error paths MUST release resources (connections, file handles, locks) — use try/finally, using statements, or equivalent patterns +- **User-facing errors**: Error messages shown to users MUST be generic — no internal details or system information +- **Global error handler**: Applications MUST have a global/top-level error handler that catches unhandled exceptions, logs them (per 
SECURITY-03), and returns a safe response + +**Verification**: +- All external calls (DB, HTTP, file I/O) have explicit error handling (try/catch, .catch(), error callbacks) +- A global error handler is configured at the application entry point +- Error paths do not bypass authorization or validation checks (fail closed) +- Resources are cleaned up in error paths (connections closed, transactions rolled back) +- No unhandled promise rejections or uncaught exception warnings in application code + +--- + +## Enforcement Integration + +These rules are cross-cutting constraints that apply to every AI-DLC stage. At each stage: +- Evaluate all SECURITY rule verification criteria against the artifacts produced +- Include a "Security Compliance" section in the stage completion summary listing each rule as compliant, non-compliant, or N/A +- If any rule is non-compliant, this is a blocking security finding — follow the blocking finding behavior defined in the Overview +- Include security rule references in design documentation and test instructions + +--- + +## Appendix: OWASP Reference Mapping + +<!-- NOTE: This mapping follows the OWASP Top 10:2025, first published as Release Candidate 1 in November 2025. The category IDs (A01-A10), numbering, and names below match that release candidate; re-verify them against the final published edition before relying on this mapping. 
--> +For human reviewers, the following maps SECURITY rules to OWASP Top 10 (2025) categories: + +| SECURITY Rule | OWASP Category | +|---|---| +| SECURITY-08 | A01:2025 – Broken Access Control | +| SECURITY-09 | A02:2025 – Security Misconfiguration | +| SECURITY-10 | A03:2025 – Software Supply Chain Failures | +| SECURITY-11 | A06:2025 – Insecure Design | +| SECURITY-12 | A07:2025 – Authentication Failures | +| SECURITY-13 | A08:2025 – Software or Data Integrity Failures | +| SECURITY-14 | A09:2025 – Logging & Alerting Failures | +| SECURITY-15 | A10:2025 – Mishandling of Exceptional Conditions | diff --git a/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.opt-in.md b/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.opt-in.md new file mode 100644 index 00000000..cd247668 --- /dev/null +++ b/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.opt-in.md @@ -0,0 +1,18 @@ +# Security Baseline — Opt-In + +**Extension**: Security Baseline + +## Opt-In Prompt + +The following question is automatically included in the Requirements Analysis clarifying questions when this extension is loaded: + +```markdown +## Question: Security Extensions +Should security extension rules be enforced for this project? 
+ +A) Yes — enforce all SECURITY rules as blocking constraints (recommended for production-grade applications) +B) No — skip all SECURITY rules (suitable for PoCs, prototypes, and experimental projects) +X) Other (please describe after [Answer]: tag below) + +[Answer]: +``` diff --git a/aidlc-rules/aws-aidlc-rule-details/extensions/testing/property-based/property-based-testing.md b/aidlc-rules/aws-aidlc-rule-details/extensions/testing/property-based/property-based-testing.md new file mode 100644 index 00000000..dd4d7735 --- /dev/null +++ b/aidlc-rules/aws-aidlc-rule-details/extensions/testing/property-based/property-based-testing.md @@ -0,0 +1,284 @@ +# Property-Based Testing Rules + +## Overview + +These property-based testing (PBT) rules are cross-cutting constraints that apply across applicable AI-DLC phases. They ensure that code with identifiable properties is tested using property-based techniques, complementing (not replacing) traditional example-based tests. + +Property-based testing defines invariants that must hold for all valid inputs, then uses a framework to generate random inputs and search for counterexamples. When a failure is found, the framework shrinks the input to a minimal reproducing case. This approach uncovers edge cases and subtle bugs that example-based testing routinely misses. + +**Enforcement**: At each applicable stage, the model MUST verify compliance with these rules before presenting the stage completion message to the user. + +### Blocking PBT Finding Behavior + +A **blocking PBT finding** means: +1. The finding MUST be listed in the stage completion message under a "PBT Findings" section with the PBT rule ID and description +2. The stage MUST NOT present the "Continue to Next Stage" option until all blocking findings are resolved +3. The model MUST present only the "Request Changes" option with a clear explanation of what needs to change +4. 
The finding MUST be logged in `aidlc-docs/audit.md` with the PBT rule ID, description, and stage context + +If a PBT rule is not applicable to the current project or unit (e.g., PBT-06 when no stateful components exist), mark it as **N/A** in the compliance summary — this is not a blocking finding. + +### Default Enforcement + +All rules in this document are **blocking** by default. If any rule's verification criteria are not met, it is a blocking PBT finding — follow the blocking finding behavior defined above. + +### Partial Enforcement Mode + +If the user selected **Partial** enforcement during opt-in, only rules PBT-02, PBT-03, PBT-07, PBT-08, and PBT-09 are enforced. All other rules are treated as advisory (non-blocking). Log the enforcement mode in `aidlc-docs/aidlc-state.md` under `## Extension Configuration`. + +### Verification Criteria Format + +Verification items in this document are plain bullet points describing compliance checks. Each item should be evaluated as compliant or non-compliant during review. + +--- + +## Rule PBT-01: Property Identification During Design + +**Rule**: Every unit containing business logic, data transformations, or algorithmic operations MUST be analyzed for testable properties during the Functional Design stage. 
The analysis MUST identify which of the following property categories apply: + +| Category | Description | Example | +|---|---|---| +| Round-trip | An operation paired with its inverse yields the original value | serialize → deserialize = identity | +| Invariant | A transformation preserves some measurable characteristic | sort preserves collection size and elements | +| Idempotence | Applying an operation twice yields the same result as once | dedup(dedup(list)) = dedup(list) | +| Commutativity | Different operation orderings produce the same result | add(a, b) = add(b, a) | +| Oracle | A reference implementation or simplified model can verify results | optimized algorithm vs brute-force | +| Induction | A property proven for smaller inputs extends to larger ones | recursive structures, divide-and-conquer | +| Easy verification | The result is hard to compute but easy to check | maze solver output can be walked to verify | + +The identified properties MUST be documented in the functional design artifacts for the unit, and carried forward into code generation as PBT test requirements. + +**Verification**: +- Functional design artifacts include a "Testable Properties" section listing identified properties per component +- Each identified property references one of the categories above +- Components with no identifiable properties are explicitly marked as "No PBT properties identified" with a brief rationale +- The property list is referenced during code generation planning + +--- + +## Rule PBT-02: Round-Trip Properties + +**Rule**: Any operation that has a logical inverse MUST have a property-based test verifying the round-trip. 
This includes but is not limited to: +- Serialization / deserialization (JSON, XML, Protobuf, binary formats) +- Encoding / decoding (Base64, URL encoding, compression) +- Parsing / formatting (date parsing, number formatting, template rendering with structured input) +- Encryption / decryption (where key is available) +- Database write / read (for the data transformation layer, not the I/O itself) +- Any pair of functions where `f_inverse(f(x)) = x` for all valid `x` + +The property-based test MUST generate random valid inputs using a domain-appropriate generator (see PBT-07) and assert that the round-trip produces a value equal to the original input. + +**Verification**: +- Every serialization/deserialization pair has a round-trip property test +- Every encoding/decoding pair has a round-trip property test +- Every parsing/formatting pair has a round-trip property test (or documents why the transformation is lossy) +- Round-trip tests use generated inputs, not hardcoded examples +- Lossy transformations (e.g., float formatting with precision loss) document the acceptable deviation and test within tolerance + +--- + +## Rule PBT-03: Invariant Properties + +**Rule**: Functions with documented invariants MUST have property-based tests verifying those invariants hold across generated inputs. 
Common invariants include: +- **Size preservation**: output collection has the same size as input (e.g., map, sort) +- **Element preservation**: output contains exactly the same elements as input, possibly reordered (e.g., sort, shuffle) +- **Ordering guarantees**: output satisfies an ordering constraint (e.g., sort produces non-decreasing order) +- **Range constraints**: output values fall within a defined range (e.g., normalize produces values in [0, 1]) +- **Type preservation**: output type matches expected type for all valid inputs +- **Business rule invariants**: domain-specific rules that must always hold (e.g., "account balance never goes negative after a valid transaction", "discount never exceeds item price") + +**Verification**: +- Each documented invariant has a corresponding property-based test +- Invariant tests generate a wide range of inputs including boundary values +- Business rule invariants identified in functional design are covered by PBT +- Invariant tests do not duplicate exact assertions from example-based tests — they test the general rule, not specific cases + +--- + +## Rule PBT-04: Idempotency Properties + +**Rule**: Any operation that claims or requires idempotency MUST have a property-based test proving it. The test MUST verify that `f(f(x)) = f(x)` for all valid generated inputs. 
This applies to: +- API endpoints documented as idempotent (PUT, DELETE) +- Data normalization or sanitization functions +- Cache population operations +- Deduplication logic +- Configuration application (applying config twice should not change state) +- Message processing in at-least-once delivery systems + +**Verification**: +- Every operation documented as idempotent has a PBT asserting `f(f(x)) = f(x)` +- Idempotency tests use domain-appropriate generators (not just primitives) +- For stateful operations, the test verifies observable state equivalence after single vs repeated application + +--- + +## Rule PBT-05: Oracle and Model-Based Testing + +**Rule**: When a reference implementation, simplified model, or known-correct algorithm exists, property-based tests MUST compare the system under test against the oracle. This applies to: +- Optimized algorithms replacing a known brute-force version +- Refactored code replacing legacy implementations +- Parallel/concurrent implementations compared against sequential versions +- Custom implementations of well-known algorithms (sorting, searching, graph traversal) +- New query engines compared against a reference database + +The property-based test MUST generate random valid inputs and assert that the system under test produces equivalent results to the oracle for all generated inputs. + +**Verification**: +- When a reference implementation exists (or can be trivially written), an oracle PBT is present +- Oracle tests generate diverse inputs covering normal, boundary, and adversarial cases +- Equivalence is defined precisely (exact equality, structural equality, or documented tolerance) +- If no oracle exists, this rule is marked N/A with rationale + +--- + +## Rule PBT-06: Stateful Property Testing + +**Rule**: Components that manage mutable state MUST be evaluated for stateful property testing. 
Stateful PBT generates random sequences of commands (operations) against the system and verifies that invariants hold after each step. This applies to: +- In-memory caches and data stores +- State machines and workflow engines +- Queue and buffer implementations +- Session management systems +- Shopping carts, order pipelines, and similar stateful business objects +- Any component where the result of an operation depends on prior operations + +Stateful PBT MUST: +- Define a simplified model (reference state) that mirrors the system under test +- Generate random sequences of valid commands (add, remove, update, query, etc.) +- Execute each command against both the real system and the model +- Assert that observable state or query results match between system and model after each command +- Test sequences of varying lengths, including empty sequences + +**Verification**: +- Stateful components identified in functional design have stateful PBT or document why it is not applicable +- A simplified model is defined for comparison +- Command generators produce valid operation sequences with realistic parameter distributions +- Invariants are checked after each command in the sequence, not just at the end +- If no stateful components exist, this rule is marked N/A + +--- + +## Rule PBT-07: Generator Quality + +**Rule**: Property-based tests MUST use domain-specific generators that produce realistic, structured inputs — not just primitive types. Poor generators (e.g., random strings for email fields, unbounded integers for age fields) produce meaningless test cases and miss real bugs. 
+ +Generator requirements: +- **Domain types**: Custom generators MUST be created for domain objects (e.g., User, Order, Transaction) that respect business constraints (valid email format, positive amounts, valid date ranges) +- **Constrained primitives**: Numeric generators MUST be constrained to realistic ranges where the domain requires it +- **Structured data**: Generators for complex inputs (nested objects, lists of domain objects) MUST produce structurally valid data +- **Edge case inclusion**: Generators SHOULD be configured to include boundary values (empty collections, zero, maximum values, Unicode strings) alongside normal values +- **Reusability**: Domain generators SHOULD be defined as reusable test utilities, not duplicated across test files + +**Verification**: +- No PBT uses only raw primitive generators (e.g., `st.integers()` alone) for domain-typed parameters +- Custom generators exist for domain objects used in PBT +- Generators respect documented business constraints (e.g., positive amounts, valid formats) +- Generator definitions are centralized and reusable where multiple tests share the same domain types + +--- + +## Rule PBT-08: Shrinking and Reproducibility + +**Rule**: All property-based tests MUST support shrinking and deterministic reproducibility. + +- **Shrinking**: When a property fails, the PBT framework MUST automatically reduce the failing input to a minimal reproducing case. Tests MUST NOT disable or bypass the framework's shrinking mechanism unless there is a documented technical reason (e.g., shrinking is incompatible with external service calls in integration tests). +- **Reproducibility**: Every PBT run MUST be reproducible via a seed value. The seed MUST be logged on failure so that the exact failing scenario can be replayed. CI configurations MUST either use a fixed seed for deterministic runs or log the random seed on every run for post-failure reproduction. 
+- **CI integration**: PBT MUST be included in the project's CI pipeline. Flaky PBT failures (tests that pass on retry without code changes) MUST be investigated, not suppressed. + +**Verification**: +- PBT framework's shrinking is enabled (not overridden or disabled) +- Test output on failure includes the seed value and the shrunk minimal failing input +- CI configuration logs the seed for every PBT run or uses a fixed seed +- No PBT is excluded from CI without documented justification +- Flaky PBT failures are tracked and investigated, not silently retried + +--- + +## Rule PBT-09: Framework Selection + +**Rule**: The project MUST select and configure an appropriate property-based testing framework for its primary language(s). The framework MUST support: +- Custom generators / strategies for domain types +- Automatic shrinking of failing cases +- Seed-based reproducibility +- Integration with the project's existing test runner + +Recommended frameworks by language (non-exhaustive): + +| Language | Framework | Notes | +|---|---|---| +| Python | Hypothesis | Mature, excellent shrinking, Django integration | +| JavaScript / TypeScript | fast-check | Integrates with Jest, Vitest, Mocha | +| Java | jqwik | JUnit 5 integration, stateful testing support | +| Kotlin | Kotest Property Testing | Kotest framework integration | +| Scala | ScalaCheck | SBT integration, widely adopted | +| Rust | proptest | Macro-based, good shrinking | +| Go | rapid | Lightweight, idiomatic Go | +| Haskell | QuickCheck | The original PBT framework | +| C# / .NET | FsCheck | Works with xUnit, NUnit | +| Erlang / Elixir | PropEr / StreamData | OTP-aware, stateful testing | + +The selected framework MUST be documented in the tech stack decisions and included as a project dependency. + +**Verification**: +- A PBT framework is selected and documented in tech stack decisions +- The framework is included in project dependencies (package.json, pom.xml, requirements.txt, etc.) 
+- The framework supports custom generators, shrinking, and seed-based reproducibility +- If the project uses multiple languages, each language with PBT-applicable code has a framework selected + +--- + +## Rule PBT-10: Complementary Testing Strategy + +**Rule**: Property-based tests MUST complement, not replace, example-based tests. The two approaches serve different purposes: + +- **Example-based tests**: Document specific known scenarios, regression cases, and business-critical edge cases with explicit expected values. They serve as executable documentation of concrete behavior. +- **Property-based tests**: Verify general invariants across a wide input space. They find unknown edge cases and validate that properties hold universally. + +Requirements: +- Critical business scenarios identified in user stories or requirements MUST have explicit example-based tests, even if a PBT covers the same property +- PBT MUST NOT be the sole test for any business-critical path — at least one example-based test must pin the expected behavior for key scenarios +- When a PBT discovers a failing case, the shrunk minimal example SHOULD be added as a permanent example-based regression test +- Test documentation MUST clearly distinguish between example-based and property-based tests (separate test files, test classes, or clearly named test functions) + +**Verification**: +- Business-critical paths have both example-based and property-based tests +- PBT is not used as the only test coverage for any critical feature +- Test files or test classes clearly separate or label PBT vs example-based tests +- Regression tests from PBT-discovered failures are captured as permanent example-based tests + +--- + +## Enforcement Integration + +These rules are cross-cutting constraints that apply to the following AI-DLC stages: + +| Stage | Applicable Rules | Enforcement | +|---|---|---| +| Functional Design | PBT-01 | Property identification must appear in design artifacts | +| NFR Requirements | 
PBT-09 | Framework selection must be included in tech stack decisions | +| Code Generation (Planning) | PBT-01 through PBT-10 | Code generation plan must include PBT test steps for identified properties | +| Code Generation (Generation) | PBT-02 through PBT-08, PBT-10 | Generated tests must include PBT alongside example-based tests | +| Build and Test | PBT-08 | Test execution instructions must include PBT with seed logging and CI integration | + +At each applicable stage: +- Evaluate all PBT rule verification criteria against the artifacts produced +- Include a "PBT Compliance" section in the stage completion summary listing each rule as compliant, non-compliant, or N/A +- If any rule is non-compliant, this is a blocking PBT finding — follow the blocking finding behavior defined in the Overview +- Include PBT rule references in design documentation and test instructions + +--- + +## Appendix: Property Category Quick Reference + +For developers and AI models identifying properties during Functional Design (PBT-01): + +| Pattern Name | Formal Term | Test Shape | When to Use | +|---|---|---|---| +| There and back again | Invertible function | `f_inv(f(x)) == x` | Serialization, encoding, parsing | +| Some things never change | Invariant | `measure(f(x)) == measure(x)` | Sort, map, filter, transform | +| The more things change, the more they stay the same | Idempotence | `f(f(x)) == f(x)` | Normalization, dedup, cache writes | +| Different paths, same destination | Commutativity | `f(g(x)) == g(f(x))` | Arithmetic, set operations, independent transforms | +| Solve a smaller problem first | Structural induction | Property on `x` implies property on `x + element` | Recursive structures, lists, trees | +| Hard to prove, easy to verify | Verification | `verify(solve(x)) == true` | Solvers, optimizers, search algorithms | +| The test oracle | Reference comparison | `f(x) == oracle(x)` | Optimized vs brute-force, refactored vs legacy | + +Source: Property category taxonomy 
adapted from Scott Wlaschin's "Choosing properties for property-based testing" ([fsharpforfunandprofit.com](https://fsharpforfunandprofit.com/posts/property-based-testing-2/)). diff --git a/aidlc-rules/aws-aidlc-rule-details/extensions/testing/property-based/property-based-testing.opt-in.md b/aidlc-rules/aws-aidlc-rule-details/extensions/testing/property-based/property-based-testing.opt-in.md new file mode 100644 index 00000000..6042d9f6 --- /dev/null +++ b/aidlc-rules/aws-aidlc-rule-details/extensions/testing/property-based/property-based-testing.opt-in.md @@ -0,0 +1,19 @@ +# Property-Based Testing — Opt-In + +**Extension**: Property-Based Testing + +## Opt-In Prompt + +The following question is automatically included in the Requirements Analysis clarifying questions when this extension is loaded: + +```markdown +## Question: Property-Based Testing Extension +Should property-based testing (PBT) rules be enforced for this project? + +A) Yes — enforce all PBT rules as blocking constraints (recommended for projects with business logic, data transformations, serialization, or stateful components) +B) Partial — enforce PBT rules only for pure functions and serialization round-trips (suitable for projects with limited algorithmic complexity) +C) No — skip all PBT rules (suitable for simple CRUD applications, UI-only projects, or thin integration layers with no significant business logic) +X) Other (please describe after [Answer]: tag below) + +[Answer]: +``` diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/application-design.md b/aidlc-rules/aws-aidlc-rule-details/inception/application-design.md index 9ad90f68..4d05b5a5 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/application-design.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/application-design.md @@ -12,9 +12,9 @@ Application Design focuses on: **Note**: Detailed business logic design happens later in Functional Design (per-unit, CONSTRUCTION phase) ## Prerequisites -- Context Assessment 
must be complete -- Requirements Assessment recommended (provides functional context) -- Story Development recommended (user stories guide design decisions) +- Workspace Detection must be complete +- Requirements Analysis recommended (provides functional context) +- User Stories recommended (user stories guide design decisions) - Execution plan must indicate Application Design stage should execute ## Step-by-Step Execution @@ -38,18 +38,19 @@ Application Design focuses on: - [ ] Validate design completeness and consistency ### 4. Generate Context-Appropriate Questions -**DIRECTIVE**: Analyze the requirements and stories to generate ONLY questions relevant to THIS specific application design. Use the categories below as inspiration, NOT as a mandatory checklist. Skip entire categories if not applicable. +**DIRECTIVE**: Analyze the requirements and stories to generate questions relevant to THIS specific application design. Use the categories below as guidance. Evaluate each category and, when in doubt about applicability, ask the question rather than skipping it — overconfidence leads to poor outcomes (see overconfidence-prevention.md). 
- EMBED questions using [Answer]: tag format -- Focus on ambiguities and missing information specific to this context -- Generate questions only where user input is needed for design decisions +- Focus on ANY ambiguities, missing information, or areas needing clarification +- Generate questions wherever user input would improve design decisions +- **When in doubt, ask the question** - overconfidence leads to poor designs -**Example question categories** (adapt as needed): -- **Component Identification** - Only if component boundaries or organization is unclear -- **Component Methods** - Only if method signatures need clarification (detailed business rules come later) -- **Service Layer Design** - Only if service orchestration or boundaries are ambiguous -- **Component Dependencies** - Only if communication patterns or dependency management is unclear -- **Design Patterns** - Only if architectural style or pattern choice needs user input +**Question categories to evaluate** (consider ALL categories): +- **Component Identification** - Ask about component boundaries, organization, and grouping strategies +- **Component Methods** - Ask about method signatures, input/output expectations, and interface contracts (detailed business rules come later) +- **Service Layer Design** - Ask about service orchestration, boundaries, and coordination patterns +- **Component Dependencies** - Ask about communication patterns, dependency management, and coupling concerns +- **Design Patterns** - Ask about architectural style preferences, pattern choices, and design constraints ### 5. 
Store Application Design Plan - Save as `aidlc-docs/inception/plans/application-design-plan.md` @@ -103,6 +104,7 @@ If the analysis in step 8 reveals ANY ambiguous answers, you MUST: - Dependency matrix showing relationships - Communication patterns between components - Data flow diagrams +- Create `aidlc-docs/inception/application-design/application-design.md` that consolidates the multiple design docs created above into a single document. ### 11. Log Approval - Log approval prompt with timestamp in `aidlc-docs/audit.md` diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/requirements-analysis.md b/aidlc-rules/aws-aidlc-rule-details/inception/requirements-analysis.md index cc47828a..02a2f259 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/requirements-analysis.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/requirements-analysis.md @@ -90,6 +90,22 @@ Analyze whatever the user has provided: **When in doubt, ask questions** - incomplete requirements lead to poor implementations. +### Step 5.1: Extension Opt-In Prompts + +**MANDATORY**: Scan all loaded `*.opt-in.md` files (loaded at workflow start from `extensions/` subdirectories) for an `## Opt-In Prompt` section. For each extension that declares one, include that question in the clarifying questions file created in Step 6. + +After receiving answers: +1. Record each extension's enablement status in `aidlc-docs/aidlc-state.md` under `## Extension Configuration`: + +```markdown +## Extension Configuration +| Extension | Enabled | Decided At | +|---|---|---| +| [Extension Name] | [Yes/No] | Requirements Analysis | +``` + +2. **Deferred Rule Loading**: For each extension the user opted IN, load the full rules file now. The rules file is derived by naming convention: strip `.opt-in.md` from the opt-in filename and append `.md` (e.g., `security-baseline.opt-in.md` → `security-baseline.md`). For extensions the user opted OUT, do NOT load the full rules file.
+ ### Step 6: Generate Clarifying Questions (PROACTIVE APPROACH) - **ALWAYS** create `aidlc-docs/inception/requirements/requirement-verification-questions.md` unless requirements are exceptionally clear and complete - Ask questions about ANY missing, unclear, or ambiguous areas @@ -103,7 +119,12 @@ Analyze whatever the user has provided: - **MANDATORY**: Analyze ALL answers for ambiguities and create follow-up questions if needed - **MANDATORY**: Keep asking questions until ALL ambiguities are resolved OR user explicitly asks to proceed +### ⛔ GATE: Await User Answers +DO NOT proceed to Step 7 until all questions in requirement-verification-questions.md are answered and validated. +Present the question file to the user and STOP. + ### Step 7: Generate Requirements Document + - **PREREQUISITE**: Step 6 gate must be passed — all answers received and analyzed - Create `aidlc-docs/inception/requirements/requirements.md` - Include intent analysis summary at the top: - User request diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/reverse-engineering.md b/aidlc-rules/aws-aidlc-rule-details/inception/reverse-engineering.md index bd63f1da..1cedb356 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/reverse-engineering.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/reverse-engineering.md @@ -6,7 +6,7 @@ **Skip when**: Greenfield project (no existing code) -**Rerun behavior**: Always rerun when brownfield project detected, even if artifacts exist. This ensures artifacts reflect current code state +**Rerun behavior**: Rerun is controlled by workspace-detection.md. If existing reverse engineering artifacts are found and are still current, they are loaded and reverse engineering is skipped. 
If artifacts are stale (older than the codebase's last significant modification) or the user explicitly requests a rerun, reverse engineering executes again to ensure artifacts reflect current code state ## Step 1: Multi-Package Discovery @@ -43,7 +43,7 @@ - Linting configurations - CI/CD pipelines -## Step 1: Generate Business Overview Documentation +## Step 2: Generate Business Overview Documentation Create `aidlc-docs/inception/reverse-engineering/business-overview.md`: @@ -64,7 +64,7 @@ Create `aidlc-docs/inception/reverse-engineering/business-overview.md`: - **Responsibilities**: [Key responsibilities] ``` -## Step 2: Generate Architecture Documentation +## Step 3: Generate Architecture Documentation Create `aidlc-docs/inception/reverse-engineering/architecture.md`: @@ -98,7 +98,7 @@ Create `aidlc-docs/inception/reverse-engineering/architecture.md`: - **Networking**: [VPC, subnets, security groups] ``` -## Step 3: Generate Code Structure Documentation +## Step 4: Generate Code Structure Documentation Create `aidlc-docs/inception/reverse-engineering/code-structure.md`: @@ -112,6 +112,12 @@ Create `aidlc-docs/inception/reverse-engineering/code-structure.md`: ## Key Classes/Modules [Mermaid class diagram or module hierarchy] +### Existing Files Inventory +[List all source files with their purposes - these are candidates for modification in brownfield projects] + +**Example format**: +- `[path/to/file]` - [Purpose/responsibility] + ## Design Patterns ### [Pattern Name] - **Location**: [Where used] @@ -125,7 +131,7 @@ Create `aidlc-docs/inception/reverse-engineering/code-structure.md`: - **Purpose**: [Why needed] ``` -## Step 4: Generate API Documentation +## Step 5: Generate API Documentation Create `aidlc-docs/inception/reverse-engineering/api-documentation.md`: @@ -153,7 +159,7 @@ Create `aidlc-docs/inception/reverse-engineering/api-documentation.md`: - **Validation**: [Validation rules] ``` -## Step 5: Generate Component Inventory +## Step 6: Generate Component 
Inventory Create `aidlc-docs/inception/reverse-engineering/component-inventory.md`: @@ -180,7 +186,7 @@ Create `aidlc-docs/inception/reverse-engineering/component-inventory.md`: - **Test**: [Number] ``` -## Step 6: Generate Technology Stack Documentation +## Step 7: Generate Technology Stack Documentation Create `aidlc-docs/inception/reverse-engineering/technology-stack.md`: @@ -203,7 +209,7 @@ Create `aidlc-docs/inception/reverse-engineering/technology-stack.md`: - [Tool] - [Version] - [Purpose] ``` -## Step 7: Generate Dependencies Documentation +## Step 8: Generate Dependencies Documentation Create `aidlc-docs/inception/reverse-engineering/dependencies.md`: @@ -224,7 +230,7 @@ Create `aidlc-docs/inception/reverse-engineering/dependencies.md`: - **License**: [License type] ``` -## Step 8: Generate Code Quality Assessment +## Step 9: Generate Code Quality Assessment Create `aidlc-docs/inception/reverse-engineering/code-quality-assessment.md`: @@ -249,7 +255,7 @@ Create `aidlc-docs/inception/reverse-engineering/code-quality-assessment.md`: - **Anti-patterns**: [List with locations] ``` -## Step 9: Create Timestamp File +## Step 10: Create Timestamp File Create `aidlc-docs/inception/reverse-engineering/reverse-engineering-timestamp.md`: @@ -271,7 +277,7 @@ Create `aidlc-docs/inception/reverse-engineering/reverse-engineering-timestamp.m - [x] code-quality-assessment.md ``` -## Step 10: Update State Tracking +## Step 11: Update State Tracking Update `aidlc-docs/aidlc-state.md`: @@ -281,7 +287,7 @@ Update `aidlc-docs/aidlc-state.md`: - **Artifacts Location**: aidlc-docs/inception/reverse-engineering/ ``` -## Step 11: Present Completion Message to User +## Step 12: Present Completion Message to User ```markdown # 🔍 Reverse Engineering Complete @@ -299,7 +305,7 @@ Update `aidlc-docs/aidlc-state.md`: > ✅ **Approve & Continue** - Approve analysis and proceed to **Requirements Analysis** ``` -## Step 12: Wait for User Approval +## Step 13: Wait for User Approval - 
**MANDATORY**: Do not proceed until user explicitly approves - **MANDATORY**: Log user's response in audit.md with complete raw input diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/units-generation.md b/aidlc-rules/aws-aidlc-rule-details/inception/units-generation.md index 74382f9f..2a816e98 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/units-generation.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/units-generation.md @@ -10,11 +10,11 @@ This stage decomposes the system into manageable units of work through two integ **Terminology**: Use "Service" for independently deployable components, "Module" for logical groupings within a service, "Unit of Work" for planning context. ## Prerequisites -- Context Assessment must be complete -- Requirements Assessment recommended (provides functional scope) -- Story Development recommended (stories map to units) -- Application Design phase REQUIRED (determines components, methods, and services) -- Execution plan must indicate Design phase should execute +- Workspace Detection must be complete +- Requirements Analysis recommended (provides functional scope) +- User Stories recommended (stories map to units) +- Application Design stage REQUIRED (determines components, methods, and services) +- Execution plan must indicate Design stage should execute --- @@ -30,22 +30,29 @@ This stage decomposes the system into manageable units of work through two integ - [ ] Generate `aidlc-docs/inception/application-design/unit-of-work.md` with unit definitions and responsibilities - [ ] Generate `aidlc-docs/inception/application-design/unit-of-work-dependency.md` with dependency matrix - [ ] Generate `aidlc-docs/inception/application-design/unit-of-work-story-map.md` mapping stories to units +- [ ] **Greenfield only**: Document code organization strategy in `unit-of-work.md` (see code-generation.md for structure patterns) - [ ] Validate unit boundaries and dependencies - [ ] Ensure all stories are assigned to units 
## Step 3: Generate Context-Appropriate Questions -**DIRECTIVE**: Analyze the requirements, stories, and application design to generate ONLY questions relevant to THIS specific decomposition problem. Use the categories below as inspiration, NOT as a mandatory checklist. Skip entire categories if not applicable. +**DIRECTIVE**: Thoroughly analyze the requirements, stories, and application design to identify ALL areas where clarification would improve unit decomposition quality. Be proactive in asking questions to ensure comprehensive coverage of decomposition concerns. + +**CRITICAL**: Default to asking questions when there is ANY ambiguity or missing detail that could affect unit boundaries or decomposition quality. It's better to ask too many questions than to make incorrect assumptions about how the system should be decomposed. + +**MANDATORY**: Evaluate ALL of the following categories by asking targeted questions about each. For each category, determine applicability based on evidence from the requirements, stories, and application design -- do not skip categories without explicit justification: - EMBED questions using [Answer]: tag format -- Focus on ambiguities and missing information specific to this context -- Generate questions only where user input is needed for decision-making - -**Example question categories** (adapt as needed): -- **Story Grouping** - Only if multiple stories exist and grouping strategy is unclear -- **Dependencies** - Only if multiple units likely and integration approach is ambiguous -- **Team Alignment** - Only if team structure or ownership is unclear -- **Technical Considerations** - Only if scalability/deployment requirements differ across units -- **Business Domain** - Only if domain boundaries or bounded contexts are unclear +- Focus on ANY ambiguities, missing information, or areas needing clarification +- Generate questions wherever user input would improve decomposition decisions +- **When in doubt, ask the question** - 
overconfidence leads to poor unit boundaries + +**Question categories to evaluate** (consider ALL categories): +- **Story Grouping** - Ask about grouping strategy, story affinity, and logical clustering approaches +- **Dependencies** - Ask about integration approach, shared resources, and inter-unit communication patterns +- **Team Alignment** - Ask about team structure, ownership boundaries, and collaboration models +- **Technical Considerations** - Ask about scalability/deployment requirements that may differ across units +- **Business Domain** - Ask about domain boundaries, bounded contexts, and business capability alignment +- **Code Organization (Greenfield multi-unit only)** - Ask about deployment model and directory structure preferences ## Step 4: Store UOW Plan - Save as `aidlc-docs/inception/plans/unit-of-work-plan.md` diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/user-stories.md b/aidlc-rules/aws-aidlc-rule-details/inception/user-stories.md index e12a7fa1..66abd090 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/user-stories.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/user-stories.md @@ -244,7 +244,7 @@ If the analysis in step 9 reveals ANY ambiguous answers, you MUST: - [ ] Save all generated artifacts ## Step 18: Continue or Complete Generation -- [ ] If more steps remain, return to Step 14 +- [ ] If more steps remain, return to Step 15 - [ ] If all steps complete, verify stories are ready for next stage - [ ] Ensure all mandatory artifacts are generated diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/workflow-planning.md b/aidlc-rules/aws-aidlc-rule-details/inception/workflow-planning.md index 790e5cb1..2ef6ec30 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/workflow-planning.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/workflow-planning.md @@ -202,20 +202,22 @@ Create Mermaid flowchart showing: **Styling rules** (add after flowchart): ``` -style WD 
fill:#90EE90,stroke:#2d5016,stroke-width:3px -style CP fill:#90EE90,stroke:#2d5016,stroke-width:3px -style CG fill:#90EE90,stroke:#2d5016,stroke-width:3px -style BT fill:#90EE90,stroke:#2d5016,stroke-width:3px -style US fill:#f0f0f0,stroke:#999999,stroke-width:2px,stroke-dasharray: 5 5 -style Start fill:#E6E6FA,stroke:#4B0082,stroke-width:2px -style End fill:#E6E6FA,stroke:#4B0082,stroke-width:2px +style WD fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff +style CG fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff +style BT fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff +style US fill:#BDBDBD,stroke:#424242,stroke-width:2px,stroke-dasharray: 5 5,color:#000 +style Start fill:#CE93D8,stroke:#6A1B9A,stroke-width:3px,color:#000 +style End fill:#CE93D8,stroke:#6A1B9A,stroke-width:3px,color:#000 + +linkStyle default stroke:#333,stroke-width:2px ``` **Style Guidelines**: -- Completed/Always execute: `fill:#90EE90,stroke:#2d5016,stroke-width:3px` -- Conditional EXECUTE: `fill:#FFE4B5,stroke:#8B7355,stroke-width:2px,stroke-dasharray: 5 5` -- Conditional SKIP: `fill:#f0f0f0,stroke:#999999,stroke-width:2px,stroke-dasharray: 5 5` -- Start/End: `fill:#E6E6FA,stroke:#4B0082,stroke-width:2px` +- Completed/Always execute: `fill:#4CAF50,stroke:#1B5E20,stroke-width:3px,color:#fff` (Material Green with white text) +- Conditional EXECUTE: `fill:#FFA726,stroke:#E65100,stroke-width:3px,stroke-dasharray: 5 5,color:#000` (Material Orange with black text) +- Conditional SKIP: `fill:#BDBDBD,stroke:#424242,stroke-width:2px,stroke-dasharray: 5 5,color:#000` (Material Gray with black text) +- Start/End: `fill:#CE93D8,stroke:#6A1B9A,stroke-width:3px,color:#000` (Material Purple with black text) +- Phase containers: Use lighter Material colors (INCEPTION: #BBDEFB, CONSTRUCTION: #C8E6C9, OPERATIONS: #FFF59D) ## Step 7: Create Execution Plan Document @@ -268,8 +270,7 @@ flowchart TD NFRA["NFR Requirements<br/><b>STATUS</b>"] NFRD["NFR Design<br/><b>STATUS</b>"] 
ID["Infrastructure Design<br/><b>STATUS</b>"] - CP["Code Planning<br/><b>EXECUTE</b>"] - CG["Code Generation<br/><b>EXECUTE</b>"] + CG["Code Generation<br/>(Planning + Generation)<br/><b>EXECUTE</b>"] BT["Build and Test<br/><b>EXECUTE</b>"] end @@ -280,8 +281,7 @@ flowchart TD Start --> WD WD --> RA RA --> WP - WP --> CP - CP --> CG + WP --> CG CG --> BT BT --> End(["Complete"]) @@ -296,7 +296,7 @@ flowchart TD ### 🔵 INCEPTION PHASE - [x] Workspace Detection (COMPLETED) - [x] Reverse Engineering (COMPLETED/SKIPPED) -- [x] Requirements Elaboration (COMPLETED) +- [x] Requirements Analysis (COMPLETED) - [x] User Stories (COMPLETED/SKIPPED) - [x] Execution Plan (IN PROGRESS) - [ ] Application Design - [EXECUTE/SKIP] @@ -315,10 +315,8 @@ flowchart TD - **Rationale**: [Why executing or skipping] - [ ] Infrastructure Design - [EXECUTE/SKIP] - **Rationale**: [Why executing or skipping] -- [ ] Code Planning - EXECUTE (ALWAYS) - - **Rationale**: Implementation approach needed - [ ] Code Generation - EXECUTE (ALWAYS) - - **Rationale**: Code implementation needed + - **Rationale**: Implementation planning and code generation needed - [ ] Build and Test - EXECUTE (ALWAYS) - **Rationale**: Build, test, and verification needed @@ -377,7 +375,6 @@ Update `aidlc-docs/aidlc-state.md`: - [ ] NFR Requirements - [EXECUTE/SKIP] - [ ] NFR Design - [EXECUTE/SKIP] - [ ] Infrastructure Design - [EXECUTE/SKIP] -- [ ] Code Planning - EXECUTE - [ ] Code Generation - EXECUTE - [ ] Build and Test - EXECUTE diff --git a/aidlc-rules/aws-aidlc-rule-details/inception/workspace-detection.md b/aidlc-rules/aws-aidlc-rule-details/inception/workspace-detection.md index 570b2e7e..b3bcc2d2 100644 --- a/aidlc-rules/aws-aidlc-rule-details/inception/workspace-detection.md +++ b/aidlc-rules/aws-aidlc-rule-details/inception/workspace-detection.md @@ -11,9 +11,10 @@ Check if `aidlc-docs/aidlc-state.md` exists: ## Step 2: Scan Workspace for Existing Code **Determine if workspace has existing code:** -- Scan 
workspace for source code files (.java, .py, .js, .ts, etc.) +- Scan workspace for source code files (.java, .py, .js, .ts, .jsx, .tsx, .kt, .kts, .scala, .groovy, .go, .rs, .rb, .php, .c, .h, .cpp, .hpp, .cc, .cs, .fs, etc.) - Check for build files (pom.xml, package.json, build.gradle, etc.) - Look for project structure indicators +- Identify workspace root directory (NOT aidlc-docs/) **Record findings:** ```markdown @@ -22,6 +23,7 @@ Check if `aidlc-docs/aidlc-state.md` exists: - **Programming Languages**: [List if found] - **Build System**: [Maven/Gradle/npm/etc. if found] - **Project Structure**: [Monolith/Microservices/Library/Empty] +- **Workspace Root**: [Absolute path] ``` ## Step 3: Determine Next Phase @@ -33,7 +35,11 @@ Check if `aidlc-docs/aidlc-state.md` exists: **IF workspace has existing code**: - Set flag: `brownfield = true` - Check for existing reverse engineering artifacts in `aidlc-docs/inception/reverse-engineering/` -- **IF reverse engineering artifacts exist**: Load them, skip to Requirements Analysis +- **IF reverse engineering artifacts exist**: + - Check if artifacts are stale (compare artifact timestamps against codebase's last significant modification) + - **IF artifacts are current**: Load them, skip to Requirements Analysis + - **IF artifacts are stale**: Next phase is Reverse Engineering (rerun to refresh artifacts) + - **IF user explicitly requests rerun**: Next phase is Reverse Engineering regardless of staleness - **IF no reverse engineering artifacts**: Next phase is Reverse Engineering ## Step 4: Create Initial State File @@ -51,6 +57,12 @@ Create `aidlc-docs/aidlc-state.md`: ## Workspace State - **Existing Code**: [Yes/No] - **Reverse Engineering Needed**: [Yes/No] +- **Workspace Root**: [Absolute path] + +## Code Location Rules +- **Application Code**: Workspace root (NEVER in aidlc-docs/) +- **Documentation**: aidlc-docs/ only +- **Structure patterns**: See code-generation.md Critical Rules ## Stage Progress [Will be populated 
as workflow progresses] diff --git a/aidlc-rules/core-workflow.md b/aidlc-rules/aws-aidlc-rules/core-workflow.md similarity index 68% rename from aidlc-rules/core-workflow.md rename to aidlc-rules/aws-aidlc-rules/core-workflow.md index ae4a09d6..720221bf 100644 --- a/aidlc-rules/core-workflow.md +++ b/aidlc-rules/aws-aidlc-rules/core-workflow.md @@ -1,11 +1,6 @@ # PRIORITY: This workflow OVERRIDES all other built-in workflows # When user requests software development, ALWAYS follow this workflow FIRST -# NOTE: This file contains a user-facing welcome message with workflow overview. -# Similar content exists in process-overview.md (technical reference) and README.md (documentation). -# This duplication is INTENTIONAL - each file serves a different purpose and audience. -# This file: User-facing welcome message with ASCII diagram for starting workflow - ## Adaptive Workflow Principle **The workflow adapts to the work, not the other way around.** @@ -16,7 +11,12 @@ The AI model intelligently assesses what stages are needed based on: 4. Risk and impact assessment ## MANDATORY: Rule Details Loading -**CRITICAL**: When performing any phase, you MUST read and use relevant content from rule detail files in `.amazonq/aws-aidlc-rule-details/` directory. +**CRITICAL**: When performing any phase, you MUST read and use relevant content from rule detail files. Check these paths in order and use the first one that exists: +- `.aidlc-rule-details/` (Cursor, Cline, Claude Code, GitHub Copilot) +- `.kiro/aws-aidlc-rule-details/` (Kiro IDE and CLI) +- `.amazonq/aws-aidlc-rule-details/` (Amazon Q Developer) + +All subsequent rule detail file references (e.g., `common/process-overview.md`, `inception/workspace-detection.md`) are relative to whichever rule details directory was resolved above. 
**Common Rules**: ALWAYS load common rules at workflow start: - Load `common/process-overview.md` for workflow overview @@ -25,9 +25,33 @@ The AI model intelligently assesses what stages are needed based on: - Load `common/question-format-guide.md` for question formatting rules - Reference these throughout the workflow execution +## MANDATORY: Extensions Loading (Context-Optimized) +**CRITICAL**: At workflow start, scan the `extensions/` directory recursively but load ONLY lightweight opt-in files — NOT full rule files. Full rule files are loaded on-demand after the user opts in. + +**Loading process**: +1. List all subdirectories under `extensions/` (e.g., `extensions/security/`, `extensions/compliance/`) +2. In each subdirectory, load ONLY `*.opt-in.md` files — these contain the extension's opt-in prompt. The corresponding rules file is derived by convention: strip the `.opt-in.md` suffix and append `.md` (e.g., `security-baseline.opt-in.md` → `security-baseline.md`) +3. Do NOT load full rule files (e.g., `security-baseline.md`) at this stage + +**Deferred Rule Loading**: +- During Requirements Analysis, opt-in prompts from the loaded `*.opt-in.md` files are presented to the user +- When the user opts IN for an extension, load the corresponding rules file (derived by naming convention) at that point +- When the user opts OUT, the full rules file is never loaded — saving context +- Extensions without a matching `*.opt-in.md` file are always enforced — load their rule files immediately at workflow start + +**Enforcement** (applies only to loaded/enabled extensions): +- Extension rules are hard constraints, not optional guidance +- At each stage, the model intelligently evaluates which extension rules are applicable based on the stage's purpose, the artifacts being produced, and the context of the work — enforce only those rules that are relevant +- Rules that are not applicable to the current stage should be marked as N/A in the compliance summary (this is not a 
blocking finding) +- Non-compliance with any applicable enabled extension rule is a **blocking finding** — do NOT present stage completion until resolved +- When presenting stage completion, include a summary of extension rule compliance (compliant/non-compliant/N/A per rule, with brief rationale for N/A determinations) + +**Conditional Enforcement**: Extensions may be conditionally enabled/disabled. See `inception/requirements-analysis.md` for the opt-in mechanism. Before enforcing any extension at ANY stage, check its `Enabled` status in `aidlc-docs/aidlc-state.md` under `## Extension Configuration`. Skip disabled extensions and log the skip in audit.md. Default to enforced if no configuration exists. + ## MANDATORY: Content Validation **CRITICAL**: Before creating ANY file, you MUST validate content according to `common/content-validation.md` rules: - Validate Mermaid diagram syntax +- Validate ASCII art diagrams (see `common/ascii-diagram-standards.md`) - Escape special characters properly - Provide text alternatives for complex visual content - Test content parsing compatibility @@ -41,110 +65,13 @@ The AI model intelligently assesses what stages are needed based on: - Answer validation and ambiguity resolution ## MANDATORY: Custom Welcome Message -**CRITICAL**: When starting ANY software development request, you MUST begin with this message: +**CRITICAL**: When starting ANY software development request, you MUST display the welcome message. -# 👋 Welcome to AI-DLC (AI-Driven Development Life Cycle)! 👋 - -I'll guide you through an adaptive software development workflow that intelligently tailors itself to your specific needs. - -## What is AI-DLC? - -AI-DLC is a structured yet flexible software development process that adapts to your project's needs. 
Think of it as having an experienced software architect who: -- **Analyzes your requirements** and asks clarifying questions when needed -- **Plans the optimal approach** based on complexity and risk -- **Skips unnecessary steps** for simple changes while providing comprehensive coverage for complex projects -- **Documents everything** so you have a complete record of decisions and rationale -- **Guides you through each phase** with clear checkpoints and approval gates - -## The Three-Phase Lifecycle - -``` - User Request - | - v - ╔═══════════════════════════════════════╗ - ║ INCEPTION PHASE ║ - ║ Planning & Application Design ║ - ╠═══════════════════════════════════════╣ - ║ • Workspace Detection (ALWAYS) ║ - ║ • Reverse Engineering (COND) ║ - ║ • Requirements Analysis (ALWAYS) ║ - ║ • User Stories (CONDITIONAL) ║ - ║ • Workflow Planning (ALWAYS) ║ - ║ • Application Design (CONDITIONAL) ║ - ║ • Units Generation (CONDITIONAL) ║ - ╚═══════════════════════════════════════╝ - | - v - ╔═══════════════════════════════════════╗ - ║ CONSTRUCTION PHASE ║ - ║ Design, Implementation & Test ║ - ╠═══════════════════════════════════════╣ - ║ • Per-Unit Loop (for each unit): ║ - ║ - Functional Design (COND) ║ - ║ - NFR Requirements Assess (COND) ║ - ║ - NFR Design (COND) ║ - ║ - Infrastructure Design (COND) ║ - ║ - Code Generation (ALWAYS) ║ - ║ • Build and Test (ALWAYS) ║ - ╚═══════════════════════════════════════╝ - | - v - ╔═══════════════════════════════════════╗ - ║ OPERATIONS PHASE ║ - ║ Placeholder for Future ║ - ╠═══════════════════════════════════════╣ - ║ • Operations (PLACEHOLDER) ║ - ╚═══════════════════════════════════════╝ - | - v - Complete -``` - -### Phase Breakdown: - -**INCEPTION PHASE** - *Planning & Application Design* -- **Purpose**: Determines WHAT to build and WHY -- **Activities**: Understanding requirements, analyzing existing code (if any), planning the approach -- **Output**: Clear requirements, execution plan, decisions on the number of units of 
work for parallel development -- **Your Role**: Answer questions, review plans, approve direction - -**CONSTRUCTION PHASE** - *Detailed Design, Implementation & Test* -- **Purpose**: Determines HOW to build it -- **Activities**: Detailed design (when needed), code generation, comprehensive testing -- **Output**: Working code, tests, build instructions -- **Your Role**: Review designs, approve implementation plans, validate results - -**OPERATIONS PHASE** - *Deployment & Monitoring (Future)* -- **Purpose**: How to DEPLOY and RUN it -- **Status**: Placeholder for future deployment and monitoring workflows -- **Current State**: Build and test activities handled in CONSTRUCTION phase - -## Key Principles: - -- ⚡ **Fully Adaptive**: Each stage independently evaluated based on your needs -- 🎯 **Efficient**: Simple changes execute only essential stages -- 📋 **Comprehensive**: Complex changes get full treatment with all safeguards -- 🔍 **Transparent**: You see and approve the execution plan before work begins -- 📝 **Documented**: Complete audit trail of all decisions and changes -- 🎛️ **User Control**: You can request stages be included or excluded - -## What Happens Next: - -1. **I'll analyze your workspace** to understand if this is a new or existing project -2. **I'll gather requirements** and ask clarifying questions if needed -3. **I'll create an execution plan** showing which stages I propose to run and why -4. **You'll review and approve** the plan (or request changes) -5. **We'll execute the plan** with checkpoints at each major stage -6. **You'll get working code** with complete documentation and tests - -The AI-DLC process adapts to: -- 📋 Your intent clarity and complexity -- 🔍 Existing codebase state -- 🎯 Scope and impact of changes -- ⚡ Risk and quality requirements - -Let's begin!" +**How to Display Welcome Message**: +1. Load the welcome message from `common/welcome-message.md` (in the resolved rule details directory) +2. 
Display the complete message to the user +3. This should only be done ONCE at the start of a new workflow +4. Do NOT load this file in subsequent interactions to save context space # Adaptive Software Development Workflow @@ -196,7 +123,7 @@ Let's begin!" 2. Load all steps from `inception/reverse-engineering.md` 3. Execute reverse engineering: - Analyze all packages and components - - Generate a busienss overview of the whole system covering the business transactions + - Generate a business overview of the whole system covering the business transactions - Generate architecture documentation - Generate code structure documentation - Generate API documentation @@ -430,7 +357,7 @@ Let's begin!" **Skip IF**: - No NFR requirements -- NFR Requirements Assessment was skipped +- NFR Requirements was skipped **Execution**: 1. **MANDATORY**: Log any user input during this stage in audit.md @@ -551,7 +478,7 @@ The Operations stage will eventually include: - **MANDATORY**: Log every approval prompt with timestamp before asking the user - **MANDATORY**: Record every user response with timestamp after receiving it - **CRITICAL**: ALWAYS append changes to EDIT audit.md file, NEVER use tools and commands that completely overwrite its contents -- **CRITICAL**: Using file writing tools and commands that overwrite contents of the entire audit.md and cause duplication +- **CRITICAL**: NEVER use file writing tools and commands that overwrite the entire contents of audit.md, as this causes duplication - Use ISO 8601 format for timestamps (YYYY-MM-DDTHH:MM:SSZ) - Include stage context for each entry @@ -581,67 +508,31 @@ The Operations stage will eventually include: ## Directory Structure ```text -aidlc-docs/ -├── inception/ # 🔵 INCEPTION PHASE artifacts -│ ├── plans/ -│ │ ├── workspace-detection.md -│ │ ├── workflow-planning.md -│ │ ├── story-generation-plan.md -│ │ └── unit-of-work-plan.md -│ ├── reverse-engineering/ # Brownfield only -│ │ ├── architecture.md -│ │ ├── code-structure.md 
-│ │ ├── api-documentation.md -│ │ ├── component-inventory.md -│ │ ├── technology-stack.md -│ │ ├── dependencies.md -│ │ ├── code-quality-assessment.md -│ │ └── reverse-engineering-timestamp.md -│ ├── requirements/ -│ │ ├── requirements.md -│ │ └── requirement-verification-questions.md -│ ├── user-stories/ -│ │ ├── stories.md -│ │ └── personas.md -│ └── application-design/ -│ ├── components.md -│ ├── component-methods.md -│ ├── services.md -│ ├── component-dependency.md -│ ├── unit-of-work.md -│ ├── unit-of-work-dependency.md -│ └── unit-of-work-story-map.md -├── construction/ # 🟢 CONSTRUCTION PHASE artifacts -│ ├── plans/ -│ │ ├── {unit-name}-functional-design-plan.md -│ │ ├── {unit-name}-nfr-requirements-plan.md -│ │ ├── {unit-name}-nfr-design-plan.md -│ │ ├── {unit-name}-infrastructure-design-plan.md -│ │ └── {unit-name}-code-generation-plan.md -│ ├── {unit-name}/ -│ │ ├── functional-design/ -│ │ │ ├── business-logic-model.md -│ │ │ ├── business-rules.md -│ │ │ └── domain-entities.md -│ │ ├── nfr-requirements/ -│ │ │ ├── nfr-requirements.md -│ │ │ └── tech-stack-decisions.md -│ │ ├── nfr-design/ -│ │ │ ├── nfr-design-patterns.md -│ │ │ └── logical-components.md -│ │ ├── infrastructure-design/ -│ │ │ ├── infrastructure-design.md -│ │ │ └── deployment-architecture.md -│ │ └── code/ -│ │ └── [generated code files] -│ └── build-and-test/ -│ ├── build-instructions.md -│ ├── unit-test-instructions.md -│ ├── integration-test-instructions.md -│ ├── performance-test-instructions.md -│ └── build-and-test-summary.md -├── operations/ # 🟡 OPERATIONS PHASE artifacts (placeholder) -│ └── [Future: deployment and monitoring artifacts] -├── aidlc-state.md # Dynamic state tracking -└── audit.md # Complete audit trail +<WORKSPACE-ROOT>/ # ⚠️ APPLICATION CODE HERE +├── [project-specific structure] # Varies by project (see code-generation.md) +│ +├── aidlc-docs/ # 📄 DOCUMENTATION ONLY +│ ├── inception/ # 🔵 INCEPTION PHASE +│ │ ├── plans/ +│ │ ├── reverse-engineering/ # Brownfield 
only +│ │ ├── requirements/ +│ │ ├── user-stories/ +│ │ └── application-design/ +│ ├── construction/ # 🟢 CONSTRUCTION PHASE +│ │ ├── plans/ +│ │ ├── {unit-name}/ +│ │ │ ├── functional-design/ +│ │ │ ├── nfr-requirements/ +│ │ │ ├── nfr-design/ +│ │ │ ├── infrastructure-design/ +│ │ │ └── code/ # Markdown summaries only +│ │ └── build-and-test/ +│ ├── operations/ # 🟡 OPERATIONS PHASE (placeholder) +│ ├── aidlc-state.md +│ └── audit.md ``` + +**CRITICAL RULE**: +- Application code: Workspace root (NEVER in aidlc-docs/) +- Documentation: aidlc-docs/ only +- Project structure: See code-generation.md for patterns by project type diff --git a/assets/images/cline-ide-aidlc-rules-loaded.png b/assets/images/cline-ide-aidlc-rules-loaded.png new file mode 100644 index 00000000..854d08ee Binary files /dev/null and b/assets/images/cline-ide-aidlc-rules-loaded.png differ diff --git a/assets/images/cursor-ide-aidlc-rules-loaded.png b/assets/images/cursor-ide-aidlc-rules-loaded.png new file mode 100644 index 00000000..bd8439a7 Binary files /dev/null and b/assets/images/cursor-ide-aidlc-rules-loaded.png differ diff --git a/assets/images/kiro-cli-aidlc-rules-loaded.png b/assets/images/kiro-cli-aidlc-rules-loaded.png index 75b5a225..63a4cdd4 100644 Binary files a/assets/images/kiro-cli-aidlc-rules-loaded.png and b/assets/images/kiro-cli-aidlc-rules-loaded.png differ diff --git a/assets/images/kiro-ide-aidlc-rules-loaded.png b/assets/images/kiro-ide-aidlc-rules-loaded.png new file mode 100644 index 00000000..b51ce16b Binary files /dev/null and b/assets/images/kiro-ide-aidlc-rules-loaded.png differ diff --git a/assets/images/kiro-sdd-nudge.png b/assets/images/kiro-sdd-nudge.png new file mode 100644 index 00000000..7daac4d5 Binary files /dev/null and b/assets/images/kiro-sdd-nudge.png differ diff --git a/assets/images/q-ide-aidlc-rules-loaded.png b/assets/images/q-ide-aidlc-rules-loaded.png index 4571f2b8..8d0b51fa 100644 Binary files a/assets/images/q-ide-aidlc-rules-loaded.png and 
b/assets/images/q-ide-aidlc-rules-loaded.png differ
diff --git a/cliff.toml b/cliff.toml
new file mode 100644
index 00000000..fe4942d0
--- /dev/null
+++ b/cliff.toml
@@ -0,0 +1,61 @@
+# git-cliff configuration
+# https://git-cliff.org
+
+[changelog]
+header = """
+# Changelog
+
+All notable changes to this project will be documented in this file.
+"""
+
+body = """
+{% if version %}\
+## [{{ version | trim_start_matches(pat="v") }}] - {{ timestamp | date(format="%Y-%m-%d") }}
+{% else %}\
+## [Unreleased]
+{% endif %}\
+{% for group, commits in commits | group_by(attribute="group") %}

+### {{ group | striptags | trim | upper_first }}

+{% for commit in commits %}\
+- {{ commit.message | split(pat="\n") | first | trim }}
+{% endfor %}\
+{% endfor %}
+"""
+
+footer = ""
+# trim = false so each body's leading \n creates a blank line separator
+# between concatenated version sections (required for MD022/MD032)
+trim = false
+postprocessors = [
+    # Collapse runs of 3+ newlines (i.e. two or more consecutive blank
+    # lines) into a single blank line (MD012/no-multiple-blanks)
+    { pattern = "\n{3,}", replace = "\n\n" },
+]
+
+[git]
+conventional_commits = true
+filter_unconventional = true
+split_commits = false
+
+commit_parsers = [
+    { message = "^docs: update changelog", skip = true },
+    { message = "^feat", group = "Features" },
+    { message = "^fix", group = "Bug Fixes" },
+    { message = "^docs", group = "Documentation" },
+    { message = "^perf", group = "Performance" },
+    { message = "^refactor", group = "Refactoring" },
+    { message = "^style", group = "Style" },
+    { message = "^test", group = "Tests" },
+    { message = "^build", group = "CI/CD" },
+    { message = "^ci", group = "CI/CD" },
+    { message = "^chore", group = "Miscellaneous" },
+]
+
+filter_commits = false
+tag_pattern = "v[0-9].*"
+sort_commits = "oldest"
+
+[bump]
+features_always_bump_minor = true
+breaking_always_bump_major = true
diff --git a/docs/ADMINISTRATIVE_GUIDE.md b/docs/ADMINISTRATIVE_GUIDE.md
new file mode 100644
index 00000000..a03f4c0a
---
/dev/null +++ b/docs/ADMINISTRATIVE_GUIDE.md @@ -0,0 +1,677 @@ +# Administrative Guide + +This guide documents the CI/CD infrastructure, GitHub Workflows, protected environments, secrets, variables, permissions, and release process for the `awslabs/aidlc-workflows` repository. + +**Audience:** Repository administrators, maintainers, and AI coding agents working on this repository. + +**Related documentation:** + +- [Developer's Guide](DEVELOPERS_GUIDE.md) — Running builds locally (CodeBuild + `act`) +- [Contributing Guidelines](../CONTRIBUTING.md) — Contribution process and conventions +- [README](../README.md) — User-facing setup and usage + +--- + +## Table of Contents + +- [Repository Overview](#repository-overview) +- [CI/CD Architecture](#cicd-architecture) +- [Workflow Reference](#workflow-reference) + - [Release PR Workflow](#release-pr-workflow-release-pryml) + - [Tag Release Workflow](#tag-release-workflow-tag-on-mergeyml) + - [CodeBuild Workflow](#codebuild-workflow-codebuildyml) + - [Release Workflow](#release-workflow-releaseyml) + - [Pull Request Validation Workflow](#pull-request-validation-workflow-pull-request-lintyml) + - [Security Scanners Workflow](#security-scanners-workflow-security-scannersyml) +- [Protected Environments](#protected-environments) +- [Secrets and Variables](#secrets-and-variables) +- [Permissions Model](#permissions-model) +- [Security Posture](#security-posture) + - [Security Finding Requirements](#security-finding-requirements) +- [Code Ownership](#code-ownership) +- [Release Process](#release-process) +- [Changelog Configuration](#changelog-configuration) +- [Updating Pinned Versions](#updating-pinned-versions) + +--- + +## Repository Overview + +This repository publishes the **AI-DLC (AI-Driven Development Life Cycle)** methodology as a set of markdown rule files under `aidlc-rules/`. 
The CI/CD infrastructure handles: + +- **Continuous integration** via AWS CodeBuild (evaluation and reporting) +- **Release distribution** via GitHub Releases (zipped rule files) +- **Changelog generation** via git-cliff (changelog-first: updated before release, included in the tagged commit) + +```text +awslabs/aidlc-workflows/ +├── .github/ +│ ├── CODEOWNERS +│ ├── ISSUE_TEMPLATE/ # Bug, feature, RFC, docs templates +│ ├── labeler.yml # Auto-label rules (path → label mapping) +│ ├── pull_request_template.md # PR template with contributor statement +│ └── workflows/ +│ ├── codebuild.yml # CI via AWS CodeBuild +│ ├── pull-request-lint.yml # PR validation (title, labels, merge gates) +│ ├── release.yml # GitHub Release on tag push +│ ├── release-pr.yml # Changelog PR before release +│ ├── security-scanners.yml # Security scanning suite (6 scanners) +│ └── tag-on-merge.yml # Auto-tag on release PR merge +├── .claude/ +│ └── settings.json # Shared Claude Code project settings +├── aidlc-rules/ # The distributable product +│ ├── aws-aidlc-rules/ # Core workflow rules +│ └── aws-aidlc-rule-details/ # Detailed rules by phase +├── cliff.toml # git-cliff changelog configuration +├── docs/ +│ ├── ADMINISTRATIVE_GUIDE.md # This file +│ └── DEVELOPERS_GUIDE.md # Local build instructions +└── scripts/ + └── aidlc-evaluator/ # Evaluation framework (in development) +``` + +--- + +## CI/CD Architecture + +Six workflows form two distinct pipelines, a security scanning suite, plus a pull request validation gate: + +### Pipeline 1: Release (changelog-first) + +```mermaid +flowchart TD + A["workflow_dispatch\n(optional version input)"] --> B["release-pr.yml"] + B --> C["Determine version\n(input or git-cliff)"] + C --> D["Generate CHANGELOG.md\nwith git-cliff"] + D --> E["Open PR: release/vX.Y.Z\nwith updated CHANGELOG"] + + E --> F["Human reviews\nand merges PR"] + + F --> G["tag-on-merge.yml"] + G --> H["Extract version from\nbranch name"] + H --> I["Create tag vX.Y.Z\non merge 
commit SHA"] + + I --> J["Dispatch release.yml"] + J --> K["release.yml\ncreates draft release\nwith rules zip"] + K --> L["Dispatch codebuild.yml\nafter draft exists"] + L --> M{{"Manual approval\n(codebuild environment)"}} + M --> N["Run AWS CodeBuild\nupload artifacts to draft"] + + K --> O["Human reviews\nand publishes draft"] + N --> O + + P["workflow_dispatch\n(select tag in UI)"] -.->|"manual backup\ntrigger"| M +``` + +The release flow is **changelog-first**: the CHANGELOG is updated *before* the tag is created, so the tagged commit always contains its own changelog entry. The flow has three human touchpoints: + +1. **Merge the release PR** — reviews the changelog, triggers automatic tagging +2. **Approve the CodeBuild environment** — gates access to AWS credentials for the build +3. **Publish the draft release** — reviews artifacts, makes the release public + +`tag-on-merge.yml` explicitly dispatches `release.yml` and `codebuild.yml` via `gh workflow run --ref vX.Y.Z` after creating the tag. The dispatches are **sequential**: `release.yml` runs first and is watched to completion so that the draft release exists before `codebuild.yml` uploads artifacts. This is necessary because tags created with `GITHUB_TOKEN` do not trigger `on: push: tags` events — but `workflow_dispatch` is exempt from this limitation. Both workflows also retain `push: tags: v*` as a fallback for manual tag pushes. The `codebuild.yml` workflow requires **manual approval** via the `codebuild` protected environment before the build proceeds. 
The upload step handles all release states resiliently: + +- **Draft exists** (normal case) — `release.yml` finishes in ~30s creating the draft; CodeBuild takes minutes, so the draft is ready when artifacts are uploaded +- **No release yet** (codebuild finished first) — creates a draft with build artifacts; `release.yml` will update it later +- **Already published** (re-run) — attempts to replace artifacts, warns gracefully if immutable + +**Backup strategy:** If the tag-triggered CodeBuild run fails or is blocked, an admin can manually dispatch the workflow via `workflow_dispatch` and select the `v*` tag in the GitHub UI branch/tag selector. Since `github.ref` resolves to the selected tag, the upload step activates automatically. + +### Pipeline 2: Continuous Integration + +```mermaid +flowchart LR + A["git push main"] --> B{{"Manual approval\n(codebuild environment)"}} + C["workflow_dispatch\n(no tag input)"] --> B + D["pull_request\n(aidlc-rules/** changed)"] --> E{"rules\nlabel?"} + E -->|yes| F["label-cleanup\n(remove reminder comment)"] + F --> B + E -->|no| I["label-reminder\n(warning + PR comment)"] + B --> G["Run AWS CodeBuild"] + G --> H["Upload workflow artifacts"] +``` + +### Pipeline 3: Security Scanning + +```mermaid +flowchart TD + A["push main"] --> G["security-scanners.yml"] + B["pull_request to main"] --> G + C["schedule (daily 03:47 UTC)"] --> G + D["workflow_dispatch"] --> G + + G --> H["gitleaks\n(secret detection)"] + G --> I["semgrep\n(multi-language SAST)"] + G --> J["grype\n(dependency SCA)"] + G --> K["bandit\n(Python SAST)"] + G --> L["checkov\n(IaC scanning)"] + G --> M["clamav\n(malware scanning)"] + + H --> N["Upload SARIF\nto Code Scanning"] + I --> N + J --> N + K --> N + L --> N + M --> O["Upload text log\n(artifact only)"] +``` + +All six scanner jobs run in parallel. Each scanner (except ClamAV) produces a SARIF report uploaded to both GitHub Code Scanning (Security tab) and as a downloadable workflow artifact. 
All scanners use a **deferred-failure pattern**: the scan runs to completion, results are always uploaded, and only then does the job fail if findings exceed the configured threshold. See the [Security Scanners Workflow](#security-scanners-workflow-security-scannersyml) reference for details. + +### Pipeline 4: Pull Request Validation + +```mermaid +flowchart TD + A["pull_request_target\n(to main)"] --> B["get-pr-info"] + C["merge_group\n(checks_requested)"] --> B + + B --> D["check-merge-status\n(HALT_MERGES + open release PRs)"] + B --> E["fail-by-label\n(do-not-merge label)"] + A --> F["validate\n(conventional commit title)"] + A --> G["contributorStatement\n(acknowledgment in PR body)"] + A --> H["auto-label\n(actions/labeler)"] +``` + +`pull-request-lint.yml` runs on every PR targeting `main` and on merge queue checks. It enforces four gates (conventional commit PR titles, the contributor statement from the PR template, a configurable merge-halt mechanism, and a do-not-merge label check) and automatically applies labels based on changed file paths. The workflow uses `pull_request_target` (not `pull_request`) so it runs in the context of the base branch — this is safe because it never checks out PR code and the `auto-label` job uses `actions/labeler` which only reads file paths from the API. + +--- + +## Workflow Reference + +### Release PR Workflow (`release-pr.yml`) + +| Property | Value | +| --------------- | ------------------------------------------------- | +| **File** | `.github/workflows/release-pr.yml` | +| **Trigger** | `workflow_dispatch` with optional `version` input | +| **Environment** | *(none)* | +| **Runner** | `ubuntu-latest` | + +**Purpose:** Generates an updated `CHANGELOG.md` from conventional commits using git-cliff, writes the release version to `aidlc-rules/VERSION`, and opens a PR on a `release/vX.Y.Z` branch. This is the first step in the changelog-first release flow. 
The `aidlc-rules/VERSION` update ensures the PR touches `aidlc-rules/`, which triggers the `codebuild.yml` path filter and the `rules` auto-label. + +**Job: `release-pr` ("Create Release PR")** + +| Step | Name | Action | +| ---- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | Checkout code | `actions/checkout` with `fetch-depth: 0` (full history for git-cliff) | +| 2 | Install git-cliff | `orhun/git-cliff-action` to make the CLI available | +| 3 | Determine version | Use `inputs.version` (with semver validation) or `git-cliff --bumped-version` for auto-detection; falls back to patch bump from latest tag | +| 4 | Check tag does not exist | Fail early if the target tag already exists | +| 5 | Generate changelog | `orhun/git-cliff-action` with `--tag vX.Y.Z` to generate `CHANGELOG.md` | +| 6 | Create release PR | Write version to `aidlc-rules/VERSION`, check branch doesn't already exist, commit, push `release/vX.Y.Z` branch, open PR (with labels `release` and `rules` if they exist in the repo) | + +**Version detection:** If a version is specified, it must be valid semver (`MAJOR.MINOR.PATCH`); both `v0.2.0` and `0.2.0` are accepted. If no version is specified, `git-cliff --bumped-version` determines the next version from conventional commit prefixes. The `[bump]` config in `cliff.toml` controls the rules (e.g., `feat` → minor bump, breaking change → major bump). If no conventional commits are found, the workflow falls back to a patch bump from the latest tag. If no tags exist at all, it exits cleanly with a warning (no PR is created). 
+ +**External actions (SHA-pinned):** + +| Action | Version | SHA | +| ------------------------ | ------- | ------------------------------------------ | +| `actions/checkout` | v6.0.1 | `8e8c483db84b4bee98b60c0593521ed34d9990e8` | +| `orhun/git-cliff-action` | v4.7.0 | `e16f179f0be49ecdfe63753837f20b9531642772` | + +--- + +### Tag Release Workflow (`tag-on-merge.yml`) + +| Property | Value | +| --------------- | ----------------------------------------------------- | +| **File** | `.github/workflows/tag-on-merge.yml` | +| **Trigger** | `pull_request: types: [closed]` | +| **Condition** | PR was merged AND branch name starts with `release/v` | +| **Environment** | *(none)* | +| **Runner** | `ubuntu-latest` | + +**Purpose:** Automatically creates a version tag on the merge commit when a release PR is merged, then dispatches `release.yml` (waits for completion) followed by `codebuild.yml`. + +**Job: `tag` ("Create Release Tag")** + +| Step | Name | Action | +| ---- | ---------------------------------- | ------------------------------------------------------------------------------------------- | +| 1 | Create tag | Extract version from branch name, verify tag doesn't exist, create via GitHub API | +| 2 | Dispatch release workflow and wait | `gh workflow run release.yml --ref $TAG --repo $REPO`, then `gh run watch` until completion | +| 3 | Dispatch codebuild workflow | `gh workflow run codebuild.yml --ref $TAG --repo $REPO` (runs after draft release exists) | + +**Tag creation:** Uses `gh api repos/.../git/refs` to create a lightweight tag. + +**Workflow dispatch:** Tags created with `GITHUB_TOKEN` do not trigger `on: push: tags` events in other workflows. To work around this, `tag-on-merge.yml` explicitly dispatches `release.yml` and `codebuild.yml` via `gh workflow run --ref $TAG`. The `workflow_dispatch` event is exempt from this `GITHUB_TOKEN` limitation. 
Since `--ref` is set to the tag, both dispatched workflows see `github.ref = refs/tags/vX.Y.Z` — identical to a real tag push. The dispatches are **sequential**: `release.yml` runs first (watched via `gh run watch`) to ensure the draft release exists before `codebuild.yml` attempts to upload artifacts. If the release run cannot be found or fails, `codebuild.yml` is dispatched anyway as a fallback. + +**Security:** The branch name `release/vX.Y.Z` is passed through an environment variable (not directly interpolated) to prevent command injection. The job-level `if` condition uses `github.event.pull_request.merged == true` to ensure only merged PRs trigger tagging. + +--- + +### CodeBuild Workflow (`codebuild.yml`) + +| Property | Value | +| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **File** | `.github/workflows/codebuild.yml` | +| **Triggers** | `push` to `main`, `push` tags `v*`, `pull_request` to `main` (label-gated, path-filtered), `workflow_dispatch` (dispatched by `tag-on-merge.yml` or manual — select a tag in the UI to trigger a release build) | +| **Environment** | `codebuild` (protected, manual approval) | +| **Runner** | `ubuntu-latest` | +| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress | + +**Purpose:** Runs an AWS CodeBuild project, downloads primary and secondary artifacts from S3, caches them in GitHub Actions cache, uploads them as workflow artifacts, and (when triggered from a `v*` tag) attaches them to the GitHub Release. + +**PR label gate:** For `pull_request` events, the workflow only fires when files under `aidlc-rules/**` are changed (via `paths` filter) and the `build` job only runs when the `rules` label is present on the PR (via `contains(github.event.pull_request.labels.*.name, 'rules')`). 
The `rules` label is applied automatically by the `auto-label` job in `pull-request-lint.yml` (see [Pull Request Validation Workflow](#pull-request-validation-workflow-pull-request-lintyml)). The trigger includes `types: [opened, synchronize, reopened, labeled]` so that subsequent pushes to a labeled PR re-trigger the build automatically. `push`, `workflow_dispatch`, and tag events bypass the label check entirely. + +**Job: `label-reminder`** (PR only, no `rules` label) + +| Step | Name | Action | +| ---- | -------------------------------- | -------------------------------------------------------------------------------------------- | +| 1 | Warn about missing rules label | Emits a `::warning::` annotation visible in the Actions summary | +| 2 | Comment on PR | Posts a one-time PR comment (idempotent — skips if the reminder comment already exists) | + +This job runs only for `pull_request` events where `aidlc-rules/**` changed but the `rules` label is absent. It alerts maintainers and reviewers that the evaluation pipeline was not triggered. The comment is posted once per PR using an HTML comment marker (`<!-- rules-label-reminder -->`) to avoid duplicates. In normal operation, the `auto-label` job in `pull-request-lint.yml` applies the `rules` label automatically, so this job serves as a fallback safety net. + +**Job: `label-cleanup`** (PR only, `rules` label present) + +| Step | Name | Action | +| ---- | ----------------------------- | ---------------------------------------------------------------------------------------- | +| 1 | Remove label reminder comment | Finds and deletes the `label-reminder` PR comment (no-op if it doesn't exist) | + +This job runs when the `rules` label is applied, immediately removing the reminder comment without waiting for the `codebuild` environment approval gate. 
+ +**Job: `build`** + +| Step | Name | Condition | Action | +| ---- | ---------------------------- | ------------------------- | ------------------------------------------------------------- | +| 1 | List caches | *(always)* | `gh cache list` for existing project caches | +| 2 | Check cache | *(always)* | `actions/cache/restore` with `lookup-only: true` | +| 3 | Configure AWS credentials | cache miss | `aws-actions/configure-aws-credentials` (OIDC) | +| 4 | Run CodeBuild | cache miss | `aws-actions/aws-codebuild-run-build` with inline buildspec | +| 5 | Build ID | cache miss (always) | Echo CodeBuild build ID | +| 6 | Download CodeBuild artifacts | cache miss | Download primary + secondary artifacts from S3 | +| 7 | List CodeBuild artifacts | cache miss | List and inspect downloaded zip files | +| 8 | Clean old report caches | cache miss | Delete 3 oldest matching caches for branch | +| 9 | Save report to cache | cache miss | `actions/cache/save` with key `{project}-{branch}-{sha}` | +| 10 | Upload primary artifact | `!env.ACT` | `actions/upload-artifact` for `{project}.zip` | +| 11 | Upload evaluation artifact | `!env.ACT` | `actions/upload-artifact` for `evaluation.zip` | +| 12 | Upload trend artifact | `!env.ACT` | `actions/upload-artifact` for `trend.zip` | +| 13 | Upload artifacts to release | triggered from a `v*` tag | Attach build artifacts to GitHub Release (draft or published) | + +**Caching strategy:** The cache key `{project}-{branch}-{sha}` ensures that the same commit on the same branch is never built twice. On cache hit, steps 3–9 are skipped entirely. + +**Inline buildspec:** The workflow embeds a full `buildspec-override` rather than referencing an external file. 
The buildspec: + +- Installs `gh` CLI (via dnf) and `uv` (Python package manager) +- Determines build context: release (tagged), pre-release (default branch), or pre-merge (feature branch) +- Creates placeholder evaluation and trend report files under `.codebuild/` +- Outputs a primary artifact (all files under `.codebuild/`) and two secondary artifacts (`evaluation`, `trend`) + +**Artifact upload compatibility:** Upload steps are gated by `!env.ACT` because `actions/upload-artifact` v6 is incompatible with the [`act`](https://github.com/nektos/act) local runner. + +**External actions (all SHA-pinned):** + +| Action | Version | SHA | +| --------------------------------------- | ------- | ------------------------------------------ | +| `actions/cache/restore` | v5.0.3 | `cdf6c1fa76f9f475f3d7449005a359c84ca0f306` | +| `aws-actions/configure-aws-credentials` | v6.0.0 | `8df5847569e6427dd6c4fb1cf565c83acfa8afa7` | +| `aws-actions/aws-codebuild-run-build` | v1.0.18 | `d8279f349f3b1b84e834c30e47c20dcb8888b7e5` | +| `actions/cache/save` | v5.0.3 | `cdf6c1fa76f9f475f3d7449005a359c84ca0f306` | +| `actions/upload-artifact` | v6.0.0 | `b7c566a772e6b6bfb58ed0dc250532a479d7789f` | + +--- + +### Release Workflow (`release.yml`) + +| Property | Value | +| --------------- | --------------------------------------------------------------------------------------------------------------------- | +| **File** | `.github/workflows/release.yml` | +| **Triggers** | `workflow_dispatch` (dispatched by `tag-on-merge.yml`), `push` on tags matching `v*` (fallback for manual tag pushes) | +| **Environment** | *(none)* | +| **Runner** | `ubuntu-latest` | + +**Purpose:** Creates a **draft** GitHub Release with a zip of `aidlc-rules/` when dispatched or when a version tag is pushed. The release is kept as a draft so that CodeBuild artifacts can be attached and reviewed before publishing. 
+ +**Job: `release` ("Create Release")** + +| Step | Name | Condition | Action | +| ---- | ----------------------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | Checkout code | *(always)* | `actions/checkout` with `fetch-depth: 0` | +| 2 | Extract version | *(always)* | Guard: if `GITHUB_REF` is not a `v*` tag, emit `::warning::` and skip remaining steps. Otherwise parse into `version` (no `v`) and `tag` (with `v`) | +| 3 | Create release artifact | ref is a `v*` tag | `zip -r ai-dlc-rules-v{VERSION}.zip aidlc-rules/` | +| 4 | Create GitHub Release | ref is a `v*` tag | `softprops/action-gh-release` with `draft: true` and zip attached | + +**Graceful skip:** If dispatched from a branch instead of a tag (e.g., someone manually runs the workflow from `main`), the job completes successfully with a warning annotation rather than failing. This prevents confusing red X failures in the Actions UI. 
+ +**Release naming:** `AI-DLC Workflow v{VERSION}` (e.g., `AI-DLC Workflow v0.1.6`) + +**External actions (SHA-pinned):** + +| Action | Version | SHA | +| ----------------------------- | ------- | ------------------------------------------ | +| `actions/checkout` | v6.0.1 | `8e8c483db84b4bee98b60c0593521ed34d9990e8` | +| `softprops/action-gh-release` | v2.5.0 | `a06a81a03ee405af7f2048a818ed3f03bbf83c7b` | + +--- + +### Pull Request Validation Workflow (`pull-request-lint.yml`) + +| Property | Value | +| --------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| **File** | `.github/workflows/pull-request-lint.yml` | +| **Triggers** | `pull_request_target` to `main` (edited, labeled, opened, ready_for_review, reopened, synchronize, unlabeled); `merge_group` (checks_requested) | +| **Environment** | *(none)* | +| **Runner** | `ubuntu-latest` | +| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress | + +**Purpose:** Validates pull requests before merge. Enforces conventional commit PR titles, the contributor acknowledgment statement, merge-halt controls, and a do-not-merge label gate. Also runs as a merge queue check. + +**Why `pull_request_target`:** This trigger runs the workflow in the context of the base branch (not the PR head). This is safe here because no step checks out or executes PR code — the workflow only inspects PR metadata (title, labels, body). Using `pull_request_target` ensures the workflow has access to repository secrets and labels even for PRs from forks. + +**Job: `get-pr-info`** + +| Step | Name | Action | +| ---- | ----------- | -------------------------------------------------------------------------------------------------------- | +| 1 | Get PR info | Extract PR number and labels from event context (`pull_request_target`) or by API lookup (`merge_group`) | + +Outputs `pr_number` and `pr_labels` for downstream jobs. 
For `merge_group` events, the PR number is extracted from the ref name and labels are fetched via the GitHub API. For `pull_request_target` events, values come directly from the event payload. + +**Job: `check-merge-status` ("Check Merge Status")** + +Depends on `get-pr-info`. Runs `if: always()` so it executes even if the upstream job fails. + +| Check | Behavior | +| -------------------- | ----------------------------------------------------------------------------- | +| Open release PRs | Blocks merge if other `release/` PRs are open (prevents concurrent releases) | +| `HALT_MERGES = 0` | All merges allowed (default) | +| `HALT_MERGES = -N` | All merges blocked | +| `HALT_MERGES = N` | Only PR #N is allowed to merge | + +**Job: `fail-by-label` ("Fail by Label")** + +Depends on `get-pr-info`. Runs `if: always()`. Fails the check if the PR has the `do-not-merge` label (configurable via `DO_NOT_MERGE_LABEL` variable). + +**Job: `validate` ("Validate PR title")** + +Only runs for `pull_request` and `pull_request_target` events (not `merge_group`). Uses `amannn/action-semantic-pull-request` to enforce conventional commit format on PR titles. + +Allowed types: `fix`, `feat`, `build`, `chore`, `ci`, `docs`, `style`, `refactor`, `perf`, `test`. Scopes are optional (`requireScope: false`). + +**Job: `auto-label` ("Auto-label")** + +Only runs for `pull_request_target` events. Uses [`actions/labeler`](https://github.com/actions/labeler) v6.0.1 to automatically apply and remove labels based on changed file paths. 
Label rules are defined in `.github/labeler.yml`: + +| Label | Path Pattern | Description | +| --------------- | ----------------------------------------------- | ------------------------------------------------- | +| `rules` | `aidlc-rules/**` | Triggers CodeBuild evaluation pipeline | +| `documentation` | `**/*.md` (excluding `aidlc-rules/**`) | Non-rules markdown file changes | +| `github` | `.github/**` | Workflow, template, or config changes | + +With `sync-labels: true`, labels are automatically removed when the matching files are no longer in the PR diff (e.g., after a rebase drops those changes). New label rules can be added by editing `.github/labeler.yml` — no workflow changes required. + +**Job: `contributorStatement` ("Require Contributor Statement")** + +Only runs for `pull_request` and `pull_request_target` events. Skipped for bot accounts (`dependabot[bot]`, `github-actions[bot]`, `github-actions`, `aidlc-workflows`). Verifies the PR body contains the contributor acknowledgment text from `.github/pull_request_template.md`: + +> By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of the project license. 
+ +**External actions (SHA-pinned):** + +| Action | Version | SHA | +| --------------------------------------- | ------- | ------------------------------------------ | +| `actions/labeler` | v6.0.1 | `634933edcd8ababfe52f92936142cc22ac488b1b` | +| `amannn/action-semantic-pull-request` | v6.1.1 | `48f256284bd46cdaab1048c3721360e808335d50` | +| `actions/github-script` | v8.0.0 | `ed597411d8f924073f98dfc5c65a23a2325f34cd` | + +--- + +### Security Scanners Workflow (`security-scanners.yml`) + +| Property | Value | +| --------------- | ---------------------------------------------------------------------------------------------- | +| **File** | `.github/workflows/security-scanners.yml` | +| **Triggers** | `push` to `main`, `pull_request` to `main`, `schedule` (daily 03:47 UTC), `workflow_dispatch` | +| **Environment** | *(none)* | +| **Runner** | `ubuntu-latest` | +| **Concurrency** | Groups by `{workflow}-{ref}`, cancels in-progress | + +**Purpose:** Runs six independent security scanners in parallel to detect secrets, vulnerabilities, misconfigurations, and malware. All HIGH and CRITICAL findings must be remediated or have a documented risk acceptance before merge (see [Security Finding Requirements](#security-finding-requirements)). + +**Permissions model:** Deny-all at workflow level, then each job grants only `actions: read`, `contents: read`, and `security-events: write`. 
+ +**Jobs:** + +| Job | Scanner | What it detects | Fails on | +| ---------- | -------- | --------------------------------------------------- | ------------------------------------------------------------ | +| `gitleaks` | Gitleaks | Secrets in git history | Any secret not in `.gitleaks-baseline.json` | +| `semgrep` | Semgrep | Security anti-patterns (all languages) | Any finding (PRs: new findings only via `--baseline-commit`) | +| `grype` | Grype | Known CVEs in dependencies | High or critical CVEs (`fail-on-severity: high`) | +| `bandit` | Bandit | Python security issues | Any finding with high confidence | +| `checkov` | Checkov | IaC misconfigurations (GitHub Actions, Dockerfiles) | Any check failure (minus skipped checks) | +| `clamav` | ClamAV | Malware and viruses | Any detection | + +**Deferred-failure pattern:** All scanners capture the exit code without failing the step (`set +e`), upload the SARIF report as an artifact and to GitHub Code Scanning, then fail the job if findings were detected. This ensures results are always preserved regardless of outcome. ClamAV follows the same pattern but uploads a text log instead of SARIF. + +**Configuration files:** + +| File | Purpose | +| ------------------------- | ---------------------------------------------- | +| `.bandit` | Bandit targets, excludes, confidence level | +| `.semgrepignore` | Semgrep path exclusions | +| `.gitleaks.toml` | Gitleaks ruleset extension and path allowlist | +| `.gitleaks-baseline.json` | Pre-existing known findings (test credentials) | +| `.grype.yaml` | Grype severity threshold and CVE ignore list | +| `.checkov.yaml` | Checkov frameworks and skipped checks | + +**Version pinning:** All scanner tool versions and GitHub Actions are pinned to specific versions or commit SHAs in the workflow file to ensure reproducible builds and prevent supply-chain attacks. These pins should be reviewed and updated periodically (at least quarterly). 
See [Updating Pinned Versions](#updating-pinned-versions) for the update procedure. + +For detailed remediation and suppression instructions, see [Developer's Guide — Security Scanners](DEVELOPERS_GUIDE.md#security-scanners). + +--- + +## Protected Environments + +| Environment | Used By | Purpose | +| ----------- | --------------------------- | --------------------------------------------- | +| `codebuild` | `codebuild.yml` job `build` | Gates access to AWS credentials for CodeBuild | + +The `codebuild` environment is the only protected environment. It contains: + +- The `AWS_CODEBUILD_ROLE_ARN` secret (required for OIDC-based AWS role assumption) +- Possibly the repository variables `CODEBUILD_PROJECT_NAME`, `AWS_REGION`, and `ROLE_DURATION_SECONDS` (these may alternatively be set at the repository level) + +Environment protection rules (configured in GitHub repository settings) may include required reviewers or deployment branch restrictions. + +--- + +## Secrets and Variables + +### Secrets + +| Secret | Scope | Used By | Purpose | +| ------------------------ | --------------------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------- | +| `AWS_CODEBUILD_ROLE_ARN` | Environment (`codebuild`) | `codebuild.yml` | IAM Role ARN for OIDC-based AWS STS role assumption | +| `GITHUB_TOKEN` | Automatic (GitHub-provided) | `release.yml`, `release-pr.yml`, `tag-on-merge.yml`, `pull-request-lint.yml` | Authenticate GitHub API calls (release creation, PR creation, tag creation, workflow dispatch, PR validation) | + +The `codebuild.yml` workflow also uses `github.token` (the automatic token, accessed without the `secrets.` prefix) for cache management and release asset uploads. 
+ +### Repository Variables + +| Variable | Used By | Default Fallback | Purpose | +| ------------------------- | ----------------------- | ------------------- | ---------------------------------------------------------------- | +| `CODEBUILD_PROJECT_NAME` | `codebuild.yml` | `codebuild-project` | AWS CodeBuild project name | +| `AWS_REGION` | `codebuild.yml` | `us-east-1` | AWS region for CodeBuild and STS | +| `ROLE_DURATION_SECONDS` | `codebuild.yml` | `7200` | STS session duration (seconds) | +| `DO_NOT_MERGE_LABEL` | `pull-request-lint.yml` | `do-not-merge` | Label name that blocks PR merging | +| `HALT_MERGES` | `pull-request-lint.yml` | `0` | Merge gate: `0` = allow all, `-N` = block all, `N` = only PR #N | + +All variables have sensible defaults via `${{ vars.VAR || 'default' }}` syntax, so workflows run even without explicit variable configuration. + +--- + +## Permissions Model + +### Workflow-level permissions + +| Workflow | Permissions | +| ------------------------- | ----------------------------------------- | +| `codebuild.yml` | All 16 scopes explicitly set to `none` | +| `pull-request-lint.yml` | All 16 scopes explicitly set to `none` | +| `release.yml` | All 16 scopes explicitly set to `none` | +| `release-pr.yml` | All 16 scopes explicitly set to `none` | +| `security-scanners.yml` | All 16 scopes explicitly set to `none` | +| `tag-on-merge.yml` | All 16 scopes explicitly set to `none` | + +### Job-level permissions (overrides) + +| Workflow | Job | Permissions | Rationale | +| ----------------------- | ---------------------- | --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | +| `codebuild.yml` | `label-reminder` | `pull-requests: write` | Post reminder comment when `rules` label is missing | +| `codebuild.yml` | `label-cleanup` | `pull-requests: write` | Delete reminder comment when `rules` label is applied | +| 
`codebuild.yml` | `build` | `actions: write`, `contents: write`, `id-token: write` | Cache management, release asset upload, OIDC token for AWS STS | +| `pull-request-lint.yml` | `auto-label` | `contents: read`, `issues: write`, `pull-requests: write` | Apply/remove labels based on changed file paths; `issues: write` allows creating labels that don't yet exist | +| `pull-request-lint.yml` | `get-pr-info` | `contents: read`, `pull-requests: read` | Read PR metadata and labels via API | +| `pull-request-lint.yml` | `check-merge-status` | `pull-requests: read` | Read PR state for merge gate checks | +| `pull-request-lint.yml` | `validate` | `pull-requests: read` | Read PR title for conventional commit validation | +| `pull-request-lint.yml` | `contributorStatement` | `pull-requests: read` | Read PR body for contributor acknowledgment | +| `release.yml` | `release` | `contents: write` | Create draft release and attach zip artifact | +| `release-pr.yml` | `release-pr` | `contents: write`, `pull-requests: write` | Generate changelog, push branch, open PR | +| `tag-on-merge.yml` | `tag` | `contents: write`, `actions: write` | Create tag via API, dispatch release and codebuild workflows | + +All six workflows follow a **deny-all-then-grant** pattern: every permission scope is set to `none` at the workflow level, then only the required scopes are granted at the job level. This is the strictest possible configuration and prevents privilege escalation from compromised steps. `security-scanners.yml` grants each of its six jobs `actions: read`, `contents: read`, and `security-events: write`. 
+ +--- + +## Security Posture + +| Control | Implementation | +| --------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **Supply-chain protection** | All external actions pinned to full commit SHAs (not mutable version tags) | +| **AWS authentication** | OIDC-based role assumption via `id-token: write` — no static credentials stored | +| **Least-privilege tokens** | All six workflows explicitly deny all 16 permission scopes at workflow level, grant only required scopes at job level | +| **Environment protection** | `codebuild` environment gates AWS credential access with potential reviewer/branch rules | +| **Security scanning** | Six automated scanners (SAST, SCA, secrets, IaC, malware) run on every push to `main`, every PR, and daily. Findings are published to GitHub Code Scanning. All HIGH and CRITICAL findings require remediation or documented risk acceptance | +| **Label-gated CI** | `codebuild.yml` requires the `rules` label on PRs and only triggers for `aidlc-rules/**` changes, preventing unnecessary builds and environment approval prompts. 
The label is applied automatically by the `auto-label` job in `pull-request-lint.yml` | +| **Concurrency control** | `codebuild.yml`, `pull-request-lint.yml`, and `security-scanners.yml` cancel in-progress runs for the same branch | +| **Safe PR trigger** | `pull-request-lint.yml` uses `pull_request_target` but never checks out PR code — only inspects metadata (title, labels, body) | +| **Injection-safe inputs** | Zero `${{ }}` expression interpolation in `run:` blocks — all dynamic values (`github.ref_name`, `github.repository`, `env.*`, event inputs) passed via step-level `env:` or auto-exported workflow `env:` variables | +| **Code ownership** | `.github/` (including workflows) owned exclusively by `@awslabs/aidlc-admins` via CODEOWNERS | +| **Account masking** | `mask-aws-account-id: true` in AWS credential configuration | + +### Security Finding Requirements + +All **HIGH** and **CRITICAL** security findings from any scanner must be either **remediated** or have a **documented risk acceptance** before a PR can be merged to `main`. This applies to: + +- **Bandit / Semgrep (SAST):** High-severity code findings must be fixed or suppressed with an inline comment (`# nosec` / `# nosemgrep`) that includes a justification explaining why the finding is acceptable +- **Grype (SCA):** High and critical CVEs must be resolved by upgrading the affected dependency. If no fix is available, add an entry to `.grype.yaml` `ignore` with the CVE, affected package, and reason for acceptance +- **Gitleaks (Secrets):** Any detected secret must be rotated immediately. Only synthetic/test credentials may be added to the baseline (`.gitleaks-baseline.json`) +- **Checkov (IaC):** Failing checks must be fixed or suppressed with an inline `# checkov:skip=` comment with a reason, or added to `.checkov.yaml` `skip-check` with a comment +- **ClamAV (Malware):** Any detection must be investigated and the file removed. No suppression mechanism exists + +**Risk acceptance process:** + +1. 
The developer adds the appropriate suppression (inline comment or config entry) with a clear justification +2. The suppression is reviewed as part of the normal PR code review process +3. Reviewers from `@awslabs/aidlc-admins` or `@awslabs/aidlc-maintainers` must approve any risk acceptance +4. LOW and MEDIUM findings should be addressed when practical but do not block merge + +For detailed remediation and suppression instructions per scanner, see [Developer's Guide — Security Scanners](DEVELOPERS_GUIDE.md#security-scanners). + +--- + +## Code Ownership + +Defined in `.github/CODEOWNERS`: + +| Path | Owners | +| --------------------------------------------- | ----------------------------------------------------------------------------- | +| `*` (default) | `@awslabs/aidlc-admins` `@awslabs/aidlc-maintainers` | +| `.github/` | `@awslabs/aidlc-admins` | +| `.github/CODEOWNERS` | `@awslabs/aidlc-admins` | +| `aidlc-rules/` | `@awslabs/aidlc-admins` `@awslabs/aidlc-maintainers` `@awslabs/aidlc-writers` | +| `assets/` | `@awslabs/aidlc-admins` `@awslabs/aidlc-maintainers` `@awslabs/aidlc-writers` | +| `scripts/` | `@awslabs/aidlc-admins` `@awslabs/aidlc-maintainers` | +| `CHANGELOG.md`, `cliff.toml`, `LICENSE`, etc. | `@awslabs/aidlc-admins` | + +**Key implication:** Only `@awslabs/aidlc-admins` can approve changes to `.github/` (workflows, CODEOWNERS, issue templates). + +--- + +## Release Process + +Releases follow a **changelog-first** flow: the CHANGELOG is updated *before* the tag is created, so the tagged commit always contains its own changelog entry. The process has three human touchpoints (merge PR, approve CodeBuild, publish release). + +1. 
**Dispatch the Release PR workflow** via the GitHub Actions UI: + - Navigate to Actions → Release PR → Run workflow + - Optionally specify a version (e.g., `0.2.0`); leave blank to auto-determine from conventional commits + - `release-pr.yml` generates `CHANGELOG.md`, writes the version to `aidlc-rules/VERSION`, and opens a PR on branch `release/v1.2.0` with labels `release` and `rules` + +2. **Review and merge the release PR:** + - Verify the changelog content is correct + - Merge the PR (requires `@awslabs/aidlc-admins` approval since `CHANGELOG.md` is owned by them) + - `tag-on-merge.yml` automatically creates tag `v1.2.0` on the merge commit and dispatches the release and build workflows + +3. **`release.yml` runs automatically** (dispatched by `tag-on-merge.yml` with `--ref v1.2.0`): + - Zips `aidlc-rules/` into `ai-dlc-rules-v1.2.0.zip` + - Creates a **draft** GitHub Release named "AI-DLC Workflow v1.2.0" with the zip attached + +4. **`codebuild.yml` runs automatically** (dispatched by `tag-on-merge.yml`; requires `codebuild` environment approval): + - Runs CodeBuild on the tagged commit + - Downloads build artifacts (primary, evaluation, trend) + - Attaches artifacts to the draft release (or creates a draft if one doesn't exist yet) + +5. **Publish the release** by clicking "Publish release" in the GitHub UI: + - Verify all expected artifacts are attached (rules zip + build artifacts) + - Review release notes and edit if needed + +**Note:** The `codebuild` protected environment may need its deployment branch rules updated to allow `v*` tags (in addition to `main`) for tag-triggered builds to proceed. + +--- + +## Changelog Configuration + +Defined in `cliff.toml` (used by `release-pr.yml`): + +| Setting | Value | +| ----------------- | ----------------------------------------------------- | +| **Commit format** | Conventional commits (`feat:`, `fix:`, `docs:`, etc.) 
| +| **Tag pattern** | `v[0-9].*` | +| **Sort order** | Oldest first | + +**Commit groups:** + +| Prefix | Group Name | +| ---------- | ------------- | +| `feat` | Features | +| `fix` | Bug Fixes | +| `docs` | Documentation | +| `perf` | Performance | +| `refactor` | Refactoring | +| `style` | Style | +| `test` | Tests | +| `build` | CI/CD | +| `ci` | CI/CD | +| `chore` | Miscellaneous | + +**Filtered commits:** + +| Pattern | Action | +| ------------------------ | ------------------------------------------ | +| `docs: update changelog` | Skipped (noise from previous release flow) | + +Unconventional commits are filtered out (`filter_unconventional = true`). + +**Version bump rules** (defined in `[bump]` section): + +| Rule | Effect | +| ----------------------------------- | --------------------------------------------- | +| `features_always_bump_minor = true` | `feat:` commits trigger a minor version bump | +| `breaking_always_bump_major = true` | Breaking changes trigger a major version bump | + +These rules are used by `git-cliff --bumped-version` when auto-determining the next version in `release-pr.yml`. + +--- + +## Updating Pinned Versions + +All scanner tools, GitHub Actions, and container images in the workflow files are pinned to specific versions or commit SHAs. This prevents supply-chain attacks and ensures reproducible builds, but requires periodic maintenance to stay current with security patches and new features. + +Pinned versions should be reviewed and updated **at least quarterly**. 
+
+<!-- TODO: Add step-by-step instructions for updating pinned versions, including:
+  - How to check for latest versions of each scanner tool (PyPI, GitHub releases, Docker Hub)
+  - How to look up commit SHAs for GitHub Actions (gh api repos/OWNER/REPO/git/ref/tags/TAG)
+  - How to look up Docker image digests (docker manifest inspect)
+  - How to verify the update works (run the workflow on a feature branch)
+  - How to handle breaking changes in scanner tool upgrades
+  - Consider automating this with Dependabot or Renovate
+-->
diff --git a/docs/DEVELOPERS_GUIDE.md b/docs/DEVELOPERS_GUIDE.md
new file mode 100644
index 00000000..aa7f181b
--- /dev/null
+++ b/docs/DEVELOPERS_GUIDE.md
@@ -0,0 +1,272 @@
+# Developer's Guide
+
+## Running CodeBuild Locally
+
+You can run AWS CodeBuild builds locally using the [CodeBuild local agent](https://docs.aws.amazon.com/codebuild/latest/userguide/use-codebuild-agent.html). This is useful for testing buildspec changes without pushing to the remote.
+
+### Prerequisites
+
+- Docker installed and running
+- The `codebuild_build.sh` script (downloaded in the Setup step below)
+
+### Basic Usage
+
+1. Setup
+
+- Download the local CodeBuild script and make it executable.
+- Write a GitHub Personal Access Token (PAT) into a `./.env` file as the `GH_TOKEN` environment variable
+
+```bash
+if [ ! -f codebuild_build.sh ]; then
+  curl -O https://raw.githubusercontent.com/aws/aws-codebuild-docker-images/master/local_builds/codebuild_build.sh && chmod +x codebuild_build.sh;
+fi;
+echo "GH_TOKEN=${GH_TOKEN:-ghp_notset}" > "./.env";
+```
+
+2. 
Iterate

+
+- _Optionally edit the `buildspec-override` value in the `.github/workflows/codebuild.yml` GitHub workflow_
+- Extract the `buildspec-override` contents from the workflow into a local `./buildspec.yml` file
+- Run AWS CodeBuild build locally with images based on the machine architecture
+
+```bash
+cat .github/workflows/codebuild.yml \
+  | uvx yq -r '.jobs.build.steps[] | select(.id == "codebuild") | .with["buildspec-override"]' \
+  > buildspec.yml
+./codebuild_build.sh \
+  -i "public.ecr.aws/codebuild/amazonlinux-$([ "$(arch)" = "arm64" -o "$(arch)" = "aarch64" ] && echo "aarch64" || echo "x86_64")-standard:$([ "$(arch)" = "arm64" -o "$(arch)" = "aarch64" ] && echo "3.0" || echo "5.0")" \
+  -a "./.codebuild/artifacts/" \
+  -l "public.ecr.aws/codebuild/local-builds:$([ "$(arch)" = "arm64" -o "$(arch)" = "aarch64" ] && echo "aarch64" || echo "latest")" \
+  -c \
+  -e "./.env"
+```
+
+### All Script Options
+
+| Flag           | Required   | Description                                                                                                                                                                                            |
+| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `-i IMAGE`     | Yes        | Customer build container image (e.g. `aws/codebuild/standard:5.0`)                                                                                                                                     |
+| `-a DIR`       | Yes        | Artifact output directory                                                                                                                                                                              |
+| `-b FILE`      | No         | Buildspec override file. Defaults to `buildspec.yml` in the source directory                                                                                                                           |
+| `-s DIR`       | No         | Source directory. First `-s` is the primary source; additional `-s` flags use `<sourceIdentifier>:<sourceLocation>` format for secondary sources. 
Defaults to the current working directory                                     |
+| `-l IMAGE`     | No         | Override the default local agent image                                                                                                                                                                 |
+| `-r DIR`       | No         | Report output directory                                                                                                                                                                                |
+| `-c`           | No         | Use AWS configuration and credentials from your local host (`~/.aws` and `AWS_*` environment variables)                                                                                                |
+| `-p PROFILE`   | No         | AWS CLI profile to use (requires `-c`)                                                                                                                                                                 |
+| `-e FILE`      | No         | File containing environment variables (`VAR=VAL` format, one per line)                                                                                                                                 |
+| `-m`           | No         | Mount the source directory into the build container directly                                                                                                                                           |
+| `-d`           | No         | Run the build container in Docker privileged mode                                                                                                                                                      |
+
+## Security Scanners
+
+The [`security-scanners.yml`](../.github/workflows/security-scanners.yml) workflow runs six scanners on every push to `main`, every PR targeting `main`, and on a daily schedule. Each scanner uploads a SARIF report to GitHub Code Scanning (visible under the **Security** tab) and as a downloadable artifact; ClamAV, which has no SARIF output, uploads a plain-text log artifact instead.
+
+All scanners, including ClamAV, use a **deferred-failure pattern**: the scan always runs to completion and uploads results before the job fails. This ensures findings are recorded even when the build breaks.
+
+### Bandit — Python SAST
+
+**What it detects:** Common security issues in Python code (e.g., use of `subprocess`, `eval`, hardcoded passwords, weak crypto).
+
+**What triggers failure:** Any finding with **high confidence**, at any severity level. See the Bandit configuration in [`.github/workflows/security-scanners.yml`](../.github/workflows/security-scanners.yml) for the exact filters used.
+
+**Scope:** Runs against all tracked Python files in the repository; see [`.github/workflows/security-scanners.yml`](../.github/workflows/security-scanners.yml) for the precise include/exclude patterns.
+
+**How to review findings:**
+
+1. Check the **Code Scanning** alerts in the GitHub Security tab, or download the `bandit.sarif` artifact
+2. 
Each finding includes a Bandit rule ID (e.g., `B603`) and a description of the risk + +**How to remediate:** + +- **Fix the code** — the preferred approach. Bandit docs list safe alternatives for each rule +- **Suppress inline** — add `# nosec BXXX` (with a justification) to the affected line: + +  ```python +  subprocess.run(cmd, check=True)  # nosec B603 — cmd is built from validated config, not user input +  ``` + +- **Exclude a path** — add to the `exclude_dirs` list in `.bandit` + +### Semgrep — Multi-language SAST + +**What it detects:** Security anti-patterns, dangerous API usage, and code quality issues across all languages using the full Semgrep Registry (`--config=r/all`). + +**What triggers failure:** Any finding. On PRs, only **new** findings (vs the PR base commit) trigger failure — pre-existing findings are ignored via `--baseline-commit`. + +**How to review findings:** + +1. Check **Code Scanning** alerts or download the `semgrep.sarif` artifact +2. Each finding includes a rule ID (e.g., `python.lang.security.dangerous-subprocess-use-audit`) and a link to the rule documentation + +**How to remediate:** + +- **Fix the code** — follow the rule's suggested fix in the Semgrep Registry docs +- **Suppress inline** — add `# nosemgrep: <rule-id>` to the affected line: + +  ```python +  time.sleep(5)  # nosemgrep: arbitrary-sleep — polling for server startup +  ``` + + For YAML files: + +  ```yaml + run: exit ${{ steps.scan.outputs.exit_code }} # nosemgrep: yaml.github-actions.security.curl-eval.curl-eval +  ``` + +- **Exclude a path** — add the path to `.semgrepignore` (note: the `changed-semgrepignore` audit rule will flag new entries for app-sec review) + +### Grype — Dependency Vulnerability Scanning (SCA) + +**What it detects:** Known CVEs in project dependencies by scanning lock files, manifests, and container images. + +**What triggers failure:** Any vulnerability rated **high or critical** (`fail-on-severity: high` in `.grype.yaml`). 
Low and medium vulnerabilities are reported but do not fail the build. + +**How to review findings:** + +1. Check **Code Scanning** alerts or download the `grype.sarif` artifact +2. Each finding includes the CVE ID, affected package, installed version, and fixed version (if available) + +**How to remediate:** + +- **Upgrade the dependency** — the preferred approach. Check if a patched version exists and update the relevant `pyproject.toml` or lock file +- **Suppress in config** — add an entry to the `ignore` list in `.grype.yaml` with a reason: + + ```yaml + ignore: + - vulnerability: CVE-2024-12345 + reason: "only affects server-side XML parsing which we don't use" + ``` + + You can scope to a specific package: + + ```yaml + ignore: + - vulnerability: CVE-2024-12345 + package: + name: "some-package" + version: "1.2.3" + reason: "pinned version; affected code path is unreachable" + ``` + +> **Note:** Grype is an SCA scanner — it analyzes dependencies, not source lines. There are no inline code comments for suppression; all accepted risks go in `.grype.yaml`. + +### Gitleaks — Secret Detection + +**What it detects:** Secrets (API keys, tokens, passwords, private keys) committed anywhere in the git history. + +**What triggers failure:** Any secret not present in the baseline file (`.gitleaks-baseline.json`). + +**How to review findings:** + +1. Download the `gitleaks.sarif` artifact +2. Each finding identifies the secret type (e.g., `generic-api-key`, `jwt`), file, and commit + +**How to remediate:** + +- **Rotate the secret immediately** — treat any detected secret as compromised +- **Remove from history** — use `git filter-repo` or BFG Repo-Cleaner to purge the secret from all commits +- **Add to baseline** — only for known false positives (e.g., test fixtures with synthetic credentials). Regenerate the baseline: + + ```bash + gitleaks git --config=.gitleaks.toml --report-path=.gitleaks-baseline.json --report-format=json . 
+ ``` + + Review the updated baseline carefully before committing +- **Allowlist a path** — add a regex to `.gitleaks.toml` under `[allowlist] paths` for files that intentionally contain secret-like patterns (e.g., test credential scrubbers) + +### Checkov — Infrastructure as Code Scanning + +**What it detects:** Misconfigurations in GitHub Actions workflows and Dockerfiles (e.g., unpinned actions, missing security settings, overly broad permissions). + +**Scope:** Only scans `github_actions` and `dockerfile` frameworks (configured in `.checkov.yaml`). + +**What triggers failure:** Any check failure, except checks listed in `skip-check`. + +**How to review findings:** + +1. Check **Code Scanning** alerts or download the `checkov.sarif` artifact +2. Each finding includes a check ID (e.g., `CKV_GHA_7`, `CKV_DOCKER_2`) and a description of the misconfiguration + +**How to remediate:** + +- **Fix the configuration** — follow the Checkov docs for the specific check ID +- **Suppress inline** — add a comment above or on the affected line: + + In a Dockerfile: + + ```dockerfile + # checkov:skip=CKV_DOCKER_2:healthcheck not needed for build-only image + FROM python:3.12-slim + ``` + + In a GitHub Actions workflow: + + ```yaml + # checkov:skip=CKV_GHA_7:buildspec-override requires user parameters + - uses: aws-actions/aws-codebuild-run-build@v1 + ``` + + Multiple skips on one line: + + ```yaml + # checkov:skip=CKV_DOCKER_2,CKV_DOCKER_3:reason for both + ``` + +- **Skip repo-wide** — add the check ID to the `skip-check` list in `.checkov.yaml` with a comment explaining why + +### ClamAV — Malware Scanning + +**What it detects:** Malware, viruses, and trojans in repository files using ClamAV's signature database. + +**What triggers failure:** Any malware detection (binary pass/fail). + +**How to review findings:** + +1. 
Download the `clamdscan.txt` artifact — it contains the full scan log with any infected file paths + +> **Note:** ClamAV does not produce SARIF output and does not integrate with GitHub Code Scanning. Results are only available as the text log artifact. + +**How to remediate:** + +- **Remove the infected file** and investigate how it was introduced +- **Verify the detection** — false positives are rare but possible. Check the ClamAV signature name against known FP databases + +### Summary of Failure Thresholds + +| Scanner | Fails on | Severity filter | Config file | +| -------- | -------------------------------- | --------------------- | ------------------------------------------- | +| Bandit | Any finding with high confidence | All severities | `.bandit` | +| Semgrep | Any finding (PRs: new only) | All severities | `.semgrepignore` | +| Grype | High or critical CVEs | Low/medium don't fail | `.grype.yaml` | +| Gitleaks | Any secret not in baseline | All | `.gitleaks.toml`, `.gitleaks-baseline.json` | +| Checkov | Any check failure | All (minus skipped) | `.checkov.yaml` | +| ClamAV | Any malware detection | Binary pass/fail | None | + +### Summary of Suppression Methods + +| Scanner | Inline comment | Config-level | Baseline/differential | +| -------- | --------------------------- | ----------------------------- | --------------------------- | +| Bandit | `# nosec BXXX` | `.bandit` `exclude_dirs` | — | +| Semgrep | `# nosemgrep: rule-id` | `.semgrepignore` | `--baseline-commit` on PRs | +| Grype | _(not applicable — SCA)_ | `.grype.yaml` `ignore` | — | +| Gitleaks | — | `.gitleaks.toml` `allowlist` | `.gitleaks-baseline.json` | +| Checkov | `# checkov:skip=ID:reason` | `.checkov.yaml` `skip-check` | — | +| ClamAV | — | — | — | + +## Running GitHub Actions locally + +_NOTE: This uses the [`act`](https://github.com/nektos/act) tool and assumes access to a valid AWS CodeBuild project `codebuild-project` in "us-east-1"_ + +```shell +act --platform 
ubuntu-latest=-self-hosted \ + --job build \ + --workflows .github/workflows/codebuild.yml \ + --env-file .env \ + --var CODEBUILD_PROJECT_NAME=codebuild-project \ + --var AWS_REGION=us-east-1 \ + --var ROLE_DURATION_SECONDS=7200 \ + --artifact-server-path=$PWD/.codebuild/artifacts \ + --cache-server-path=$PWD/.codebuild/artifacts \ + --env ACT_CODEBUILD_DIR=$PWD/.codebuild/downloads \ + --bind +``` diff --git a/docs/GENERATED_DOCS_REFERENCE.md b/docs/GENERATED_DOCS_REFERENCE.md new file mode 100644 index 00000000..83deb82a --- /dev/null +++ b/docs/GENERATED_DOCS_REFERENCE.md @@ -0,0 +1,102 @@ +# Generated aidlc-docs/ Reference + +When you run the AI-DLC workflow, all documentation artifacts are generated inside an `aidlc-docs/` directory at your workspace root. The exact files created depend on your project type (greenfield vs brownfield), complexity, and which stages the workflow executes or skips. + +Below is the fully populated structure showing every possible file across all phases and stages. Conditional files are annotated with notes indicating when they appear. 
+ +```text +aidlc-docs/ +├── aidlc-state.md # Workflow state tracker — project info, stage progress, current status +├── audit.md # Complete audit trail — every user input, AI response, and approval with timestamps +│ +├── inception/ # 🔵 INCEPTION PHASE — determines WHAT to build and WHY +│ ├── plans/ +│ │ ├── execution-plan.md # Workflow visualization and phase execution decisions (always created) +│ │ ├── story-generation-plan.md # Story development methodology and questions (if User Stories executes) +│ │ ├── user-stories-assessment.md # Assessment of whether user stories add value (if User Stories executes) +│ │ ├── application-design-plan.md # Component and service design plan with questions (if Application Design executes) +│ │ └── unit-of-work-plan.md # System decomposition plan with questions (if Units Generation executes) +│ │ +│ ├── reverse-engineering/ # Created only for brownfield projects (existing codebase detected) +│ │ ├── business-overview.md # Business context, transactions, and dictionary +│ │ ├── architecture.md # System architecture diagrams, component descriptions, data flow +│ │ ├── code-structure.md # Build system, key classes/modules, design patterns, file inventory +│ │ ├── api-documentation.md # REST APIs, internal APIs, and data models +│ │ ├── component-inventory.md # Inventory of all packages by type (application, infrastructure, shared, test) +│ │ ├── technology-stack.md # Languages, frameworks, infrastructure, build tools, testing tools +│ │ ├── dependencies.md # Internal and external dependency graphs and relationships +│ │ ├── code-quality-assessment.md # Test coverage, code quality indicators, technical debt, patterns +│ │ └── reverse-engineering-timestamp.md # Analysis metadata and artifact checklist +│ │ +│ ├── requirements/ +│ │ ├── requirements.md # Functional and non-functional requirements with intent analysis (always created) +│ │ └── requirement-verification-questions.md # Clarifying questions with [Answer]: tags for user 
input (always created) +│ │ +│ ├── user-stories/ # Created only if User Stories stage executes +│ │ ├── stories.md # User stories following INVEST criteria with acceptance criteria +│ │ └── personas.md # User archetypes, characteristics, and persona-to-story mappings +│ │ +│ └── application-design/ # Created only if Application Design and/or Units Generation execute +│ ├── application-design.md # Consolidated design document (if Application Design executes) +│ ├── components.md # Component definitions, responsibilities, and interfaces +│ ├── component-methods.md # Method signatures, purposes, and input/output types +│ ├── services.md # Service definitions, responsibilities, and orchestration patterns +│ ├── component-dependency.md # Dependency matrix and communication patterns between components +│ ├── unit-of-work.md # Unit definitions and responsibilities (if Units Generation executes) +│ ├── unit-of-work-dependency.md # Dependency matrix between units (if Units Generation executes) +│ └── unit-of-work-story-map.md # Mapping of user stories to units (if Units Generation executes) +│ +├── construction/ # 🟢 CONSTRUCTION PHASE — determines HOW to build it +│ ├── plans/ +│ │ ├── {unit-name}-functional-design-plan.md # Business logic design plan with questions (per unit, if Functional Design executes) +│ │ ├── {unit-name}-nfr-requirements-plan.md # NFR assessment plan with questions (per unit, if NFR Requirements executes) +│ │ ├── {unit-name}-nfr-design-plan.md # NFR design patterns plan with questions (per unit, if NFR Design executes) +│ │ ├── {unit-name}-infrastructure-design-plan.md # Infrastructure mapping plan with questions (per unit, if Infrastructure Design executes) +│ │ └── {unit-name}-code-generation-plan.md # Detailed code generation steps with checkboxes (per unit, always created) +│ │ +│ ├── {unit-name}/ # Per-unit artifacts — one directory per unit of work +│ │ ├── functional-design/ # Created only if Functional Design executes for this unit +│ │ │ 
├── business-logic-model.md # Detailed business logic and algorithms +│ │ │ ├── business-rules.md # Business rules, validation logic, and constraints +│ │ │ ├── domain-entities.md # Domain models with entities and relationships +│ │ │ └── frontend-components.md # UI component hierarchy, props, state, interactions (if unit has frontend) +│ │ │ +│ │ ├── nfr-requirements/ # Created only if NFR Requirements executes for this unit +│ │ │ ├── nfr-requirements.md # Scalability, performance, availability, security requirements +│ │ │ └── tech-stack-decisions.md # Technology choices and rationale +│ │ │ +│ │ ├── nfr-design/ # Created only if NFR Design executes for this unit +│ │ │ ├── nfr-design-patterns.md # Resilience, scalability, performance, and security patterns +│ │ │ └── logical-components.md # Logical infrastructure components (queues, caches, etc.) +│ │ │ +│ │ ├── infrastructure-design/ # Created only if Infrastructure Design executes for this unit +│ │ │ ├── infrastructure-design.md # Cloud service mappings and infrastructure components +│ │ │ └── deployment-architecture.md # Deployment model, networking, scaling configuration +│ │ │ +│ │ └── code/ # Markdown summaries of generated code (always created per unit) +│ │ └── *.md # Code generation summaries (actual code goes to workspace root) +│ │ +│ ├── shared-infrastructure.md # Shared infrastructure across units (if applicable) +│ │ +│ └── build-and-test/ # Always created after all units complete code generation +│ ├── build-instructions.md # Prerequisites, build steps, troubleshooting +│ ├── unit-test-instructions.md # Unit test execution commands and expected results +│ ├── integration-test-instructions.md # Integration test scenarios, setup, and execution +│ ├── performance-test-instructions.md # Load/stress test configuration and execution (if performance NFRs exist) +│ ├── contract-test-instructions.md # API contract validation between services (if microservices) +│ ├── security-test-instructions.md # 
Vulnerability scanning and security testing (if security NFRs exist) +│ ├── e2e-test-instructions.md # End-to-end user workflow testing (if applicable) +│ └── build-and-test-summary.md # Overall build status, test results, and readiness assessment +│ +└── operations/ # 🟡 OPERATIONS PHASE — placeholder for future expansion +``` + +## Notes + +- `{unit-name}` is replaced with the actual unit name (e.g., `api-service`, `frontend-app`, `data-processor`). For single-unit projects, there is typically one unit directory under `construction/`. +- For simpler single-unit projects, the model may simplify naming — for example, `construction/plans/code-generation-plan.md` instead of `construction/plans/{unit-name}-code-generation-plan.md`, or place `application-design.md` as a single consolidated file without the individual component files. +- The `build-and-test/` directory always includes `build-and-test-summary.md`. The individual instruction files (`build-instructions.md`, `unit-test-instructions.md`, `integration-test-instructions.md`, etc.) are generated based on project complexity and testing needs. +- Plans in `inception/plans/` and `construction/plans/` contain `[Answer]:` tags where users provide input, and `[ ]`/`[x]` checkboxes that track execution progress. +- Application code is never placed inside `aidlc-docs/` — it goes to the workspace root. Only markdown documentation lives here. +- The `audit.md` file is append-only and captures every interaction with ISO 8601 timestamps. +- The `aidlc-state.md` file tracks which stages have been completed, skipped, or are in progress, along with extension configuration. diff --git a/docs/WORKING-WITH-AIDLC.md b/docs/WORKING-WITH-AIDLC.md new file mode 100644 index 00000000..136fe9ab --- /dev/null +++ b/docs/WORKING-WITH-AIDLC.md @@ -0,0 +1,624 @@ +# Working with AIDLC + +This guide helps you get the most out of AI-DLC (AI-Driven Development Life Cycle). 
It covers how to interact effectively with the AI at each stage — from first prompt to working code. + +Start with the basics in each section. The advanced tips are drawn from real workshop experience and address the patterns that teams found most useful once they got comfortable with the fundamentals. + +--- + +## Table of Contents + +1. [General Rules](#1-general-rules) +2. [Inception Phase](#2-inception-phase) +3. [Construction Phase](#3-construction-phase) +4. [Never Vibe Code](#4-never-vibe-code) + +--- + +## 1. General Rules + +### Asking Questions Without Changing Files + +One of the most important habits to build early: **not every question should trigger a document update**. + +When you ask the AI something without guarding your question, it may interpret it as a change request and immediately update design documents. To prevent this, prefix exploratory questions with a clear no-change instruction. + +**Basic pattern:** + +```text +Do not update any documents. Help me understand why [this decision] was made. +``` + +```text +Do not update any documents. For [component name], is it reasonable to use [library or technology] here? +``` + +```text +Do not change anything. Assess the impact of [proposed change]. +I want to understand the consequences before we decide. +``` + +These patterns let you think out loud with the AI, evaluate options, and challenge decisions without committing to anything. Once you're satisfied with the answer, follow up with a deliberate update instruction if needed. + +> **Tip**: Start every exploratory message with "Do not update any documents." You can always drop that constraint once you're ready to act. + +--- + +### The Question → Doc → Approval Flow + +AIDLC never asks clarifying questions inline in the chat. It writes questions into a markdown file and waits for you to fill in your answers there. This keeps a durable record of every decision and makes it easy for the whole team to contribute. 
+ +**Step 1 — AIDLC creates a question file** + +The AI creates a file like `aidlc-docs/inception/requirements/requirement-verification-questions.md` and stops. It will not proceed until you answer. + +**Step 2 — You fill in your answers** + +Open the file and fill in each `[Answer]:` tag. Questions use multiple-choice format: + +```markdown +## Question: Deployment model +Where will this service be deployed? + +A) AWS Lambda (serverless) +B) AWS ECS Fargate (containerized) +C) Existing on-premises infrastructure +X) Other (please describe after [Answer]: tag below) + +[Answer]: B +``` + +A few things that work well when answering: + +- **Add a label alongside the letter.** `C — financial summary and debt service coverage` is clearer than just `C`. +- **Include a brief justification.** `A — design-first; generate the OpenAPI spec before writing code` confirms intent and gives the AI context it carries forward. +- **Combine options when you mean both.** `B and C — rate limiting at both API Gateway level and application level (not D)` is unambiguous. +- **Add a caveat when the option is almost right.** `B — migration is a separate project; however, include a one-time migration into the new data structures.` +- **Use X freely.** If none of the options fit, X is the right choice over forcing a wrong answer. + +**Step 3 — Tell the AI your answers are ready** + +Return to the chat and say: "We have answered your clarification questions. Please re-read the file and proceed." + +Tip: explicitly asking the AI to *re-read* the file ensures it loads your answers from disk rather than relying on an in-memory version that may not reflect your latest edits. + +**Step 4 — AIDLC validates and proceeds** + +The AI reads your answers, flags any remaining ambiguities, and proceeds to generate the next artifact. 
+ +> **Advanced tip**: If you have documentation that answers some of the AI's questions, you can instruct it to resolve those itself: "Analyze the rationale for each question. If a question has already been answered through the provided documentation, answer it yourself. Only ask me if it is still unclear." This reduces unnecessary back-and-forth at gate points. + +**Approval gates** + +At the end of each stage, AIDLC presents a completion message with two options: + +- **Request Changes** — ask for modifications before moving on +- **Approve and Continue** — accept the output and advance + +Read the generated artifact before approving. Discuss with your team if needed. Only approve when you're satisfied. + +--- + +### Context Management + +Context is the AI's working memory for the session. AIDLC depends on having the full chain of artifacts and instructions in context to generate consistent downstream outputs. Managing it well is one of the highest-leverage habits you can develop. + +**The core rule: clear the context at every natural decision point.** + +AIDLC is built around gates — moments where the AI stops and asks you something: a question file to answer, a document to approve, a plan to review. These pauses are not just approval checkpoints. They are the right moments to start a fresh context before continuing. + +Clearing context at a gate is low-risk because the AI's current work is already saved to files. The next context starts clean, loads the relevant artifacts from disk, and proceeds without carrying accumulated noise from all the earlier steps. + +If you let context accumulate across multiple gates, the AI starts working from a compressed or partially lost version of earlier instructions and artifacts. Output quality degrades in ways that are subtle and hard to diagnose. 
+ +**In practice:** + +- When the AI asks you to answer a question file — answer the questions, then **start a fresh context** and tell the AI to re-read the file and continue +- When the AI presents a document for approval — review it, then **start a fresh context** to either request changes or approve and proceed +- If your tool offers a "compact context" prompt mid-workflow, **always decline it** — compaction is not the same as a clean reset and loses more than it saves + +**How to resume after a context reset:** + +Option 1 — State file method (recommended): + +```text +Go to aidlc-docs/aidlc-state.md, find the first unchecked item, +then go to the corresponding plan file and resume from that point. +``` + +Option 2 — Manual handoff: + +```text +I am resuming a previously stopped conversation. Here is the context: +[paste summary of last output or recent change] +Please continue with [next action or section X]. +``` + +> **Tip**: Commit and push all current changes to the repository whenever you reset context. It takes seconds and means you always have a clean recovery point. + +```text +Please commit and push all current changes to the repository. +``` + +--- + +### Batching Prompts + +Not all prompts should be sent separately. A simple rule from workshop experience: + +**When two changes are tightly coupled to the same subject, include both in one prompt. When two changes are unrelated, do them one at a time.** + +Over-batching (combining unrelated changes) causes the AI to lose focus and miss details. Under-batching (separate prompts for closely related things) adds unnecessary round-trips. When in doubt, err on the side of separating. + +--- + +### Loading External Reference Files + +You can point AIDLC to any existing document — a schema, an architecture diagram, a data dictionary, an API spec — and it will incorporate that content into the current stage. + +**Basic pattern:** + +```text +Please read [file path or description]. 
Use it as the basis for [what you want]. +``` + +```text +We have an existing audit table structure. Please add it to the inception documents +and reference it for this service. When we proceed, expect new requirements and +stories related to this service. +``` + +> **Advanced tip**: You can load documents at any stage, not just at the start. If a new constraint surfaces during Construction — an updated security policy, a revised data model — load it and ask AIDLC to assess the impact before proceeding. +> +> **Advanced tip — Enterprise standards as extensions**: If your organization has security, compliance, or API guidelines that should apply to every project, add them as a markdown steering file in `aidlc-rules/extensions/`. AIDLC will automatically load them into every phase without requiring manual injection. + +--- + +### Getting Independent Critiques + +AIDLC will defend its own prior decisions. When you want an unbiased evaluation of an artifact, ask for a critique in a **fresh context** — one where the AI has no memory of why it made those decisions. + +```text +Produce a critique document of [the requirements document / the component design]. +Do this in a new context separate from everything else. +``` + +This produces more useful, objective feedback than asking for a critique in the same session where the artifact was created. + +--- + +### Depth Levels + +AIDLC adapts how deeply it executes each stage based on the complexity of your request. You can influence this. + +```text +Keep this at minimal depth — we just need the basic structure documented. +``` + +```text +This is a production-critical component. Please run at comprehensive depth. +``` + +--- + +## 2. Inception Phase + +The Inception phase is where you and the AI align on *what to build and why* before any design or code work begins. The more context you bring in here, the fewer clarifying questions and the less rework you'll encounter in Construction. 
+ +### Prepare Your Inputs Before Starting + +The single most effective thing you can do before kicking off AIDLC is prepare two documents: + +1. **Vision Document** — what to build and why +2. **Technical Environment Document** — what tools and constraints apply + +These documents dramatically reduce the number of clarifying questions AIDLC will ask and ensure the AI starts from your team's actual context rather than making assumptions. + +**Where to start:** + +- [writing-inputs/inputs-quickstart.md](writing-inputs/inputs-quickstart.md) — quick summary for both greenfield and brownfield +- [writing-inputs/vision-document-guide.md](writing-inputs/vision-document-guide.md) — full vision guide with templates +- [writing-inputs/technical-environment-guide.md](writing-inputs/technical-environment-guide.md) — full technical environment guide with templates + +**Brownfield projects** (adding to an existing codebase) need slightly different inputs. The vision doc needs a current state description and an explicit list of what must not change. The technical environment doc should describe the existing stack rather than a desired one, and example code should come from actual existing files. See [writing-inputs/inputs-quickstart.md](writing-inputs/inputs-quickstart.md) for the brownfield minimum and worked examples. + +**Minimum viable input** if you want to start quickly: + +For the Vision: one paragraph describing what you're building and for whom, a list of MVP features in scope, a list of features explicitly out of scope, and any open questions — things you already know are uncertain. Open questions feed directly into Requirements Analysis as pre-declared ambiguities, so they get resolved early rather than surfacing as surprises mid-design. 
+ +For the Technical Environment: language and version, package manager, web framework, cloud provider and deployment model, test framework, a prohibited libraries table (with reason and recommended alternative for each entry), security basics, and at least one example each for a typical endpoint, function, and test. + +The prohibited libraries table matters more than a plain list — the reason and alternative columns tell AI-DLC *why* a library is banned, which leads to better substitution decisions. The example code patterns are the single highest-leverage addition beyond the basics: they give AI-DLC a concrete pattern to follow during code generation rather than inventing its own. + +> **Tip**: Every gap you fill in up front is one fewer clarifying question during Requirements Analysis. + +--- + +### Kicking Off a New Project + +Once your input documents are ready: + +```text +I want to start a new project. Please read [path to vision document] and +[path to technical environment document], then begin the AIDLC workflow. +``` + +AIDLC will scan the workspace, determine greenfield vs. brownfield, and proceed into Requirements Analysis using your documents as the primary source — asking only for what they don't cover. + +For a brownfield project, AIDLC will first run Reverse Engineering, analyzing your existing codebase and producing architecture, component, and API documentation. Review these artifacts carefully — they become the foundation for everything that follows. + +--- + +### Answering Requirements Questions + +See the answering tips in [Section 1](#the-question--doc--approval-flow) for the full guidance on using letters, adding labels, combining options, and using X for custom answers. A few additional points specific to Requirements Analysis: + +- **Separate the full vision from the MVP explicitly.** If AIDLC asks what features to include, name them. If something is out of scope, say so — don't leave it ambiguous. 
+- **State deliberate "no" decisions clearly.** `D — no caching required at this time` signals intent. An empty answer invites the AI to make a speculative choice. +- **Describe phased approaches inline.** `X — simple role-based workflow now; replace with external workflow engine when available` lets AIDLC design the current solution with the right extension points. + +> **Advanced tip — Security Extensions**: During Requirements Analysis, AIDLC will ask whether you want to enforce security extension rules. For production-grade applications, choose Yes. For prototypes, No is fine. This decision is recorded and enforced throughout Construction, so choose deliberately. + +--- + +### Inception-Specific Interactions + +**Deferring a feature mid-stream:** + +```text +We are going to backlog the [feature name] capability for the current release. +Please remove it from the component design and flag the related user stories as backlogged. +``` + +Backlogging (rather than deleting) preserves the work for future iterations without it influencing the current build. + +**Registering an existing data structure:** + +```text +We have an existing [schema/structure name]. Please add it to the inception documents +and reference it for this service. When we proceed, expect new requirements and +stories related to this service. +``` + +**Making implicit data sources explicit:** + +```text +For the [service name], add the understanding that [new data source] is also a +data source for this feature, in addition to [existing data source]. Then review +requirements and user stories to ensure this is captured. +``` + +**Checking for upstream impact after a design change:** + +After any meaningful change to a design artifact, ask AIDLC to check whether earlier documents are still consistent: + +```text +Now review the previous steps — user stories and requirements — to ensure +this change does not require updates to any of those documents. 
+``` + +> **Advanced tip — Standing back-propagation rule**: Instead of asking after each change, set this as a standing instruction at the start of a phase: "Every time you update a document, check whether the change impacts the requirements document and user stories, and prompt me if it does." This creates an automatic safety net without requiring you to remember. + +**Parallel team review of component design:** + +If your team splits up to review different components simultaneously: + +```text +Restrict your edits to the files under your team's control. When all teams are done, +we will ask the AI to review all changes and confirm there are no conflicts. +Then we will ask it to review impacts to user stories and requirements. +``` + +When everyone is done, trigger the conflict check: + +```text +We had [N] independent groups editing component design files. Please review all files +and report any conflicts or inconsistencies. Do not edit the files — produce a report +for our review. +``` + +Resolve each conflict explicitly by number: + +```text +For conflict #[number] ([conflict description]): +update [target file] to reflect [your decision]. +``` + +```text +For conflict #[number] ([capability name]): +this capability is backlogged. Update the documentation to clearly mark it as +backlogged so code generation does not attempt to implement it. +``` + +**Archiving stale design files:** + +If exploration during design produced files that are no longer needed: + +```text +Move the [file descriptions] to an archive folder — do not delete them. +Then confirm whether they are required for code generation. +``` + +> **Advanced tip — Component size constraints**: If you want to prevent oversized components that would be too large to implement in a single sprint, set a story-point cap during Application Design: "At the component design phase, inject the following instruction: no single component should have more than [X] aggregate story points. 
If a component exceeds this limit, break it down into smaller sub-components." +> +> **Advanced tip — Context resets mid-phase**: If your session gets interrupted, use this to re-establish state: +> +> ```text +> Stop. New context. We just completed [description of recent work]. +> Please review [upstream artifacts] to assess any impact of the recent change. +> [Paste the change description here.] +> ``` + +--- + +## 3. Construction Phase + +The Construction phase is where designs become code. Each unit of work goes through a series of design stages (conditional) followed by Code Generation (always). After all units are complete, Build and Test closes out the work. + +### The Design Review Process + +For each unit of work, AIDLC may execute some or all of these design stages before generating code: + +- **Functional Design** — business logic, domain models, data schemas +- **NFR Requirements** — performance, security, scalability, tech stack selection +- **NFR Design** — applying NFR patterns to the design +- **Infrastructure Design** — mapping the design to actual cloud services + +Each stage produces a document in `aidlc-docs/construction/{unit-name}/`. Your job at each gate is to read the document and decide: request changes or approve. + +**Read before you approve.** The design documents are the source of truth for code generation. Mistakes that slip through here are harder to fix later. + +**Advancing from design to code:** + +When you're ready to transition to Code Generation, give the AI the structural context it needs up front: + +```text +We have completed component design review. We are ready for code creation. +Please use the following directory and source code structure: +[reference an existing service or folder structure]. +Use this pattern for APIs. For the UI, follow the [Vue.js composables/components/store] +directory structure. Please ask any questions you have before proceeding. 
+``` + +Inviting questions before generation starts resolves ambiguities in the plan rather than in the middle of file creation. + +**Requesting a targeted correction:** + +Be precise — name the element, what is wrong, and what it should be: + +```text +The [endpoint description] should use [correct parameter], not [incorrect parameter]. +Please update the [component name] accordingly. +``` + +**Choosing between AI-presented options:** + +```text +Please implement Option B — [option description] — for [feature name]. +Update all component design documents accordingly. +``` + +Reference the option by letter *and* description, and explicitly scope the update to all affected documents, not just the one where the question arose. + +**Overriding a design pattern:** + +```text +We prefer to deviate from [standard pattern] and use [our preferred approach] +to allow [rationale]. Please update the component design documents accordingly. +``` + +The rationale matters. AIDLC carries it forward into later stages, which prevents the deviation from being silently reversed. + +> **Advanced tip — Impact assessment before committing**: For any significant design change, assess before acting: +> +> ```text +> Do not change anything. Assess the impact of [proposed change]. +> [Describe the proposed change in detail.] +> ``` +> +> **Advanced tip — Inline code documentation**: If you want inline documentation applied consistently to every unit, add it as a standing rule at the start of the Construction phase rather than repeating it per unit: "Add inline code documentation as a standard rule for the construction phase." + +--- + +### The Code Generation Process + +Code Generation has two distinct parts. Both require your explicit approval. + +**Part 1 — Planning** + +AIDLC creates a numbered, checkbox-tracked plan of every file to be created or modified. Review this plan before approving. 
Check that: + +- Every file is in the right location (application code in the workspace root, never in `aidlc-docs/`) +- The steps cover everything your design documents specified +- Brownfield projects list existing files to modify, not new duplicates alongside them + +> **Advanced tip — Internal libraries**: Before approving the plan, inject your internal library requirements into the Q&A file or implementation plan: +> +> ```text +> In addition to my answers, you must use the following libraries from our +> [starter project / building blocks]: [list each library explicitly]. +> Explain why and when each should be used, not just what it is. +> ``` +> +> A curated markdown guide to your internal libraries works better than pointing the AI at a repository. Create one and reference it as a code generation input. +> +> **Advanced tip — UI from Figma designs**: Take a screenshot of your Figma design, pass it to a vision-capable model (e.g. ChatGPT) to generate framework code from the screenshot, then provide that output to AIDLC as the UI implementation input. This produces a concrete, tool-readable specification rather than a raw design-tool export. + +**Part 2 — Generation** + +AIDLC executes each step sequentially, checking off each step as it completes. When all steps are done, it presents the completion message with paths to generated files. + +Review the generated code before approving. If something isn't right: + +```text +Request Changes: [describe specifically what needs to change] +``` + +> **Advanced tip — Brownfield file modifications**: For existing codebases, AIDLC modifies files in place. If you see `ClassName_modified.java` or `service_new.ts` alongside the original, flag it immediately: +> +> ```text +> I see [ClassName_modified.java] alongside [ClassName.java]. Please merge the changes +> into the original file and delete the duplicate. 
+> ``` + +--- + +### Build and Test + +After all units are complete, AIDLC generates build and test instructions for all units. A few patterns worth knowing: + +**Injecting test tooling at the right moment:** + +Don't add test framework or test management system instructions at project start. By the time code generation begins, those details may have been compressed or lost across many intervening stages. Inject them just-in-time: + +```text +At the functional test generation step, inject the following instruction: +generate functional tests using the [test management system] format described +in this document: [attach specification]. Use this API endpoint to push the +generated test cases to the [test management system] repository: [endpoint details]. +``` + +This principle applies to any tool-specific instruction: inject it at the phase where it's needed, not at project start. + +**Scoping unit test coverage:** + +```text +When generating unit tests, exclude third-party external dependencies from +code coverage calculations. Require a minimum of 80% coverage on internal +code paths only. +``` + +--- + +### After Code Generation: Back-Propagating Changes + +Changes made during code generation — small design decisions, adjustments discovered while writing code — need to flow back up to the design documents. Do this as a deliberate sweep after code polish is complete, not ad hoc: + +```text +When you have finished polishing the code, review each unit's final design files +and propagate any changes back up the chain to requirements and user stories. +Make a plan for how to do this step by step before executing. +``` + +Asking for a plan before execution ensures the sweep is systematic across all units rather than selective. 
+ +> **Advanced tip — Extracting reusable specs**: At the end of a completed project, extract the patterns you established into reusable specification documents for future projects: +> +> ```text +> Create a set of reusable specification documents from the patterns expressed +> in this project: one for API design, one for security, one for UI specifications, +> one for the technology stack, and one for directory structure. Use the completed +> units as the source. I will review and approve each document before it is used +> in future projects. +> ``` + +--- + +## 4. Never Vibe Code + +Vibe coding means directly editing generated code files to make quick fixes or try things out — bypassing the design documents entirely. It feels fast in the moment and creates problems shortly after. + +The issue isn't the edit itself. It's that the design documents — the source of truth AIDLC uses for every subsequent operation — no longer reflect what the code actually does. The next time AIDLC runs Code Generation for a related unit, or you resume a session, or a colleague picks up the work, the disconnect causes confusion and rework. + +One team described it directly during workshops: + +> "You never fix code directly. If you discover an issue, go back to AIDLC and say: I have discovered issue X. Review the design and make a plan to fix it. If this affects the design, update it, then update the code." + +**The rule: update the design first, then generate the code.** + +--- + +### The Right Way to Make a Change + +Whether you've spotted a bug, changed your mind about a design decision, or received new requirements, the flow is the same: + +**Step 1 — Describe the issue without touching anything:** + +```text +Do not update any documents yet. I have discovered issue [X]. +Review the design and help me understand where this needs to be addressed. +``` + +**Step 2 — Fix the design document:** + +```text +Please update [specific design document] to reflect [the fix]. 
+Then check whether any upstream documents — requirements, user stories — +also need to be updated. +``` + +**Step 3 — Regenerate the affected code:** + +```text +The design for [unit name] has been updated. Please re-run code generation +for the affected files only. +``` + +This flow takes a few extra minutes compared to directly editing a file. It keeps your documentation in sync, your audit trail complete, and your team aligned on what was actually built. + +--- + +### When You're Tempted to "Just Edit the File" + +**"It's just a one-line fix."** + +One-line fixes that bypass the design still create drift. Note the fix in the relevant design document and let AIDLC apply it: + +```text +In [functional-design.md for unit X], update [method or rule] to [the fix]. +Then regenerate [the affected file]. +``` + +**"We're just exploring — nothing is final yet."** + +Exploration is exactly what "Do not update any documents" is for. Explore freely in the chat. Commit only when you're ready. + +**"I need to unblock the team right now."** + +Sometimes you have to move fast. If you make a direct edit, log it honestly so the audit trail stays accurate: + +```text +We made a temporary direct edit to [file] to unblock the team. +The fix was [description]. Please update [design document] to reflect this +and verify no other documents are inconsistent. +``` + +--- + +### Standing Rules That Prevent Drift + +Two standing instructions you can set at the start of a Construction phase that catch problems early, without requiring you to remember to ask each time: + +**Back-propagation on every update:** + +```text +Every time you update a document, check whether the change impacts the +requirements document and user stories, and prompt me if it does. +``` + +**Design-first on every code decision:** + +```text +When you make a design decision during code generation, always make sure +the documentation reflects this change before proceeding. 
+``` + +Set these once at the start of Construction and they apply for the entire phase. + +--- + +### Keeping Reports Out of aidlc-docs + +One practical note: if you ask AIDLC to produce human-facing reports — architecture diagrams, component summaries, stakeholder presentations — don't let it save them into `aidlc-docs/`. Those files will be loaded as artifacts in subsequent stages, inflating the token count and potentially confusing the AI about what is authoritative design input. + +Use a separate `reports/` folder and, for cleaner output, generate reports in a fresh context with a dedicated report specification file: + +```text +Pause the process. Start a new context. Read [report specification markdown file] +and produce the report based on the current state of the AIDLC artifacts. +Save the output to a reports/ folder, not aidlc-docs/. +``` + +--- + +*For guides on preparing your input documents, see [writing-inputs/inputs-quickstart.md](writing-inputs/inputs-quickstart.md).* diff --git a/docs/writing-inputs/example-minimal-tech-env-brownfield.md b/docs/writing-inputs/example-minimal-tech-env-brownfield.md new file mode 100644 index 00000000..fde4aad4 --- /dev/null +++ b/docs/writing-inputs/example-minimal-tech-env-brownfield.md @@ -0,0 +1,150 @@ +# Technical Environment: Returns and Refunds Module — OrderFlow Platform + +> **Brownfield project.** The existing stack is the baseline. New code must fit +> into the established patterns. Where a choice is not listed below, follow the +> existing codebase — do not introduce new patterns without justification. + +--- + +## Existing Stack (must be preserved) + +| Layer | Current Technology | Version | Notes | +| ------------------ | ------------------- | --------- | -------------------------------------------------------------------- | +| Language | TypeScript | 5.x | Strict mode. Do not introduce JavaScript files. 
| +| Runtime | Node.js | 20.x LTS | | +| API framework | Express | 4.x | All existing services use Express. Do not introduce Fastify or Koa. | +| Database | PostgreSQL | 15 | Via pg and node-postgres. No ORM — raw SQL with typed query helpers. | +| Infrastructure | AWS ECS Fargate | — | Services deploy as Docker containers. CDK for all infra. | +| Message bus | Amazon SQS | — | Used by notification-service for async email dispatch. | +| Auth | AWS Cognito | — | JWT tokens validated at API Gateway. Do not build a new auth layer. | +| Package manager | npm | 10.x | Do not introduce yarn or pnpm. | +| Test framework | Jest | 29.x | With ts-jest. All tests in `__tests__/` alongside source. | +| Linter / formatter | ESLint + Prettier | — | Config files are in the repo root. Do not modify them. | + +--- + +## What to Add (new for this module) + +- A new `returns-service` following the same structure as `order-service` +- New PostgreSQL tables: `return_requests`, `return_items`, `return_status_history` +- New React components for the customer return form and operations dashboard +- These additions must not modify existing tables or service contracts + +--- + +## What to Keep Unchanged + +- `order-service`, `payment-service`, `notification-service` — do not modify these services +- Existing PostgreSQL tables — additive migrations only (new tables, new columns on new tables) +- The `notification-service` API contract — call it as documented, do not extend it +- Existing CDK stacks — add a new stack for `returns-service`, do not edit existing stacks +- Frontend design system components — use existing components, do not create replacements + +--- + +## What to Remove / Not Introduce + +| Prohibited | Reason | Use Instead | +| ----------------------------------- | -------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------- | +| ORMs (TypeORM, Prisma, Sequelize) | 
Existing codebase uses raw SQL with typed helpers. Introducing an ORM creates inconsistency. | node-postgres with typed query functions, matching existing pattern | +| Axios | Project uses native fetch (Node 20 built-in). | fetch | +| Any new CSS framework | Existing frontend uses Tailwind CSS. | Tailwind CSS, existing design system components | +| New state management library | Existing frontend uses React Context + useReducer. | React Context + useReducer | +| New test runner (Vitest, Mocha) | Project uses Jest throughout. | Jest | +| Separate auth service or middleware | Auth is handled at API Gateway via Cognito JWT. | Validate the JWT passed in the Authorization header, same as other services | + +--- + +## Security Basics + +- Authentication: Cognito JWT validated at API Gateway. Services receive `x-user-id` and `x-user-role` headers — trust these, do not re-validate the JWT in the service +- Authorization: Operations dashboard endpoints require `role === 'operations'` — check this header +- Input validation: Validate all request bodies with Zod schemas before processing +- PII: Return requests contain customer names and addresses — do not log these fields +- Secrets: Database credentials and service URLs via AWS Secrets Manager, same as existing services + +--- + +## Example Code Patterns + +Follow these patterns from the existing codebase. Do not invent alternatives. 
+ +**A service endpoint (Express route handler):** + +```typescript +import { Router, Request, Response } from 'express'; +import { z } from 'zod'; +import { createReturnRequest } from '../domain/returns'; +import { AppError } from '../errors'; + +const router = Router(); + +const CreateReturnSchema = z.object({ + orderId: z.string().uuid(), + items: z.array(z.object({ orderItemId: z.string().uuid(), reason: z.string().min(1) })).min(1), +}); + +router.post('/returns', async (req: Request, res: Response) => { + const parsed = CreateReturnSchema.safeParse(req.body); + if (!parsed.success) { + return res.status(400).json({ error: 'VALIDATION_ERROR', details: parsed.error.flatten() }); + } + try { + const result = await createReturnRequest(parsed.data, req.headers['x-user-id'] as string); + return res.status(201).json(result); + } catch (err) { + if (err instanceof AppError) { + return res.status(err.statusCode).json({ error: err.code, message: err.message }); + } + throw err; + } +}); + +export default router; +``` + +**A database query function:** + +```typescript +import { pool } from '../db/pool'; + +export interface ReturnRequest { + id: string; + orderId: string; + customerId: string; + status: 'submitted' | 'approved' | 'rejected' | 'refunded'; + createdAt: Date; +} + +export async function getReturnRequestById(id: string): Promise<ReturnRequest | null> { + const { rows } = await pool.query<ReturnRequest>( + 'SELECT id, order_id AS "orderId", customer_id AS "customerId", status, created_at AS "createdAt" FROM return_requests WHERE id = $1', + [id] + ); + return rows[0] ?? 
null; +} +``` + +**A Jest test:** + +```typescript +import { getReturnRequestById } from '../db/return-requests'; +import { pool } from '../db/pool'; + +jest.mock('../db/pool'); +const mockQuery = pool.query as jest.Mock; + +describe('getReturnRequestById', () => { + it('returns the request when found', async () => { + mockQuery.mockResolvedValueOnce({ rows: [{ id: 'abc', orderId: '123', status: 'submitted' }] }); + const result = await getReturnRequestById('abc'); + expect(result?.id).toBe('abc'); + }); + + it('returns null when not found', async () => { + mockQuery.mockResolvedValueOnce({ rows: [] }); + const result = await getReturnRequestById('missing'); + expect(result).toBeNull(); + }); +}); +``` diff --git a/docs/writing-inputs/example-minimal-tech-env-scientific-calculator-api.md b/docs/writing-inputs/example-minimal-tech-env-scientific-calculator-api.md new file mode 100644 index 00000000..51bf70c5 --- /dev/null +++ b/docs/writing-inputs/example-minimal-tech-env-scientific-calculator-api.md @@ -0,0 +1,134 @@ +# Technical Environment: CalcEngine + +## Language and Package Manager + +- **Python 3.12+** +- **uv** for all package management (no pip, poetry, or conda) +- `pyproject.toml` for all project and tool configuration +- `uv.lock` committed to Git + +## Web Framework + +- **FastAPI** with Pydantic v2 for request/response validation +- **Mangum** to run FastAPI on AWS Lambda + +## Cloud and Deployment + +- **AWS**, single account, `us-east-1` +- **Serverless**: Lambda behind API Gateway (HTTP API type) +- **DynamoDB** for API key storage and usage metering +- **S3 + CloudFront** for documentation site +- **AWS CDK (Python)** for all infrastructure -- no manual console changes + +## Testing + +- **pytest** with pytest-cov (90% line coverage minimum) +- **hypothesis** for property-based math accuracy testing +- **mypy** strict mode for type checking +- **ruff** for linting and formatting +- **moto** for mocking AWS services in tests + +## Do NOT Use + +| 
Prohibited | Reason | Use Instead | +| ------------------------------- | ------------------------------------------------- | --------------------------- | +| `eval()`, `exec()`, `compile()` | Security -- arbitrary code execution | AST-based expression parser | +| Flask, Django | Project uses FastAPI | FastAPI | +| requests | Blocks async event loop | httpx | +| sympy | Too heavy for MVP | Custom expression parser | +| pandas | Not needed -- single calculations, not dataframes | Standard Python | +| pip, poetry, pipenv | Project uses uv exclusively | uv | +| black, flake8, isort | Replaced by ruff | ruff | +| AWS EC2, ECS, RDS | Serverless model preferred for MVP | Lambda, DynamoDB | + +## Security Basics + +- API key auth via `Authorization: Bearer {key}` header +- Keys stored as bcrypt hashes in DynamoDB, never logged in plaintext +- Expression parser uses a character allowlist and AST evaluation -- no dynamic code execution +- Expression length capped at 4,096 characters, nesting depth capped at 100 levels +- TLS 1.2+ enforced, no HTTP endpoints +- Secrets in AWS Secrets Manager, not in environment variables or code + +## Example Code Pattern + +An endpoint should follow this structure: + +```python +from fastapi import APIRouter, Depends +from pydantic import BaseModel, Field + +from calcengine.api.middleware.auth import get_api_key_id +from calcengine.api.models.errors import error_response +from calcengine.api.models.responses import CalculationResponse +from calcengine.engine.errors import CalcEngineError +from calcengine.engine.trigonometry import sin + +router = APIRouter() + + +class SinRequest(BaseModel): + value: float + angle_mode: str = Field(default="radians", pattern="^(radians|degrees)$") + + +@router.post("/v1/trigonometry/sin", response_model=CalculationResponse) +async def calculate_sin( + request: SinRequest, + api_key_id: str = Depends(get_api_key_id), +) -> CalculationResponse | dict: + try: + result = sin(request.value, 
angle_mode=request.angle_mode) + return CalculationResponse(result=result, expression=f"sin({request.value})") + except CalcEngineError as e: + return error_response(e) +``` + +A math function should follow this structure: + +```python +import math + +from calcengine.engine.errors import DomainError + + +def log_base(value: float, base: float = 10.0) -> float: + """Compute logarithm of value with given base. Raises DomainError for invalid input.""" + if value <= 0: + raise DomainError( + code="DOMAIN_ERROR", + message=f"Cannot compute logarithm of {value}", + detail="Logarithm is only defined for positive numbers", + ) + if base <= 0 or base == 1.0: + raise DomainError( + code="DOMAIN_ERROR", + message=f"Invalid logarithm base: {base}", + detail="Base must be positive and not equal to 1", + ) + return math.log(value) / math.log(base) +``` + +A test should follow this structure: + +```python +import math +import pytest +from hypothesis import given, strategies as st +from calcengine.engine.errors import DomainError +from calcengine.engine.logarithmic import log_base + + +def test_log10_of_100() -> None: + assert log_base(100, 10) == pytest.approx(2.0) + + +def test_log_of_negative_raises_domain_error() -> None: + with pytest.raises(DomainError): + log_base(-5) + + +@given(st.floats(min_value=1e-300, max_value=1e300, allow_nan=False, allow_infinity=False)) +def test_log10_matches_stdlib(x: float) -> None: + assert log_base(x, 10) == pytest.approx(math.log10(x), rel=1e-14) +``` diff --git a/docs/writing-inputs/example-minimal-vision-brownfield.md b/docs/writing-inputs/example-minimal-vision-brownfield.md new file mode 100644 index 00000000..cad264a8 --- /dev/null +++ b/docs/writing-inputs/example-minimal-vision-brownfield.md @@ -0,0 +1,65 @@ +# Vision: Returns and Refunds Module — OrderFlow Platform + +> **Brownfield project.** This document describes a change to an existing system. +> The Current State section is required. 
It gives AIDLC the context it needs to +> understand what already exists before generating requirements and design. + +--- + +## Current State + +OrderFlow is an existing e-commerce order management platform built in TypeScript +on Node.js. It handles order creation, payment capture, fulfilment routing, and +shipping notifications. It does not currently have any returns or refunds capability. +Customers who want to return an item contact support via email, and refunds are +processed manually by the finance team in the payment provider dashboard. + +The existing platform has three backend services (order-service, payment-service, +notification-service) and a React frontend. All services are deployed on AWS ECS +Fargate. PostgreSQL is the primary data store. + +--- + +## What We Are Adding + +A returns and refunds module that allows customers to self-serve return requests +through the existing storefront, and allows operations staff to review, approve, +and process refunds without leaving the platform. 
+ +--- + +## Features In Scope (this iteration) + +- Customer-facing return request form: select order, select items, select return reason +- Return request status tracking for customers (submitted, approved, rejected, refunded) +- Operations dashboard: view open return requests, approve or reject with a note +- Automated refund processing via the existing payment-service integration +- Email notifications to customers at each status change via notification-service +- Return reason codes: damaged, wrong item, changed mind, other + +## Features Explicitly Out of Scope (this iteration) + +- Return shipping label generation (manual process for now, Phase 2) +- Partial refunds at the line-item level (full order refunds only in MVP) +- Restocking or inventory management integration (Phase 2) +- Fraud detection or return abuse prevention (Phase 3) +- Self-service exchanges (return + reorder in one flow, Phase 2) +- Returns analytics or reporting dashboard (Phase 2) + +--- + +## What Must Not Change + +- Order creation, payment capture, and fulfilment flows — do not modify these +- The existing PostgreSQL schema for orders, payments, and customers — additive changes only +- The notification-service API contract — consume it as-is, do not modify it +- The existing React frontend component library and design system + +--- + +## Open Questions + +- Should return requests have an approval step, or should eligible returns be auto-approved based on policy rules (e.g., within 30 days, item not marked as final sale)? +- Who owns the return request in the operations dashboard — customer support team, warehouse team, or both with different views? +- Should refunds be issued immediately on approval, or batched and processed at end of day? +- Is there a return window policy (e.g., 30 days from delivery) that the system should enforce, or is it case-by-case for now? 
diff --git a/docs/writing-inputs/example-minimal-vision-scientific-calculator-api.md b/docs/writing-inputs/example-minimal-vision-scientific-calculator-api.md new file mode 100644 index 00000000..1fe2f650 --- /dev/null +++ b/docs/writing-inputs/example-minimal-vision-scientific-calculator-api.md @@ -0,0 +1,51 @@ +# Vision: CalcEngine Scientific Calculator API + +## Executive Summary + +CalcEngine is a REST API that lets developers send math expressions as strings and get back accurate results. Instead of every team building their own math parser and trig functions, they call our API. We sell it as a subscription service with a free tier to drive adoption and paid tiers for volume. + +## Features In Scope (MVP) + +- Expression evaluation: accept a string like `"2 * sin(pi/4) + sqrt(16)"` and return the numeric result +- Basic arithmetic: add, subtract, multiply, divide, power, square root, modulo, absolute value, floor, ceiling, rounding +- Trigonometry: sin, cos, tan, asin, acos, atan, atan2 (degree and radian modes) +- Logarithms: log base 10, natural log, log with arbitrary base, exp +- Basic statistics: mean, median, mode, standard deviation, variance, min, max, sum, percentile (accepts arrays) +- Math constants: pi, e, phi, sqrt(2) +- Combinatorics: factorial, permutations (nPr), combinations (nCr) +- Error handling: clear error codes for division by zero, domain errors (log of negative), overflow, malformed expressions +- API key authentication with free tier (10K calls/month) and paid tiers +- API docs portal with interactive sandbox and code examples + +## Features Explicitly Out of Scope (MVP) + +- Arbitrary-precision arithmetic (Phase 2) +- Matrix and linear algebra (Phase 2) +- Calculus -- derivatives, integrals (Phase 2) +- Financial math -- amortization, NPV, IRR (Phase 2) +- Client SDKs for Python/JS/Java (Phase 2 -- raw HTTP is fine for MVP) +- Step-by-step solution breakdowns (Phase 3) +- Unit conversion and physical constants (Phase 3) +- Batch 
processing / async webhooks (Phase 3) +- Symbolic computation (Phase 3) +- On-premises deployment (Phase 3+) + +## Target Users + +- Application developers who need math in their products but do not want to build/maintain it +- EdTech companies that need a calculator backend for student-facing tools +- FinTech startups that need auditable calculations (paid tier, Phase 2 focus) + +## Key Success Metrics + +- 1,000 registered developer accounts within 3 months +- 50 paid subscribers within 6 months +- API uptime 99.9% +- Response time p50 under 50ms +- Zero critical accuracy bugs (wrong calculation results) + +## Open Questions + +- Should the expression evaluator support variable assignment (`x = 5; 2*x + 3`) or only single expressions? +- Should results be returned as strings (preserving precision) or JSON numbers? +- Should implicit multiplication be supported (`2pi` meaning `2 * pi`)? diff --git a/docs/writing-inputs/example-tech-env-scientific-calculator-api.md b/docs/writing-inputs/example-tech-env-scientific-calculator-api.md new file mode 100644 index 00000000..d9f612dc --- /dev/null +++ b/docs/writing-inputs/example-tech-env-scientific-calculator-api.md @@ -0,0 +1,1222 @@ +# Technical Environment Document: CalcEngine Scientific Calculator API + +## Project Technical Summary + +- **Project Name**: CalcEngine +- **Project Type**: Greenfield +- **Primary Runtime Environment**: Cloud +- **Cloud Provider**: AWS +- **Target Deployment Model**: Serverless (API Gateway + Lambda) +- **Package Manager**: uv +- **Team Size**: 4 (2 backend developers, 1 frontend developer for docs portal, 1 QA engineer) +- **Team Experience**: Strong Python backend experience, moderate AWS experience, no prior math library development. Team has used FastAPI and Flask professionally. Familiar with pytest. Limited CDK experience (will need examples). 
+ +--- + +## Programming Languages + +### Required Languages + +| Language | Version | Purpose | Rationale | +| ----------- | --------- | ------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | +| Python | 3.12+ | API service, math engine, Lambda handlers, CDK infrastructure | Team's primary language. Rich math ecosystem (mpmath, numpy, scipy). uv provides fast, reliable dependency management. | +| HTML/CSS/JS | ES2022+ | Documentation portal (static site) | Minimal frontend for API docs. No framework needed; static generation with Jinja2 templates. | + +### Permitted Languages + +| Language | Conditions for Use | +| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Rust | Approved for performance-critical math functions (e.g., expression parser) if Python performance is insufficient. Requires profiling evidence before adoption. Exposed to Python via PyO3/maturin. | +| TypeScript | Approved for CDK infrastructure if the team prefers CDK in TypeScript over Python CDK. Decision must be made before construction begins, not mid-project. | + +### Prohibited Languages + +| Language | Reason | Use Instead | +| ---------- | --------------------------------------------------------------------------- | --------------------------------------------- | +| Java | No team expertise. Adds operational complexity (JVM cold starts in Lambda). | Python | +| Go | No team expertise. Python covers all current requirements. | Python | +| C/C++ | Maintenance burden for native extensions. | Rust via PyO3 if native performance is needed | + +--- + +## Package and Environment Management + +### uv as the Standard Tool + +uv is the **sole package and environment management tool** for this project. 
Do not use pip, pip-tools, poetry, pipenv, or conda. + +### uv Usage Standards + +```bash +# Project initialization (already done; do not re-run) +uv init calcengine +cd calcengine + +# Adding dependencies +uv add fastapi # Add a runtime dependency +uv add uvicorn[standard] # Add with extras +uv add --dev pytest pytest-cov # Add a development dependency +uv add --dev mypy ruff # Add dev tooling + +# Removing dependencies +uv remove requests # Remove a dependency + +# Running commands in the project environment +uv run python -m calcengine.main # Run application +uv run pytest # Run tests +uv run mypy src/ # Run type checker +uv run ruff check src/ # Run linter + +# Syncing environment from lockfile +uv sync # Install all dependencies from uv.lock +uv sync --dev # Include dev dependencies + +# Lockfile management +# uv.lock is auto-generated. NEVER edit it manually. +# uv.lock MUST be committed to version control. +``` + +### Dependency File Standards + +| File | Purpose | Committed to Git | +| ----------------- | ------------------------------------------------------------- | ----------------- | +| `pyproject.toml` | Project metadata, dependency declarations, tool configuration | Yes | +| `uv.lock` | Deterministic lockfile with exact resolved versions | Yes | +| `.python-version` | Pin the Python version for the project (e.g., `3.12`) | Yes | + +### pyproject.toml Conventions + +All project configuration lives in `pyproject.toml`. Do not create separate config files for tools that support pyproject.toml configuration. 
+ +```toml +[project] +name = "calcengine" +version = "0.1.0" +description = "Scientific calculator REST API" +requires-python = ">=3.12" +dependencies = [ + # Runtime dependencies listed here by uv add +] + +[dependency-groups] +dev = [ + # Dev dependencies listed here by uv add --dev +] + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "-v --tb=short --strict-markers" +markers = [ + "unit: Unit tests (fast, no external dependencies)", + "integration: Integration tests (may require services)", + "accuracy: Mathematical accuracy validation tests", +] + +[tool.mypy] +python_version = "3.12" +strict = true +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = true + +[tool.ruff] +target-version = "py312" +line-length = 100 + +[tool.ruff.lint] +select = ["E", "F", "W", "I", "N", "UP", "B", "A", "SIM", "TCH"] + +[tool.coverage.run] +source = ["src/calcengine"] +branch = true + +[tool.coverage.report] +fail_under = 90 +show_missing = true +``` + +--- + +## Frameworks and Libraries + +### Required Frameworks + +| Framework/Library | Version | Domain | Rationale | +| ------------------- | --------- | ------------------------------------------------ | --------------------------------------------------------------------------------------------------------- | +| FastAPI | 0.115+ | REST API framework | Async support, automatic OpenAPI spec generation, Pydantic validation, strong Python typing integration. | +| Pydantic | 2.x | Request/response validation, settings management | Type-safe data models, JSON serialization, integral to FastAPI. | +| uvicorn | 0.30+ | ASGI server | Standard production server for FastAPI. Used locally and in Lambda via Mangum. | +| Mangum | 1.x | Lambda adapter | Wraps FastAPI ASGI app for AWS Lambda handler. Zero-config adapter. | +| pytest | 8.x | Testing framework | Team standard. Rich plugin ecosystem. | +| mypy | 1.x | Static type checking | Catch type errors before runtime. Strict mode enforced. 
| +| ruff | 0.8+ | Linting and formatting | Replaces flake8, isort, and black in a single fast tool. | +| structlog | 24.x+ | Structured JSON logging | All Lambda handlers and API endpoints must emit structured JSON logs. Configured once in a shared module. | +| aws-cdk-lib | 2.x | Infrastructure as Code | AWS deployment. Python CDK constructs for all infrastructure. | + +### Preferred Libraries + +Use these when their capability is needed. Do not add them preemptively. + +| Library | Purpose | Use When | +| -------------- | ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------- | +| mpmath | Arbitrary-precision arithmetic | Phase 2: when arbitrary-precision mode is implemented. Not needed for MVP (IEEE 754 double is sufficient). | +| numpy | Array operations, linear algebra | Phase 2: when matrix/vector operations are implemented. Do not use for basic arithmetic. | +| scipy | Statistical distributions, numerical integration | Phase 2+: when advanced statistics and calculus are implemented. | +| httpx | Async HTTP client | Outbound HTTP calls (e.g., currency rate fetching in Phase 3). Preferred over requests for async compatibility. | +| boto3 | AWS SDK | Any direct AWS service interaction not handled by CDK at deploy time (e.g., DynamoDB queries, Secrets Manager reads at runtime). | +| pytest-cov | Test coverage reporting | Always. Included in dev dependencies from project start. | +| pytest-asyncio | Async test support | When testing async FastAPI endpoints or async functions. | +| hypothesis | Property-based testing | Mathematical function testing. Generates random inputs to find edge cases. Strongly recommended for all math modules. | +| freezegun | Time mocking | When testing time-dependent logic (rate limiting, token expiry, audit timestamps). 
| + +### Prohibited Libraries + +| Library | Reason | Alternative | +| --------------------------- | ---------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| Flask | Project uses FastAPI. Do not mix web frameworks. | FastAPI | +| Django | Excessive for an API service. ORM not needed. | FastAPI + direct DynamoDB access | +| requests | Synchronous-only. Blocks the async event loop in FastAPI. | httpx | +| sympy | Too heavy for MVP scope. Pulls in large dependency tree. | Implement expression parser directly. Re-evaluate for Phase 3 symbolic computation. | +| pandas | Not needed. CalcEngine processes individual calculations, not dataframes. | Standard Python or numpy for array operations when needed. | +| SQLAlchemy | No relational database in MVP. DynamoDB is the data store. | boto3 DynamoDB resource/client | +| celery | Unnecessary complexity for MVP. All calculations are synchronous and fast (<50ms). | Re-evaluate in Phase 3 for batch processing. Use SQS + Lambda if async is needed earlier. | +| poetry / pipenv / pip-tools | Project uses uv exclusively. Do not introduce alternative package managers. | uv | +| black / isort / flake8 | Replaced by ruff, which combines all three. | ruff | + +### Library Approval Process + +To add a library not on the required or preferred lists: + +1. Open a GitHub issue titled "Dependency Request: [library-name]" +2. Include: purpose, alternatives considered, license (must be MIT, Apache 2.0, or BSD), maintenance status (last release date, open issues count), and size impact +3. Tech lead reviews and approves or rejects +4. If approved, add via `uv add` and update this document + +--- + +## Cloud Environment + +### Cloud Provider + +- **Primary Provider**: AWS +- **Account Structure**: Single AWS account for MVP. Separate dev/staging/prod accounts in Phase 2. +- **Regions**: `us-east-1` (primary). 
No disaster recovery region for MVP. Multi-region planned for Phase 2. + +### Service Allow List + +| Service | Approved Use Cases | Constraints | +| ----------------------------- | --------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- | +| AWS Lambda | API request handlers, math computation | Python 3.12 runtime. Max 256MB memory for MVP (increase if profiling shows need). 30-second timeout. | +| Amazon API Gateway (HTTP API) | Public REST API endpoint | HTTP API type (not REST API type). Custom domain with TLS. Usage plans for rate limiting. | +| Amazon DynamoDB | API key storage, usage metering, rate limit counters | On-demand capacity mode. Single-table design. TTL for rate limit windows. | +| Amazon S3 | OpenAPI spec hosting, static documentation site, Lambda deployment packages | Bucket encryption enabled. Public access blocked except for docs site bucket (CloudFront distribution). | +| Amazon CloudFront | CDN for documentation portal and API spec | HTTPS only. Cache static assets aggressively. | +| Amazon CloudWatch | Logging, metrics, alarms, dashboards | Structured JSON logs from all Lambdas. Custom metrics for calculation counts, latency percentiles, error rates. | +| AWS Secrets Manager | Stripe API keys, signing keys | Automatic rotation where supported. Lambda reads at cold start, caches in memory. | +| AWS Certificate Manager | TLS certificates for custom domain | Used with API Gateway and CloudFront. | +| Amazon Cognito | Developer account authentication for docs portal and API key management | User pool for developer signup/login. Not used for API call authentication (API keys for that). | +| Amazon SQS | Dead-letter queue for failed async operations | Standard queue. Used for failed billing events and error capture. Not used for calculation requests in MVP. 
| +| AWS CDK | Infrastructure as Code deployment | Python CDK. All infrastructure defined in CDK. No manual console changes. | +| AWS CloudTrail | API audit logging | Enabled for all management events. Data events for S3 and Lambda in production. | +| AWS IAM | Service permissions | Least-privilege policies per Lambda function. No wildcard resource permissions. | + +### Service Disallow List + +| Service | Reason | Alternative | +| -------------------------- | ------------------------------------------------------------------------ | ------------------------------------------------------------------- | +| Amazon EC2 | Operational overhead. Serverless model preferred. | Lambda for compute. | +| Amazon ECS / Fargate | Over-engineering for MVP request/response workload. | Lambda. Re-evaluate if cold starts become a problem. | +| Amazon RDS / Aurora | Relational database not needed. API key and usage data fits DynamoDB. | DynamoDB. | +| Amazon ElastiCache / Redis | No caching layer needed for MVP. Calculations are stateless and fast. | In-memory caching within Lambda execution context if needed. | +| AWS Elastic Beanstalk | Does not fit IaC model. | CDK + Lambda. | +| Amazon Kinesis | Streaming not needed. All calculations are synchronous request/response. | SQS if async processing is needed. | +| AWS Step Functions | No multi-step orchestration in MVP. | Direct Lambda invocation. Re-evaluate for Phase 3 batch processing. | +| Amazon SNS | No pub/sub needed in MVP. | SQS for dead-letter queues. | + +### Service Approval Process + +To use a service not on the allow list: + +1. Open a GitHub issue titled "AWS Service Request: [service-name]" +2. Include: use case, cost estimate (monthly), security implications, operational burden, and why an allowed service cannot meet the need +3. Tech lead reviews. Services with PII access or network exposure require additional security review. +4. 
If approved, add CDK construct and update this document + +--- + +## Preferred Technologies and Patterns + +### Architecture Pattern + +**Modular monolith deployed as serverless functions.** + +CalcEngine is a single Python package with internal modules (arithmetic, trigonometry, statistics, etc.), exposed through a single FastAPI application, deployed to AWS Lambda behind API Gateway. This is not a microservice architecture. + +| Decision | Choice | Rationale | +| ------------------ | -------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Architecture style | Modular monolith | Small team (4 people), single domain, no independent scaling requirements per module in MVP. | +| Deployment model | Single Lambda function serving all API routes via Mangum | Simplicity. One deployment artifact. Cold start amortized across all endpoints. | +| Module boundaries | Python packages within `src/calcengine/` | Clean internal boundaries without the operational cost of separate services. Can extract to separate Lambdas later if specific endpoints need different memory/timeout. | + +### API Design Standards + +- **Style**: REST over HTTPS. JSON request and response bodies. +- **Base URL**: `https://api.calcengine.io/v1/` +- **Versioning**: URL path prefix (`/v1/`, `/v2/`). Major version only. Non-breaking changes do not increment version. +- **Documentation**: OpenAPI 3.1 specification auto-generated by FastAPI. Hosted at `https://docs.calcengine.io`. +- **Naming Convention**: snake_case for JSON field names (Python convention). kebab-case for URL paths. +- **Content Type**: `application/json` for all requests and responses. No XML support. 
+ +**Standard Request Format:** + +```json +{ + "expression": "sin(pi/4) * 2 + sqrt(16)", + "options": { + "angle_mode": "radians", + "precision": 15 + } +} +``` + +**Standard Success Response Format:** + +```json +{ + "result": 5.414213562373095, + "expression": "sin(pi/4) * 2 + sqrt(16)", + "computation_time_ms": 2.3, + "engine_version": "0.1.0" +} +``` + +**Standard Error Response Format:** + +```json +{ + "error": { + "code": "DOMAIN_ERROR", + "message": "Cannot compute logarithm of a negative number", + "detail": "log(-5) is undefined for real numbers", + "parameter": "expression", + "documentation_url": "https://docs.calcengine.io/errors/DOMAIN_ERROR" + } +} +``` + +**Error Codes (MVP):** + +| Code | HTTP Status | Meaning | +| --------------------- | ------------ | --------------------------------------------------------------- | +| `PARSE_ERROR` | 400 | Expression could not be parsed. Malformed syntax. | +| `DOMAIN_ERROR` | 422 | Mathematically undefined (log(-1), sqrt(-1), division by zero). | +| `OVERFLOW_ERROR` | 422 | Result exceeds representable range. | +| `INVALID_PARAMETER` | 400 | Request parameter has invalid type or value. | +| `EXPRESSION_TOO_LONG` | 400 | Expression exceeds maximum allowed length. | +| `RATE_LIMIT_EXCEEDED` | 429 | API key has exceeded its rate limit. | +| `UNAUTHORIZED` | 401 | Missing or invalid API key. | +| `INTERNAL_ERROR` | 500 | Unexpected server error. | + +### Data Patterns + +- **Primary Data Store**: DynamoDB (single-table design) +- **Entities in DynamoDB**: API keys, usage counters (per key per month), rate limit windows (per key per minute) +- **Access Pattern**: All reads and writes are by primary key (API key ID). No scans. No complex queries. +- **Caching**: No external cache. Lambda reuses DynamoDB connections across warm invocations. API key validation results cached in Lambda memory for 60 seconds. 
+- **No relational database**: If relational queries become necessary (reporting, analytics), evaluate DynamoDB export to S3 + Athena before adding RDS. + +### Logging Pattern + +All log output must be structured JSON via structlog. Human-readable console output for local development only. + +```python +import structlog + +logger = structlog.get_logger() + +# Standard log call +logger.info( + "calculation_completed", + expression=expression, + result=result, + computation_time_ms=elapsed, + api_key_id=api_key_id, +) + +# Error log call +logger.error( + "calculation_failed", + expression=expression, + error_code="DOMAIN_ERROR", + error_detail=str(e), + api_key_id=api_key_id, +) +``` + +**Required log fields for every API request:** + +| Field | Description | +| ------------- | ----------------------------------------------------- | +| `request_id` | Unique ID per request (from API Gateway or generated) | +| `api_key_id` | Hashed API key identifier (never log the raw key) | +| `endpoint` | API path called | +| `http_method` | GET, POST, etc. | +| `http_status` | Response status code | +| `duration_ms` | Total request processing time | +| `timestamp` | ISO 8601 timestamp | + +--- + +## Security Requirements + +### Authentication and Authorization + +- **API Call Authentication**: API key passed in `Authorization: Bearer {key}` header. API keys are 32-character random strings, stored as bcrypt hashes in DynamoDB. +- **Developer Portal Authentication**: Amazon Cognito user pool. Email + password signup with email verification. +- **Authorization Model**: Flat. All API keys have access to all endpoints. Tier-based rate limits (free, starter, professional) enforced by usage metering, not endpoint-level permissions. +- **API Key Management**: Developers create, rotate, and revoke keys through the developer portal. Maximum 3 active keys per account. + +### Data Protection + +- **Encryption at Rest**: DynamoDB encrypted with AWS-managed KMS key. 
S3 buckets encrypted with SSE-S3. +- **Encryption in Transit**: TLS 1.2+ enforced on API Gateway custom domain and CloudFront distribution. No HTTP (plaintext) endpoints. +- **PII Handling**: Developer accounts store email and hashed password. No other PII collected. Mathematical expressions are not PII. Expressions are logged for debugging but not stored permanently (CloudWatch log retention: 30 days). +- **Data Classification**: API keys = Confidential. Developer emails = Internal. Mathematical expressions and results = Public. + +### Input Validation + +- **Expression length limit**: 4,096 characters maximum. Reject longer expressions with `EXPRESSION_TOO_LONG`. +- **Expression character allowlist**: Alphanumeric, arithmetic operators (`+ - * / ^ %`), parentheses, decimal point, comma, whitespace, and recognized function names. Reject unrecognized characters. +- **No code execution**: The expression parser must never call `eval()`, `exec()`, `compile()`, or any dynamic code execution. Expressions are parsed into an AST and evaluated by the math engine. +- **Recursion depth limit**: Expression parser limits nesting depth to 100 levels. Prevents stack overflow on deeply nested expressions like `(((((...))))`. +- **Numeric range validation**: Results that exceed IEEE 754 double-precision range return `OVERFLOW_ERROR` instead of `Infinity` or `NaN`. + +### Secrets Management + +- **Stripe API Keys**: Stored in AWS Secrets Manager. Read by Lambda at cold start, cached in memory. +- **Cognito Client Secret**: Stored in AWS Secrets Manager. +- **Prohibited Practices**: + - No secrets in `pyproject.toml`, source code, or `.env` files committed to Git + - No secrets in Lambda environment variables (use Secrets Manager at runtime) + - No AWS access keys in code (Lambda uses IAM execution roles) + - `.env` files for local development only, listed in `.gitignore` + +### Dependency Security + +- **Scanning**: GitHub Dependabot enabled for Python dependencies. 
Alerts on known vulnerabilities. +- **License Policy**: Allowed: MIT, Apache 2.0, BSD (2-clause and 3-clause), PSF, ISC. Prohibited: GPL, LGPL, AGPL, SSPL, proprietary. Check with `uv tree` before adding new dependencies. +- **Update Policy**: Critical/High CVEs patched within 7 days. Medium within 30 days. Low evaluated quarterly. + +### OWASP Top 10 Compliance (2021) + +#### A01:2021 - Broken Access Control + +| Control | CalcEngine Implementation | +| --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Authorization enforcement | API key validated in FastAPI middleware (`api/middleware/auth.py`) on every request before the route handler executes. No endpoint is accessible without a valid key. | +| Default deny | API Gateway rejects requests without an `Authorization` header at the gateway level (401). Lambda handler rejects requests with invalid or revoked keys (401). | +| Resource ownership | Each API key is tied to a Cognito account. Developers can only list, rotate, and revoke their own keys. DynamoDB queries are scoped to the authenticated user's partition key. | +| Rate limiting | Per-key rate limits enforced in middleware (`api/middleware/rate_limit.py`). Free: 10,000 calls/month, 10 calls/second. Starter: 1M/month, 50/second. Professional: 10M/month, 200/second. Exceeding limits returns 429. | +| CORS policy | API Gateway CORS configured to allow only the documentation portal origin (`https://docs.calcengine.io`). No wildcard origins. `GET` and `POST` methods only. | +| Directory traversal / path manipulation | Not applicable. CalcEngine does not serve files or accept file paths as input. 
| + +#### A02:2021 - Cryptographic Failures + +| Control | CalcEngine Implementation | +| --------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Data in transit | TLS 1.2+ enforced on API Gateway custom domain and CloudFront. HTTP endpoints do not exist. API Gateway configured with `SecurityPolicy: TLS_1_2`. | +| Data at rest | DynamoDB encrypted with AWS-managed KMS key. S3 buckets encrypted with SSE-S3. CloudWatch logs encrypted with service-managed keys. | +| Password/credential storage | Developer portal passwords hashed with bcrypt (Cognito-managed). API keys stored as bcrypt hashes in DynamoDB. Raw API keys are returned exactly once at creation time and never stored or logged. | +| Sensitive data in responses | API responses never contain API keys, account credentials, or internal identifiers. Error messages do not leak table names, ARNs, or stack traces. | +| Sensitive data in logs | API key IDs (hashed identifier, not the key itself) are logged. Raw API keys are never logged. Developer emails are not included in calculation logs. | + +#### A03:2021 - Injection + +| Control | CalcEngine Implementation | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Expression injection | The expression parser builds an AST from a strict grammar. It does **not** use `eval()`, `exec()`, `compile()`, or any Python code execution mechanism. Only recognized tokens (numbers, operators, parentheses, whitelisted function names) are accepted. Unrecognized tokens cause a `PARSE_ERROR` (400). 
| +| Character allowlist | Expression input restricted to: digits, decimal point, arithmetic operators (`+ - * / ^ %`), parentheses, comma, whitespace, and a fixed set of function names (`sin`, `cos`, `tan`, `log`, `sqrt`, etc.). All other characters are rejected before parsing. | +| NoSQL injection | DynamoDB queries use the boto3 SDK with parameterized key conditions. No string concatenation of user input into query expressions. Partition keys and sort keys are set programmatically, never interpolated from request bodies. | +| HTTP header injection | FastAPI and Pydantic validate and type-check all request input. Response headers are set programmatically by the framework, not from user input. | +| Log injection | structlog escapes special characters in log values. User-supplied expressions are logged as string values within structured JSON fields, not interpolated into log format strings. | + +#### A04:2021 - Insecure Design + +| Control | CalcEngine Implementation | +| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Threat modeling | Threat model created during AIDLC NFR Requirements stage. Reviewed when new endpoints or integration points are added. Primary threats: expression injection, resource exhaustion, API key abuse. | +| Defense in depth | Validation at three layers: (1) API Gateway request validation, (2) Pydantic model validation in FastAPI, (3) domain validation in engine functions. Each layer rejects independently. | +| Business logic limits | Expression length capped at 4,096 characters. Parser recursion depth capped at 100 levels. Maximum array size for statistics endpoints: 10,000 elements. These limits prevent resource exhaustion without affecting legitimate use. 
| +| Abuse case testing | Test suite includes negative/abuse tests: oversized expressions, deeply nested parentheses, expressions designed to cause slow evaluation, rapid-fire requests exceeding rate limits, invalid/expired/revoked API keys. | + +#### A05:2021 - Security Misconfiguration + +| Control | CalcEngine Implementation | +| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Infrastructure as Code | All infrastructure defined in AWS CDK (Python). No manual console changes. CDK diff reviewed in pull requests before deploy. | +| Default credentials | No default API keys, admin accounts, or hardcoded passwords in any environment. Cognito user pool requires email verification. | +| Error messages | Production error responses return the CalcEngine error code, a user-friendly message, and a documentation URL. They never expose Python tracebacks, Lambda ARNs, DynamoDB table names, or internal file paths. FastAPI `debug=False` in production. | +| Unnecessary features | No `/docs` or `/redoc` interactive endpoints exposed in production Lambda. OpenAPI spec served only from the static documentation site. No health-check endpoints that reveal version details beyond `engine_version`. | +| Security headers | API Gateway responses include: `Strict-Transport-Security: max-age=31536000; includeSubDomains`, `X-Content-Type-Options: nosniff`, `X-Frame-Options: DENY`, `Cache-Control: no-store` on API responses. CloudFront adds security headers to documentation site. | +| Lambda configuration | Lambda functions use the minimum required memory (256MB). Timeout set to 30 seconds. Reserved concurrency configured to prevent runaway scaling. No environment variables containing secrets (Secrets Manager at runtime). 
| + +#### A06:2021 - Vulnerable and Outdated Components + +| Control | CalcEngine Implementation | +| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Dependency scanning | GitHub Dependabot enabled. Scans `pyproject.toml` and `uv.lock` for known vulnerabilities. Alerts create GitHub issues automatically. | +| Patch SLA | Critical/High CVEs: patched within 7 days. Medium: 30 days. Low: evaluated quarterly. | +| License compliance | Allowed: MIT, Apache 2.0, BSD, PSF, ISC. Prohibited: GPL, LGPL, AGPL, SSPL, proprietary. Checked with `uv tree` before adding dependencies. | +| Lockfile integrity | `uv.lock` committed to Git and enforced in CI. `uv sync --locked` in CI pipeline fails if lockfile is out of date. No ad-hoc `uv add` in CI. | +| Minimal dependencies | Prohibited libraries list prevents bloated dependency trees (no pandas, Django, SQLAlchemy, sympy in MVP). Each new dependency requires a GitHub issue with justification. | + +#### A07:2021 - Identification and Authentication Failures + +| Control | CalcEngine Implementation | +| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| API key hashing | API keys are 32-character cryptographically random strings (via `secrets.token_urlsafe`). Stored as bcrypt hashes. Lookup uses a key prefix (first 8 chars, stored in plaintext) to find the record, then bcrypt verify confirms the full key. | +| Brute force protection | API Gateway throttling: 100 requests/second per IP across all endpoints. Failed authentication attempts (invalid key) logged with `api_key_prefix` and source IP. 
After 50 failed auth attempts from a single IP in 5 minutes, temporary IP block via WAF rule. | +| Developer portal auth | Cognito enforces: minimum 12-character password, email verification required, account lockout after 5 failed login attempts. | +| Key rotation | Developers can create a new key before revoking the old one (overlap period for zero-downtime rotation). Maximum 3 active keys per account prevents key hoarding. | +| Credential exposure | API key returned exactly once at creation (in the HTTP response body). Not stored in plaintext anywhere. Not included in emails. Not visible in the developer portal after creation. | +| Multi-factor authentication | Not required for MVP. Cognito MFA support is available and will be enabled as an option in Phase 2 when team/enterprise accounts are introduced. | + +#### A08:2021 - Software and Data Integrity Failures + +| Control | CalcEngine Implementation | +| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| CI/CD pipeline security | GitHub Actions. `main` branch protected: requires PR, at least 1 review, all CI checks passing. No direct pushes to `main`. Deploy workflow triggered only on merge to `main`. | +| Dependency integrity | `uv.lock` contains hashes for all dependencies. `uv sync --locked` verifies hashes on install. Lockfile changes in PRs are reviewed explicitly. | +| Deployment artifact integrity | Lambda deployment package built in CI from a clean `uv sync --locked` install. No local builds deployed to production. CDK deploy runs only from the CI pipeline, not from developer machines. | +| Deserialization safety | Pydantic v2 models parse and validate all incoming JSON. No use of `pickle`, `yaml.load()` (unsafe loader), or `marshal`. 
Only `json.loads()` via Pydantic's JSON parsing. Pydantic `model_config` has `extra = "forbid"` to reject unexpected fields. | + +#### A09:2021 - Security Logging and Monitoring Failures + +| Control | CalcEngine Implementation | +| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Security events logged | All events below are logged as structured JSON to CloudWatch: authentication failures (invalid/expired/revoked key), rate limit exceeded (429), input validation failures (400), authorization anomalies, and all 5xx errors. | +| Log protection | CloudWatch logs are retained for 30 days. Log group resource policy prevents deletion by Lambda execution role. CloudTrail logs management events to a separate S3 bucket with object lock. | +| Alerting | CloudWatch Alarms configured for: 5xx error rate > 1% over 5 minutes, authentication failure rate > 100/minute, single API key generating > 10x its rate limit in attempts, Lambda concurrent execution > 80% of reserved concurrency. Alarms notify via SNS to on-call email/SMS. | +| Monitoring dashboard | CloudWatch dashboard displays: request count, error rate (4xx and 5xx), p50/p95/p99 latency, auth failure count, rate limit hit count, Lambda cold start percentage, DynamoDB consumed capacity. Reviewed weekly. 
| + +#### A10:2021 - Server-Side Request Forgery (SSRF) + +| Control | CalcEngine Implementation | +| --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Applicability | **Low risk for MVP.** CalcEngine does not make outbound HTTP requests based on user input. The expression parser evaluates mathematical expressions; it does not fetch URLs, resolve hostnames, or make network calls. | +| Outbound requests | The only outbound network calls from Lambda are: (1) DynamoDB queries via AWS SDK (endpoint determined by AWS region, not user input), (2) Secrets Manager reads at cold start (secret name hardcoded in config, not user input). | +| Phase 3 consideration | When currency conversion is added (Phase 3), the service will fetch exchange rates from a financial data provider. At that point: the provider URL will be an environment variable (not user input), requests will use an allowlisted hostname, and responses will be validated against an expected schema before use. This section must be updated before Phase 3 launches. | +| Network segmentation | Lambda functions run in the AWS-managed VPC (no customer VPC for MVP). They can only reach AWS services via public endpoints. No internal services, databases, or metadata endpoints are reachable from Lambda in this configuration. 
|
+
+---
+
+## Testing Requirements
+
+### Test Strategy Overview
+
+| Test Type                   | Required         | Coverage Target                                 | Tooling                               |
+| --------------------------- | ---------------- | ----------------------------------------------- | ------------------------------------- |
+| Unit Tests                  | Yes              | 90% line, 80% branch                            | pytest + pytest-cov                   |
+| Mathematical Accuracy Tests | Yes              | 100% of implemented functions                   | pytest + hypothesis                   |
+| Integration Tests           | Yes              | All API endpoints, DynamoDB interactions        | pytest + moto (AWS mocking)           |
+| Load Tests                  | Yes (pre-launch) | 1,000 concurrent requests, p50 < 50ms           | Locust                                |
+| Security Tests              | Yes              | Input validation, injection prevention          | pytest (custom) + manual OWASP review |
+| End-to-End Tests            | Conditional      | Critical user journeys against deployed staging | pytest + httpx against live API       |
+
+### Unit Testing Standards
+
+- **Coverage Minimum**: 90% line coverage, 80% branch coverage. Enforced by `pytest-cov` with `fail_under = 90` in `pyproject.toml`.
+- **Mocking Policy**: Mock AWS services (DynamoDB, Secrets Manager) with moto. Mock time with freezegun. Do not mock internal math functions. Math functions must be tested with real computation.
+- **Naming Convention**: Test files mirror source files (module names may be abbreviated): `src/calcengine/engine/trigonometry.py` is tested in `tests/unit/test_trig.py`. Test functions named `test_<function>_<scenario>` (e.g., `test_sin_zero_returns_zero`, `test_sin_negative_pi_returns_zero`).
+- **Test Location**: Separate `tests/` directory tree. Not co-located with source.
+ +```text +tests/ + unit/ + test_arithmetic.py + test_trig.py + test_statistics.py + test_expression_parser.py + test_error_handling.py + integration/ + test_api_evaluate.py + test_api_trig.py + test_api_keys.py + test_rate_limiting.py + accuracy/ + test_trig_accuracy.py + test_arithmetic_accuracy.py + test_statistics_accuracy.py + conftest.py +``` + +### Mathematical Accuracy Testing + +This is a CalcEngine-specific testing category that does not exist in most projects. + +- **Reference implementation**: Every math function must be tested against Python's `math` module, `mpmath` library (at high precision), or published mathematical tables. +- **Property-based testing with hypothesis**: Use hypothesis to generate random valid inputs and verify properties hold (e.g., `sin(x)^2 + cos(x)^2 == 1`, `log(a*b) == log(a) + log(b)`). +- **Edge cases**: Every function must have explicit tests for: zero, negative zero, very small numbers (near epsilon), very large numbers, domain boundaries (e.g., asin(1), asin(1.0000001)), and special values (pi, e, multiples of pi/2 for trig). +- **Tolerance**: Results must match reference values within 1 ULP (unit in the last place) for basic functions. Document any functions where wider tolerance is accepted, with justification. 
+
+**Example accuracy test pattern:**
+
+```python
+import math
+import pytest
+from hypothesis import given, strategies as st
+from calcengine.engine.trigonometry import sin, cos
+
+class TestSinAccuracy:
+    """Validate sin() accuracy against math.sin and known exact values."""
+
+    @pytest.mark.accuracy
+    @pytest.mark.parametrize("input_val, expected", [
+        (0, 0.0),
+        (math.pi / 6, 0.5),
+        (math.pi / 4, math.sqrt(2) / 2),
+        (math.pi / 2, 1.0),
+        (math.pi, 0.0),
+        (3 * math.pi / 2, -1.0),
+        (2 * math.pi, 0.0),
+        (-math.pi / 2, -1.0),
+    ])
+    def test_sin_known_values(self, input_val: float, expected: float) -> None:
+        result = sin(input_val)
+        assert result == pytest.approx(expected, abs=1e-15)
+
+    @pytest.mark.accuracy
+    @given(st.floats(min_value=-1e6, max_value=1e6, allow_nan=False, allow_infinity=False))
+    def test_sin_matches_stdlib(self, x: float) -> None:
+        assert sin(x) == pytest.approx(math.sin(x), rel=1e-15)
+
+    @pytest.mark.accuracy
+    @given(st.floats(min_value=-1e6, max_value=1e6, allow_nan=False, allow_infinity=False))
+    def test_pythagorean_identity(self, x: float) -> None:
+        assert sin(x) ** 2 + cos(x) ** 2 == pytest.approx(1.0, abs=1e-14)
+```
+
+### Integration Testing Standards
+
+- **Scope**: Test full API request/response cycle through FastAPI test client. Test DynamoDB interactions with moto.
+- **Environment**: Local. No deployed services needed. `moto` mocks all AWS services.
+- **Data Management**: Each test creates its own DynamoDB table via moto fixture and tears down after. No shared test state.
+ +### CI/CD Testing Gates + +| Pipeline Stage | Required Tests | Tooling | Failure Action | +| ------------------------ | ------------------------------------------------------------- | ------------------------------- | --------------------------------------------- | +| Pre-commit | ruff check, ruff format --check, mypy | ruff, mypy via pre-commit hooks | Block commit | +| Pull Request | Unit tests, accuracy tests, integration tests, coverage check | pytest, GitHub Actions | Block merge | +| Pre-deploy (staging) | All PR tests + load test (100 concurrent, 60 seconds) | pytest + Locust, GitHub Actions | Block deploy | +| Post-deploy (production) | Smoke tests (10 representative calculations against live API) | pytest + httpx | Alert on-call. Auto-rollback if >50% failure. | + +### Running Tests Locally + +```bash +# Run all tests +uv run pytest + +# Run only unit tests +uv run pytest tests/unit/ -m unit + +# Run only accuracy tests +uv run pytest tests/accuracy/ -m accuracy + +# Run with coverage report +uv run pytest --cov --cov-report=term-missing + +# Run type checking +uv run mypy src/ + +# Run linter +uv run ruff check src/ tests/ + +# Run formatter check (no changes) +uv run ruff format --check src/ tests/ + +# Run formatter (apply changes) +uv run ruff format src/ tests/ +``` + +--- + +## Project Structure + +```text +calcengine/ + .github/ + workflows/ + ci.yml # GitHub Actions: lint, type check, test on PR + deploy.yml # GitHub Actions: CDK deploy on merge to main + src/ + calcengine/ + __init__.py + main.py # FastAPI app creation, Mangum handler + config.py # Settings via Pydantic BaseSettings + api/ + __init__.py + router.py # Top-level API router + endpoints/ + __init__.py + evaluate.py # POST /v1/evaluate (expression evaluation) + arithmetic.py # POST /v1/arithmetic/{operation} + trigonometry.py # POST /v1/trigonometry/{function} + statistics.py # POST /v1/statistics/{function} + constants.py # GET /v1/constants/{name} + middleware/ + __init__.py + 
auth.py # API key validation middleware + rate_limit.py # Rate limiting middleware + request_logging.py # Structured request/response logging + models/ + __init__.py + requests.py # Pydantic request models + responses.py # Pydantic response models + errors.py # Error response models and error codes + engine/ + __init__.py + expression_parser.py # Tokenizer, AST builder, evaluator + arithmetic.py # Basic math operations + trigonometry.py # Trig functions with domain validation + statistics.py # Descriptive statistics functions + constants.py # Mathematical constants + combinatorics.py # Factorial, permutations, combinations + logarithmic.py # Log, ln, exp functions + validation.py # Input validation, domain checking + errors.py # Math-domain exception types + storage/ + __init__.py + dynamodb.py # DynamoDB client, table operations + api_keys.py # API key CRUD, validation, hashing + usage.py # Usage metering, rate limit counters + logging.py # structlog configuration + infrastructure/ + app.py # CDK app entry point + stacks/ + __init__.py + api_stack.py # Lambda, API Gateway, custom domain + data_stack.py # DynamoDB tables + monitoring_stack.py # CloudWatch dashboards, alarms + auth_stack.py # Cognito user pool + docs_stack.py # S3 + CloudFront for documentation site + tests/ + unit/ + test_arithmetic.py + test_trig.py + test_statistics.py + test_expression_parser.py + test_combinatorics.py + test_logarithmic.py + test_validation.py + test_api_keys.py + integration/ + test_api_evaluate.py + test_api_arithmetic.py + test_api_trig.py + test_api_statistics.py + test_api_auth.py + test_api_rate_limiting.py + accuracy/ + test_trig_accuracy.py + test_arithmetic_accuracy.py + test_statistics_accuracy.py + test_logarithmic_accuracy.py + test_expression_parser_accuracy.py + conftest.py # Shared fixtures (FastAPI test client, moto mocks) + examples/ + api-endpoint/ + README.md + example_endpoint.py + test_example_endpoint.py + math-function/ + README.md + example_function.py + 
test_example_function.py + cdk-construct/ + README.md + example_stack.py + docs/ + static/ # Documentation portal source (Jinja2 templates) + pyproject.toml + uv.lock + .python-version # Contains: 3.12 + .gitignore + .pre-commit-config.yaml + README.md +``` + +### Directory Rules + +| Directory | Contains | Rules | +| ------------------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------ | +| `src/calcengine/` | All application source code | Only Python. No config files, no tests, no docs. | +| `src/calcengine/engine/` | Pure math functions | No AWS imports. No HTTP imports. No side effects. Pure functions only. Must be testable without any mocking. | +| `src/calcengine/api/` | FastAPI routes, middleware, models | HTTP-layer only. Calls engine functions. Does not contain math logic. | +| `src/calcengine/storage/` | DynamoDB access layer | All AWS data access isolated here. No business logic. | +| `infrastructure/` | CDK stacks | Python CDK only. No application code. | +| `tests/` | All tests | Mirrors `src/` structure. Separate `unit/`, `integration/`, `accuracy/` directories. | +| `examples/` | Template code for patterns | Working code with tests and README. Updated when standards change. | + +--- + +## Example and Template Code + +### Example 1: API Endpoint Pattern + +`examples/api-endpoint/README.md`: + +```markdown +# API Endpoint Pattern + +## What This Demonstrates +Standard pattern for adding a new calculation endpoint to CalcEngine. +Shows: route definition, Pydantic models, engine call, error handling, logging. + +## When to Use +- Adding any new calculation endpoint +- Adding any new HTTP route to the API + +## When Not to Use +- Internal engine functions (see math-function example) +- Infrastructure changes (see cdk-construct example) + +## Customization Guide +| Element | Customize? 
| Notes | +|---------|-----------|-------| +| Route path and method | Yes | Follow /v1/{category}/{function} convention | +| Request/response models | Yes | Define Pydantic models specific to the endpoint | +| Engine function call | Yes | Call the appropriate engine module function | +| Error handling structure | No | Always use CalcEngineError hierarchy and error_response() | +| Logging calls | No | Always log with request_id, api_key_id, duration_ms | +| Response envelope | No | Always return {"result": ..., "expression": ..., "computation_time_ms": ..., "engine_version": ...} | +``` + +`examples/api-endpoint/example_endpoint.py`: + +```python +"""Example: Standard API endpoint pattern for CalcEngine.""" + +import time + +import structlog +from fastapi import APIRouter, Depends +from pydantic import BaseModel, Field + +from calcengine.api.middleware.auth import get_api_key_id +from calcengine.api.models.errors import error_response +from calcengine.api.models.responses import CalculationResponse +from calcengine.engine.errors import CalcEngineError +from calcengine.engine.trigonometry import sin + +logger = structlog.get_logger() + +router = APIRouter() + + +class SinRequest(BaseModel): + """Request model for sine calculation.""" + + value: float = Field(..., description="Input angle") + angle_mode: str = Field( + default="radians", + pattern="^(radians|degrees)$", + description="Angle unit: 'radians' or 'degrees'", + ) + + +@router.post("/v1/trigonometry/sin", response_model=CalculationResponse) +async def calculate_sin( + request: SinRequest, + api_key_id: str = Depends(get_api_key_id), +) -> CalculationResponse | dict: + """Calculate the sine of the given value.""" + start = time.perf_counter() + + try: + result = sin(request.value, angle_mode=request.angle_mode) + elapsed = (time.perf_counter() - start) * 1000 + + logger.info( + "calculation_completed", + endpoint="/v1/trigonometry/sin", + input_value=request.value, + angle_mode=request.angle_mode, + 
result=result, + computation_time_ms=round(elapsed, 3), + api_key_id=api_key_id, + ) + + return CalculationResponse( + result=result, + expression=f"sin({request.value})", + computation_time_ms=round(elapsed, 3), + ) + + except CalcEngineError as e: + elapsed = (time.perf_counter() - start) * 1000 + logger.warning( + "calculation_failed", + endpoint="/v1/trigonometry/sin", + input_value=request.value, + error_code=e.code, + error_detail=str(e), + computation_time_ms=round(elapsed, 3), + api_key_id=api_key_id, + ) + return error_response(e) +``` + +`examples/api-endpoint/test_example_endpoint.py`: + +```python +"""Example: Standard test pattern for a CalcEngine API endpoint.""" + +import math + +import pytest +from fastapi.testclient import TestClient + +from calcengine.main import app + + +@pytest.fixture +def client() -> TestClient: + """Create a test client with a mocked API key.""" + return TestClient(app) + + +class TestSinEndpoint: + """Tests for POST /v1/trigonometry/sin.""" + + @pytest.mark.unit + def test_sin_zero_radians(self, client: TestClient) -> None: + response = client.post( + "/v1/trigonometry/sin", + json={"value": 0, "angle_mode": "radians"}, + headers={"Authorization": "Bearer test-api-key"}, + ) + assert response.status_code == 200 + data = response.json() + assert data["result"] == pytest.approx(0.0) + assert "computation_time_ms" in data + + @pytest.mark.unit + def test_sin_pi_over_2_radians(self, client: TestClient) -> None: + response = client.post( + "/v1/trigonometry/sin", + json={"value": math.pi / 2, "angle_mode": "radians"}, + headers={"Authorization": "Bearer test-api-key"}, + ) + assert response.status_code == 200 + assert response.json()["result"] == pytest.approx(1.0) + + @pytest.mark.unit + def test_sin_90_degrees(self, client: TestClient) -> None: + response = client.post( + "/v1/trigonometry/sin", + json={"value": 90, "angle_mode": "degrees"}, + headers={"Authorization": "Bearer test-api-key"}, + ) + assert response.status_code 
== 200 + assert response.json()["result"] == pytest.approx(1.0) + + @pytest.mark.unit + def test_sin_invalid_angle_mode(self, client: TestClient) -> None: + response = client.post( + "/v1/trigonometry/sin", + json={"value": 1.0, "angle_mode": "gradians"}, + headers={"Authorization": "Bearer test-api-key"}, + ) + assert response.status_code == 422 # Pydantic validation error + + @pytest.mark.unit + def test_sin_missing_auth(self, client: TestClient) -> None: + response = client.post( + "/v1/trigonometry/sin", + json={"value": 0}, + ) + assert response.status_code == 401 +``` + +### Example 2: Pure Math Function Pattern + +`examples/math-function/README.md`: + +```markdown +# Math Function Pattern + +## What This Demonstrates +Standard pattern for implementing a pure math function in the engine layer. +Shows: function signature, type hints, domain validation, error raising, docstring format. + +## When to Use +- Adding any new mathematical function to src/calcengine/engine/ + +## When Not to Use +- API endpoints (see api-endpoint example) +- Functions that require AWS or HTTP access (those belong in api/ or storage/) + +## Key Rules +- No imports from api/, storage/, or any external service +- Pure functions only: same input always produces same output +- Raise CalcEngineError subclasses for domain errors, never return None or NaN +- Type hints on all parameters and return values +``` + +`examples/math-function/example_function.py`: + +```python +"""Example: Standard pattern for a pure math function in CalcEngine engine layer.""" + +import math + +from calcengine.engine.errors import DomainError + + +def log_base(value: float, base: float = 10.0) -> float: + """Compute the logarithm of a value with the given base. + + Args: + value: The number to compute the logarithm of. Must be positive. + base: The logarithm base. Must be positive and not equal to 1. + Defaults to 10 (common logarithm). + + Returns: + The logarithm of value in the given base. 
+ + Raises: + DomainError: If value <= 0, base <= 0, or base == 1. + """ + if value <= 0: + raise DomainError( + code="DOMAIN_ERROR", + message=f"Cannot compute logarithm of {value}", + detail="Logarithm is only defined for positive numbers", + parameter="value", + ) + + if base <= 0: + raise DomainError( + code="DOMAIN_ERROR", + message=f"Cannot use {base} as logarithm base", + detail="Logarithm base must be positive", + parameter="base", + ) + + if base == 1.0: + raise DomainError( + code="DOMAIN_ERROR", + message="Cannot use 1 as logarithm base", + detail="Logarithm base 1 is undefined (division by zero in change-of-base)", + parameter="base", + ) + + return math.log(value) / math.log(base) +``` + +`examples/math-function/test_example_function.py`: + +```python +"""Example: Standard test pattern for a pure math function.""" + +import math + +import pytest +from hypothesis import given, strategies as st + +from calcengine.engine.errors import DomainError +from calcengine.engine.logarithmic import log_base + + +class TestLogBase: + """Tests for log_base function.""" + + # --- Known values --- + + @pytest.mark.unit + def test_log10_of_100(self) -> None: + assert log_base(100, 10) == pytest.approx(2.0) + + @pytest.mark.unit + def test_log2_of_8(self) -> None: + assert log_base(8, 2) == pytest.approx(3.0) + + @pytest.mark.unit + def test_ln_of_e(self) -> None: + assert log_base(math.e, math.e) == pytest.approx(1.0) + + @pytest.mark.unit + def test_log_of_1_any_base(self) -> None: + assert log_base(1, 10) == pytest.approx(0.0) + assert log_base(1, 2) == pytest.approx(0.0) + assert log_base(1, math.e) == pytest.approx(0.0) + + # --- Default base --- + + @pytest.mark.unit + def test_default_base_is_10(self) -> None: + assert log_base(1000) == pytest.approx(3.0) + + # --- Domain errors --- + + @pytest.mark.unit + def test_log_of_zero_raises_domain_error(self) -> None: + with pytest.raises(DomainError, match="Cannot compute logarithm"): + log_base(0) + + @pytest.mark.unit 
+ def test_log_of_negative_raises_domain_error(self) -> None: + with pytest.raises(DomainError, match="Cannot compute logarithm"): + log_base(-5) + + @pytest.mark.unit + def test_log_base_zero_raises_domain_error(self) -> None: + with pytest.raises(DomainError, match="Cannot use 0"): + log_base(10, 0) + + @pytest.mark.unit + def test_log_base_one_raises_domain_error(self) -> None: + with pytest.raises(DomainError, match="Cannot use 1"): + log_base(10, 1) + + @pytest.mark.unit + def test_log_base_negative_raises_domain_error(self) -> None: + with pytest.raises(DomainError, match="Cannot use -2"): + log_base(10, -2) + + # --- Property-based: accuracy against stdlib --- + + @pytest.mark.accuracy + @given( + st.floats(min_value=1e-300, max_value=1e300, allow_nan=False, allow_infinity=False), + ) + def test_log10_matches_stdlib(self, x: float) -> None: + assert log_base(x, 10) == pytest.approx(math.log10(x), rel=1e-14) + + @pytest.mark.accuracy + @given( + st.floats(min_value=1e-300, max_value=1e300, allow_nan=False, allow_infinity=False), + ) + def test_log2_matches_stdlib(self, x: float) -> None: + assert log_base(x, 2) == pytest.approx(math.log2(x), rel=1e-14) + + # --- Property-based: mathematical identity --- + + @pytest.mark.accuracy + @given( + a=st.floats(min_value=1e-100, max_value=1e100, allow_nan=False, allow_infinity=False), + b=st.floats(min_value=1e-100, max_value=1e100, allow_nan=False, allow_infinity=False), + ) + def test_log_product_identity(self, a: float, b: float) -> None: + """log(a * b) should equal log(a) + log(b).""" + if a * b > 0: + assert log_base(a * b, 10) == pytest.approx( + log_base(a, 10) + log_base(b, 10), rel=1e-10 + ) +``` + +### Example 3: CDK Construct Pattern + +`examples/cdk-construct/README.md`: + +```markdown +# CDK Construct Pattern + +## What This Demonstrates +Standard pattern for defining a CDK stack for CalcEngine infrastructure. +Shows: Lambda function, API Gateway integration, DynamoDB table, IAM permissions. 
+ +## When to Use +- Adding new infrastructure resources to the project + +## Key Rules +- All infrastructure in infrastructure/stacks/ directory +- One stack per logical group (api, data, monitoring, auth, docs) +- Use environment variables from CDK context, never hardcode +- Least-privilege IAM: each Lambda gets only the permissions it needs +``` + +`examples/cdk-construct/example_stack.py`: + +```python +"""Example: Standard CDK stack pattern for CalcEngine.""" + +from aws_cdk import Duration, Stack +from aws_cdk import aws_apigatewayv2 as apigwv2 +from aws_cdk import aws_dynamodb as dynamodb +from aws_cdk import aws_lambda as lambda_ +from aws_cdk import aws_logs as logs +from aws_cdk.aws_apigatewayv2_integrations import HttpLambdaIntegration +from constructs import Construct + + +class ExampleApiStack(Stack): + """Example stack showing Lambda + API Gateway + DynamoDB pattern.""" + + def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: + super().__init__(scope, construct_id, **kwargs) + + # DynamoDB table - single table design + table = dynamodb.Table( + self, + "ExampleTable", + partition_key=dynamodb.Attribute( + name="PK", type=dynamodb.AttributeType.STRING + ), + sort_key=dynamodb.Attribute( + name="SK", type=dynamodb.AttributeType.STRING + ), + billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST, + encryption=dynamodb.TableEncryption.AWS_MANAGED, + point_in_time_recovery=True, + ) + + # Lambda function + handler = lambda_.Function( + self, + "ExampleHandler", + runtime=lambda_.Runtime.PYTHON_3_12, + handler="calcengine.main.handler", + code=lambda_.Code.from_asset("src/"), + memory_size=256, + timeout=Duration.seconds(30), + environment={ + "TABLE_NAME": table.table_name, + "LOG_LEVEL": "INFO", + }, + log_retention=logs.RetentionDays.ONE_MONTH, + ) + + # Grant Lambda read/write access to DynamoDB (least privilege) + table.grant_read_write_data(handler) + + # HTTP API Gateway with Lambda integration + api = apigwv2.HttpApi( + self, + 
"ExampleHttpApi", + api_name="calcengine-api", + default_integration=HttpLambdaIntegration( + "LambdaIntegration", + handler, + ), + ) +``` + +--- + +## How This Document Feeds Into AI-DLC + +| Section | AI-DLC Stage | How It Is Used | +| ----------------------------- | ---------------------------------- | ------------------------------------------------------------------------ | +| Project Technical Summary | Workspace Detection | Greenfield classification, team context | +| Programming Languages | Code Generation | Python 3.12 enforced, no other languages without approval | +| uv Standards | Code Generation | All dependency operations use uv, pyproject.toml is single config source | +| Frameworks and Libraries | Code Generation, NFR Design | FastAPI + Pydantic + Mangum stack, prohibited library enforcement | +| Cloud Services Allow/Disallow | Infrastructure Design | Lambda + API Gateway + DynamoDB only for MVP | +| Architecture Pattern | Application Design | Modular monolith, module boundaries in engine/ vs api/ vs storage/ | +| API Design Standards | Functional Design, Code Generation | Endpoint conventions, error codes, response format | +| Security Requirements | NFR Requirements, NFR Design | Input validation rules, no eval(), API key auth pattern | +| Testing Requirements | Code Generation, Build and Test | pytest + hypothesis, 90% coverage, accuracy tests mandatory | +| Project Structure | Code Generation | Exact directory layout and file placement rules | +| Example Code | Code Generation | Canonical patterns for endpoints, engine functions, CDK stacks | diff --git a/docs/writing-inputs/example-vision-scientific-calculator-api.md b/docs/writing-inputs/example-vision-scientific-calculator-api.md new file mode 100644 index 00000000..1a7b1a4d --- /dev/null +++ b/docs/writing-inputs/example-vision-scientific-calculator-api.md @@ -0,0 +1,366 @@ +# Vision Document: CalcEngine Scientific Calculator API + +## Executive Summary + +CalcEngine is a hosted scientific 
calculator library delivered as a REST API that enables software development teams to embed accurate, standards-compliant mathematical computation into their own applications without building or maintaining calculation logic themselves. It addresses the recurring problem of engineering teams spending months implementing, testing, and debugging mathematical functions that are peripheral to their core product. The expected outcome is a subscription API service generating $2M ARR within 24 months by capturing developers building in education technology, financial modeling, engineering simulation, and data analysis. + +--- + +## Business Context + +### Problem Statement + +Software teams building products in education, finance, engineering, and science regularly need mathematical computation beyond basic arithmetic. They face a choice: use a local library (often language-specific, inconsistent across platforms, and a maintenance burden) or build the math functions themselves (expensive, error-prone, and slow). + +The specific problems are: + +- **Accuracy risk**: Teams without mathematics expertise introduce subtle floating-point errors, incorrect edge-case handling (division by zero, overflow, domain errors), and inconsistent rounding behavior that erode trust in their products. +- **Duplicated effort**: Every team that needs trigonometric functions, statistical distributions, matrix operations, or unit conversions builds them independently. This work is repeated across thousands of companies. +- **Cross-platform inconsistency**: A calculation performed in a Python backend may produce a different result than the same calculation in a JavaScript frontend. Customers who operate across platforms cannot guarantee consistency. +- **Compliance and auditability**: In regulated industries (finance, healthcare, engineering), calculations must be traceable, versioned, and validated. Ad-hoc implementations rarely meet audit requirements. 
+ +### Business Drivers + +- **API-first economy**: Developers increasingly prefer consuming hosted APIs over embedding libraries. Stripe (payments), Twilio (communications), and SendGrid (email) have proven the model. No equivalent exists for mathematical computation. +- **EdTech growth**: The global education technology market is expanding rapidly, and calculator functionality is a universal requirement across math, science, and engineering courseware. +- **Regulatory pressure**: Financial services firms face increasing scrutiny on calculation accuracy. A certified, auditable API reduces their compliance burden. +- **AI/ML preprocessing**: Data science teams need reliable mathematical transformations as preprocessing steps. An API that guarantees precision and reproducibility has clear value in ML pipelines. + +### Target Users and Stakeholders + +| User Type | Description | Primary Need | +| ----------------------------------- | ------------------------------------------------------------ | ---------------------------------------------------------------------------- | +| Application Developer | Backend or frontend engineer integrating math into a product | Reliable, well-documented API with consistent results across calls | +| EdTech Product Manager | Non-technical stakeholder at an education company | Confidence that calculation results shown to students are correct | +| Financial Analyst / Quant Developer | Developer building financial models or trading systems | Arbitrary-precision arithmetic with auditable, versioned calculation history | +| Engineering Simulation Developer | Engineer building CAD, physics, or modeling tools | High-performance matrix, vector, and differential equation operations | +| Data Scientist | Analyst building preprocessing pipelines | Consistent statistical functions callable from any language or platform | +| DevOps / Platform Engineer | Engineer responsible for uptime and integration | Low-latency, high-availability API with clear 
SLAs and monitoring | + +### Business Constraints + +- **Bootstrap budget**: Initial development funded from existing revenue. No external investment planned for MVP. Total MVP budget is $150K (covering development, infrastructure, and initial marketing). +- **Small team**: Two backend developers, one frontend developer (for documentation portal), one QA engineer. No dedicated math PhD on staff for MVP phase. +- **Time to market**: MVP must be publicly available within 6 months to validate demand before committing to Phase 2 investment. +- **Pricing model**: Must support a free tier (to drive adoption) and usage-based paid tiers. Pricing infrastructure needed at launch. +- **Accuracy standards**: All functions must match or exceed the accuracy of IEEE 754 double-precision arithmetic. Arbitrary-precision mode is a Phase 2 feature, not MVP. + +### Success Metrics + +| Metric | Current State | Target State (12 months post-MVP) | Measurement Method | +| ------------------------------------ | -------------- | ----------------------------------- | -------------------------------------------- | +| Registered API accounts | 0 | 5,000 | API key registration count | +| Monthly active API consumers | 0 | 1,200 | Unique API keys making at least 1 call/month | +| API calls per month | 0 | 10 million | CloudWatch API Gateway metrics | +| Paid subscribers | 0 | 200 | Billing system records | +| Monthly recurring revenue | $0 | $80K | Billing system records | +| API uptime | N/A | 99.9% | CloudWatch availability monitoring | +| Mean response time (p50) | N/A | < 50ms | CloudWatch latency metrics | +| Customer-reported accuracy bugs | N/A | < 5 per quarter | Support ticket tracking | +| Developer documentation satisfaction | N/A | > 4.2 / 5.0 | Quarterly survey of registered developers | + +--- + +## Full Scope Vision + +### Product Vision Statement + +CalcEngine becomes the default computation layer for any application that needs mathematical operations beyond basic arithmetic, 
the way Stripe became the default for payments, by offering an API that is more accurate, more consistent, and easier to integrate than building it yourself. + +### Feature Areas + +#### Feature Area 1: Core Arithmetic and Algebra + +- **Description**: Fundamental mathematical operations that go beyond what standard language math libraries provide reliably. +- **Key Capabilities**: + - Arbitrary-precision integer and decimal arithmetic (configurable precision up to 1,000 digits) + - Expression parsing and evaluation (accept string expressions like "2 * sin(pi/4) + log(100)") + - Polynomial operations (evaluation, root finding, factoring) + - Equation solving (linear, quadratic, cubic, systems of linear equations) + - Fraction and rational number arithmetic (exact representation, simplification) + - Complex number arithmetic (addition, multiplication, polar/rectangular conversion) +- **User Value**: Developers send a math expression as a string and get a precise, verified result without implementing a parser or worrying about operator precedence, floating-point drift, or edge cases. + +#### Feature Area 2: Trigonometry and Geometry + +- **Description**: Complete trigonometric and geometric calculation capabilities. +- **Key Capabilities**: + - All six trigonometric functions and their inverses (sin, cos, tan, csc, sec, cot) + - Hyperbolic functions and inverses + - Degree/radian/gradian conversion + - Coordinate system conversions (Cartesian, polar, spherical, cylindrical) + - Geometric calculations (area, volume, perimeter for standard shapes) + - Distance and angle calculations in 2D and 3D space +- **User Value**: Eliminates the need to implement trigonometric edge cases (exact values at special angles, quadrant handling, domain validation). + +#### Feature Area 3: Statistics and Probability + +- **Description**: Statistical analysis and probability distribution functions. 
+- **Key Capabilities**: + - Descriptive statistics (mean, median, mode, variance, standard deviation, quartiles, percentiles) + - Probability distributions (normal, binomial, Poisson, chi-squared, t-distribution, F-distribution) with PDF, CDF, and inverse CDF + - Regression analysis (linear, polynomial, exponential, logarithmic) + - Hypothesis testing (t-test, chi-squared test, ANOVA) + - Combinatorics (permutations, combinations, factorial, binomial coefficients) + - Random number generation with configurable distributions and seeds +- **User Value**: A single API call replaces importing and configuring statistical libraries. Results are reproducible and auditable. + +#### Feature Area 4: Linear Algebra and Matrix Operations + +- **Description**: Matrix and vector computation for engineering, graphics, and data science. +- **Key Capabilities**: + - Matrix arithmetic (addition, multiplication, scalar operations) + - Matrix decompositions (LU, QR, SVD, Cholesky, eigenvalue) + - Determinant, inverse, rank, trace + - Vector operations (dot product, cross product, normalization) + - Systems of linear equations (Gaussian elimination, least squares) + - Sparse matrix support for large-scale problems +- **User Value**: Teams building simulation, ML, or graphics applications get validated linear algebra without linking to LAPACK or maintaining native bindings. + +#### Feature Area 5: Calculus + +- **Description**: Symbolic and numerical calculus operations. +- **Key Capabilities**: + - Numerical differentiation (first and higher-order derivatives) + - Numerical integration (definite integrals with configurable methods: trapezoidal, Simpson's, Gaussian quadrature) + - Symbolic differentiation and integration (for supported expression types) + - Limits and series expansion (Taylor, Maclaurin) + - Ordinary differential equation solvers (Euler, Runge-Kutta) +- **User Value**: Engineers and scientists get calculus operations via API without embedding a computer algebra system. 
+ +#### Feature Area 6: Unit Conversion and Physical Constants + +- **Description**: Standard unit conversion and access to verified physical and mathematical constants. +- **Key Capabilities**: + - Unit conversion across all SI and common imperial units (length, mass, temperature, time, energy, pressure, speed, etc.) + - Currency conversion (with daily rate updates from a financial data provider) + - Physical constants (speed of light, Planck's constant, Avogadro's number, etc.) with cited sources and uncertainty values + - Mathematical constants to configurable precision (pi, e, golden ratio, etc.) + - Dimensional analysis (validate that unit combinations are physically meaningful) +- **User Value**: One API replaces multiple conversion libraries and hardcoded constant values, with the guarantee that constants are sourced and current. + +#### Feature Area 7: Financial Mathematics + +- **Description**: Financial calculation functions for lending, investment, and risk analysis. +- **Key Capabilities**: + - Time value of money (present value, future value, annuities, perpetuities) + - Loan amortization schedules + - Bond pricing and yield calculations + - Option pricing (Black-Scholes, binomial model) + - Internal rate of return (IRR) and net present value (NPV) + - Depreciation methods (straight-line, declining balance, sum-of-years) +- **User Value**: FinTech companies get auditable, regulation-ready financial calculations without building proprietary math engines. + +#### Feature Area 8: Developer Experience and Platform + +- **Description**: The API platform, documentation, SDKs, and developer tools that make CalcEngine easy to adopt. 
+- **Key Capabilities**: + - Interactive API documentation with live "try it" sandbox + - Client SDKs for Python, JavaScript/TypeScript, Java, C#, Go, and Ruby + - Webhook support for long-running calculations (batch processing) + - Calculation history and audit log per API key + - Rate limiting with clear quotas and overage handling + - API versioning with 12-month deprecation policy + - Workspace feature for teams (shared API keys, usage dashboards, billing management) +- **User Value**: Developers can go from signup to first successful API call in under 5 minutes. + +### Integration Points + +- **Payment processor** (Stripe) - Subscription billing and usage-based metering +- **Identity provider** (Auth0 or Cognito) - Developer account authentication +- **Financial data provider** (for currency rates) - Daily exchange rate feeds +- **NIST / CODATA** - Source of truth for physical constants +- **CI/CD systems** (GitHub Actions, GitLab CI) - SDK publishing and version management +- **Monitoring** (Datadog or CloudWatch) - API performance, error rates, usage dashboards + +### User Journeys (Full Vision) + +#### Journey 1: EdTech Developer Adds Calculation to a Course Platform + +1. Developer discovers CalcEngine through a search for "scientific calculator API" and lands on the documentation site. +2. Developer creates a free account and gets an API key in under 2 minutes. +3. Developer browses the interactive documentation, finds the trigonometry endpoint, and tests `sin(pi/4)` in the sandbox. +4. Developer installs the Python SDK via pip and writes a 3-line integration that sends student-entered expressions to CalcEngine and displays the result. +5. Developer configures the API to return step-by-step solution breakdowns so students can see how the answer was derived. +6. Course platform goes live. Thousands of students submit calculations daily. 
The developer monitors usage through the CalcEngine dashboard and upgrades to a paid tier when free-tier limits are reached. + +**Outcome**: The education platform ships a reliable calculator feature in one afternoon instead of spending weeks building and testing math parsing. + +#### Journey 2: FinTech Startup Builds a Loan Comparison Tool + +1. Product team at a lending startup needs amortization schedules, APR calculations, and present-value computations for a customer-facing loan comparison tool. +2. Developer signs up for CalcEngine and navigates to the Financial Mathematics section. +3. Developer uses the loan amortization endpoint to generate a payment schedule for a 30-year mortgage at 6.5% interest. The API returns month-by-month principal, interest, and balance breakdowns. +4. Developer integrates the NPV and IRR endpoints to let customers compare different loan offers side by side. +5. Compliance team reviews CalcEngine's accuracy certification and audit log. Each calculation is traceable to a versioned API call with timestamped inputs and outputs. +6. The loan comparison tool launches. CalcEngine handles 500K calculations per month. The startup pays based on usage and avoids hiring a quant developer. + +**Outcome**: The FinTech startup launches a compliant, auditable financial tool without building proprietary calculation logic. + +#### Journey 3: Data Scientist Uses CalcEngine in an ML Pipeline + +1. A data scientist at a healthcare company needs to normalize patient measurement data using statistical transformations (z-scores, percentile ranks, log transforms) as preprocessing before model training. +2. Data scientist installs the CalcEngine Python SDK and calls the statistics endpoints from within a Jupyter notebook. +3. The SDK accepts arrays of values and returns descriptive statistics and transformed datasets. +4. The data scientist configures batch mode to process 100K records. CalcEngine returns results via webhook when processing completes. +5. 
Because CalcEngine guarantees reproducible results (same inputs, same outputs, across versions), the scientist can cite the API version in their research paper for reproducibility. + +**Outcome**: The scientist gets validated, reproducible statistical transformations without writing and debugging custom statistics code. + +### Scalability and Growth + +- **Geographic expansion**: Initial deployment in US-East. Expand to EU-West and AP-Southeast within 12 months of MVP based on user geography data. +- **Volume growth**: Architect for 1 billion API calls/month within 3 years. Start serverless, migrate high-traffic endpoints to containers if latency requires it. +- **Feature growth**: New feature areas added based on customer demand data. Candidates include: number theory, graph theory, signal processing, optimization solvers. +- **Enterprise expansion**: Introduce on-premises deployment option for regulated industries that cannot send data to a shared API. Target Phase 3. +- **Marketplace presence**: List on AWS Marketplace, Azure Marketplace, and RapidAPI for additional distribution channels. 
+ +### Long-Term Roadmap + +| Phase | Focus | Timeframe | +| ------- | ------------------------------------------------------------------------------------------------------------------------------------- | ------------ | +| MVP | Core arithmetic, trigonometry, basic statistics, expression evaluation, API platform, documentation portal, free + paid tiers | Months 1-6 | +| Phase 2 | Linear algebra, calculus, financial math, arbitrary-precision mode, client SDKs (6 languages), calculation audit log, team workspaces | Months 7-14 | +| Phase 3 | Unit conversion, physical constants, step-by-step solutions, batch processing, enterprise features, on-premises option | Months 15-22 | +| Phase 4 | Advanced statistics (hypothesis testing, regression), symbolic computation, optimization solvers, marketplace listings | Months 23-30 | + +--- + +## MVP Scope + +### MVP Objective + +Prove that developers will pay for a hosted scientific calculator API by launching with core mathematical functions, validating adoption through free-tier signups, and converting at least 50 accounts to paid plans within 6 months of launch. 
+ +### MVP Success Criteria + +- [ ] 1,000 registered developer accounts within 3 months of launch +- [ ] 300 monthly active API consumers (at least 1 call/month) within 3 months +- [ ] 50 paid subscribers within 6 months +- [ ] $15K MRR within 6 months +- [ ] API uptime of 99.9% over first 3 months +- [ ] Mean response time (p50) under 50ms for all MVP endpoints +- [ ] Zero critical accuracy bugs reported (calculations returning wrong results) +- [ ] Net Promoter Score of 40+ from developer survey at 3-month mark + +### Features In Scope (MVP) + +| Feature | Description | Priority | Rationale for Inclusion | +| ------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------ | +| Basic arithmetic operations | Add, subtract, multiply, divide, modulo, power, square root, nth root, absolute value, floor, ceiling, rounding | Must Have | Foundation for all other calculations. Table stakes for any calculator API. | +| Expression evaluation | Accept a string math expression (e.g., "2 * (3 + 4)^2 / sin(pi)") and return the evaluated result. Support operator precedence, parentheses, and nested functions. | Must Have | The single most valuable differentiator. Developers send expressions as strings instead of building parsers. | +| Trigonometric functions | sin, cos, tan, asin, acos, atan, atan2 with degree and radian mode | Must Have | Universal requirement across EdTech, engineering, and graphics use cases. | +| Logarithmic and exponential functions | log (base 10), ln (natural log), log with arbitrary base, exp, power | Must Have | Required for financial, scientific, and statistical calculations. 
| +| Basic statistics | Mean, median, mode, standard deviation, variance, min, max, sum, count, percentile | Must Have | High-frequency need. Validates demand from data science and EdTech segments. | +| Mathematical constants | pi, e, golden ratio (phi), sqrt(2), sqrt(3), ln(2), ln(10) to IEEE 754 double precision | Must Have | Low implementation cost, high utility. Prevents developers from hardcoding imprecise values. | +| Factorial, permutations, combinations | n!, nPr, nCr with large number support | Must Have | Required for probability and combinatorics use cases in EdTech. | +| Error handling and domain validation | Clear error responses for domain errors (sqrt of negative, log of zero, division by zero), overflow, and invalid expressions. Structured error format with error codes. | Must Have | Professional API quality. Bad error handling is the top reason developers abandon APIs. | +| API key management | Developer signup, API key generation, key rotation, key revocation | Must Have | Minimum authentication infrastructure for a commercial API. | +| Usage metering and rate limiting | Track calls per API key. Free tier: 10,000 calls/month. Paid tier: 1M calls/month. Clear rate limit headers in responses. | Must Have | Revenue model depends on usage-based pricing. Must be present at launch. | +| REST API with JSON | All endpoints accept JSON, return JSON. Standard REST conventions. OpenAPI 3.x specification published. | Must Have | Expected standard for modern APIs. | +| API documentation portal | Hosted documentation site with endpoint reference, code examples in 3 languages (Python, JavaScript, cURL), and interactive "try it" sandbox. | Must Have | Developer adoption depends entirely on documentation quality. | +| Billing integration | Stripe-based subscription billing. Free tier, Starter ($29/mo), Professional ($99/mo). Usage overage billing. | Must Have | Revenue collection must be automated from day one. 
| + +### Features Explicitly Out of Scope (MVP) + +| Feature | Reason for Deferral | Target Phase | +| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | -------------- | +| Arbitrary-precision arithmetic | Adds significant complexity to every endpoint. Standard IEEE 754 double precision is sufficient for MVP validation. | Phase 2 | +| Matrix and linear algebra operations | Large feature surface area. Not needed to validate core business hypothesis. | Phase 2 | +| Calculus (differentiation, integration) | Requires numerical methods expertise and extensive edge-case testing. | Phase 2 | +| Financial mathematics | Specialized domain. Validate general developer demand first. | Phase 2 | +| Symbolic computation | Requires a computer algebra system. Out of scope for small team and MVP timeline. | Phase 3 | +| Step-by-step solution breakdowns | High value for EdTech but significant implementation effort. Validate demand through customer interviews during MVP. | Phase 3 | +| Unit conversion | Useful but not core to calculator value proposition. Many free alternatives exist. | Phase 3 | +| Physical constants database | Low implementation cost but low urgency. Include in Phase 3 with unit conversion. | Phase 3 | +| Client SDKs (Python, JS, Java, etc.) | Documentation with cURL and code examples is sufficient for MVP. SDKs accelerate adoption but are not required to validate demand. | Phase 2 | +| Batch processing / webhooks | Needed for high-volume users. MVP focuses on synchronous single-calculation calls. | Phase 3 | +| Calculation audit log | Important for regulated industries. Not needed for initial developer adoption. | Phase 2 | +| Team workspaces | Enterprise feature. Individual developer accounts are sufficient for MVP. | Phase 3 | +| On-premises deployment | Enterprise feature requiring significant packaging effort. 
| Phase 3+ | +| Probability distributions (PDF, CDF) | Useful but not core to MVP validation. Basic statistics covers initial demand. | Phase 2 | +| Regression analysis | Specialized statistical feature. Defer until statistics demand is validated. | Phase 4 | +| Complex number arithmetic | Niche use case. Validate demand from engineering users first. | Phase 2 | + +### MVP User Journeys + +#### Journey 1: Developer Discovers and Integrates CalcEngine + +1. Developer searches for "math expression evaluation API" and finds CalcEngine documentation. +2. Developer clicks "Get API Key" and completes a one-page signup form (email, password, company name optional). +3. Developer receives API key immediately on the confirmation page and in a welcome email. +4. Developer copies a cURL example from the documentation and runs it in their terminal: `curl -X POST https://api.calcengine.io/v1/evaluate -H "Authorization: Bearer {key}" -d '{"expression": "sin(pi/4) * 2 + sqrt(16)"}'` +5. Developer receives a JSON response: `{"result": 5.414213562373095, "expression": "sin(pi/4) * 2 + sqrt(16)", "precision": "double"}` +6. Developer reads the Python code example on the documentation site, copies it into their application, and replaces the expression string with user input. +7. Application is live. Developer monitors usage on the CalcEngine dashboard. + +**Outcome**: Developer goes from discovery to working integration in under 15 minutes. +**Limitation vs Full Vision**: No SDK (raw HTTP calls), no step-by-step breakdowns, no audit log. + +#### Journey 2: EdTech Company Evaluates CalcEngine for Student Use + +1. EdTech product manager asks their developer to evaluate CalcEngine for a homework-checking feature. +2. Developer signs up for the free tier and tests 20 common student calculations (quadratic formula, trig identities, basic statistics) using the API sandbox. +3. Developer verifies results against known correct answers. All match. +4. 
Developer integrates CalcEngine into the homework checker. Students type math expressions, the app sends them to CalcEngine, and the result is compared against the expected answer. +5. Free tier handles initial classroom pilot (500 students, ~8,000 calls/month). When the pilot expands to the full school district, the developer upgrades to the Starter plan. + +**Outcome**: EdTech company ships a homework-checking feature without building a math parser. +**Limitation vs Full Vision**: No step-by-step solutions for students, no complex number support, no calculus functions for advanced courses. + +### MVP Constraints and Assumptions + +- **Assumption**: Developers prefer a hosted API over a local library for math operations. **Risk if wrong**: Low adoption despite accurate computation. **Mitigation**: Free tier allows low-commitment validation; pivot to open-source library model if API model fails. +- **Assumption**: Expression evaluation (string-in, result-out) is the highest-value feature. **Risk if wrong**: Developers actually want individual function endpoints more than expression parsing. **Mitigation**: MVP includes both expression evaluation and individual function endpoints; usage data will reveal which is preferred. +- **Assumption**: IEEE 754 double precision is sufficient for MVP users. **Risk if wrong**: Early adopters in finance or science demand higher precision immediately. **Mitigation**: Arbitrary precision is Phase 2 priority and can be accelerated if demand signals are strong. +- **Assumption**: 10,000 free calls/month is enough to evaluate the product but low enough to drive paid conversion. **Risk if wrong**: Free tier is either too generous (no conversion) or too restrictive (users leave before evaluating). **Mitigation**: Adjust limit based on conversion data at 2-month mark. +- **Accepted Limitation**: No client SDKs at MVP. Developers must make raw HTTP calls. 
This adds friction but SDKs are expensive to build and maintain across multiple languages before product-market fit is validated. +- **Accepted Limitation**: Single-region deployment (US-East-1). Latency for users in Europe and Asia will be higher. Acceptable for MVP because calculation payloads are small (low bandwidth sensitivity). + +### MVP Definition of Done + +- [ ] All 13 "Must Have" features implemented, tested, and deployed +- [ ] API responds correctly to a validation suite of 500+ mathematical test cases covering all MVP functions +- [ ] Edge cases handled gracefully: division by zero, overflow, underflow, invalid expressions, domain errors (e.g., log(-1)) +- [ ] API uptime demonstrated at 99.9% over a 2-week burn-in period before public launch +- [ ] p50 response time under 50ms, p99 under 200ms measured over burn-in period +- [ ] Documentation portal live with endpoint reference, code examples (Python, JavaScript, cURL), and interactive sandbox +- [ ] Billing integration functional: free tier enforced, paid tier purchasable, usage tracked accurately +- [ ] OpenAPI 3.x specification published and downloadable +- [ ] Security review completed: API key authentication, rate limiting, input validation, no injection vulnerabilities +- [ ] Load test completed: API handles 1,000 concurrent requests without degradation +- [ ] Stakeholder demo completed and sign-off received + +--- + +## Risks and Dependencies + +### Key Risks + +| Risk | Likelihood | Impact | Mitigation | +| -------------------------------------------------------------------------------- | ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Low developer adoption: market prefers local libraries over hosted APIs for math | Medium | High | Free tier lowers barrier. 
Emphasize cross-platform consistency and expression evaluation as differentiators that local libraries lack. Monitor signup-to-active-use conversion. | +| Accuracy bugs damage credibility: a wrong calculation result reported publicly | Low | Critical | Comprehensive test suite (500+ cases per function), comparison against reference implementations (Wolfram Alpha, Python mpmath), automated regression testing on every deploy. | +| Expression parser edge cases: unexpected input causes crashes or wrong results | Medium | High | Fuzz testing with randomized expressions, explicit grammar definition, sandbox the parser to prevent injection. | +| Free tier abuse: bots or scrapers consume resources without converting | Medium | Medium | Rate limiting per API key, CAPTCHA on signup, anomaly detection on usage patterns. Adjust free tier limit if needed. | +| Stripe billing integration delays MVP launch | Low | Medium | Begin billing integration in month 2. Use manual invoicing as temporary fallback if needed. | +| Single-region outage takes down the entire service | Low | High | Deploy to two availability zones within US-East-1. Multi-region is Phase 2 but AZ redundancy provides baseline resilience. | +| Competitor launches similar API during our development | Low | Medium | Speed to market is the primary defense. 6-month MVP timeline. Focus on developer experience as a moat: documentation quality, response time, error messages. 
| + +### External Dependencies + +- **Stripe** - Payment processing and subscription management - Available, well-documented API +- **Auth0 or AWS Cognito** - Developer authentication - Available, evaluate during month 1 +- **Domain registrar** - calcengine.io domain (or similar) - Must secure before documentation site goes live +- **Cloud provider (AWS)** - Compute, API Gateway, database - Available, no approval needed +- **SSL certificate provider** - TLS for API and documentation site - Available via AWS Certificate Manager + +### Open Questions + +- [ ] Should the expression evaluator support variable assignment (e.g., "x = 5; 2*x + 3") or only single-expression evaluation in the MVP? +- [ ] What is the maximum expression length the parser should accept? 1KB? 10KB? Need to balance flexibility against abuse potential. +- [ ] Should the API return results as strings (preserving precision representation) or as JSON numbers (risking floating-point serialization issues)? +- [ ] Do we need to support implicit multiplication (e.g., "2pi" meaning "2 * pi") or require explicit operators? +- [ ] Should the free tier require a credit card on file to reduce abuse, or is email-only signup better for adoption? +- [ ] What is the cancellation and refund policy for paid subscriptions? +- [ ] Should we publish accuracy benchmarks comparing CalcEngine results against Wolfram Alpha and Python mpmath on the documentation site? diff --git a/docs/writing-inputs/inputs-quickstart.md b/docs/writing-inputs/inputs-quickstart.md new file mode 100644 index 00000000..6b1f1b80 --- /dev/null +++ b/docs/writing-inputs/inputs-quickstart.md @@ -0,0 +1,165 @@ +# AI-DLC Quick Start + +AI-DLC (AI-Driven Development Life Cycle) is a structured workflow that guides an AI assistant through planning, designing, and building software. Before starting a project, you provide two documents that tell the AI **what to build** and **what tools to use**. + +--- + +## What You Need to Provide + +### 1. 
Vision Document -- what to build and why + +| Section | What to Write | How Long | +| ----------------------------- | -------------------------------------------------------------------------- | --------------------------------------------------- | +| **Executive Summary** | One paragraph: what is it, who is it for, why does it matter | 3-5 sentences | +| **Problem Statement** | The specific business problem this solves | 1-2 paragraphs | +| **Target Users** | Who will use it, what each user type needs | A table with one row per user type | +| **Success Metrics** | How you measure whether this project succeeded | A table with measurable targets | +| **Full Scope Vision** | Everything the product could become at maturity, organized by feature area | As many feature areas as needed | +| **MVP Scope -- Features IN** | Every feature included in the first release, with rationale | A table. If it is not listed, it is not in the MVP. | +| **MVP Scope -- Features OUT** | Features deliberately excluded from MVP, with reason and target phase | A table. This prevents scope creep. | +| **Risks and Open Questions** | What could go wrong, what is still undecided | Tables and bullet lists | + +**Key principle**: Separate the full vision from the MVP. The full vision is aspirational. The MVP is the smallest thing that delivers value. + +Full guide: [vision-document-guide.md](vision-document-guide.md) +Worked example: [example-vision-scientific-calculator-api.md](example-vision-scientific-calculator-api.md) + +--- + +### 2. 
Technical Environment Document -- what tools to use + +| Section | What to Write | How Long | +| ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------- | +| **Languages** | Required, permitted, and prohibited languages with versions | A table per category | +| **Frameworks and Libraries** | Required, preferred, and prohibited with rationale and alternatives | A table per category | +| **Cloud Services** | Allow list and disallow list of cloud services with constraints | A table per list | +| **Architecture and Patterns** | API style, data patterns, messaging, project structure | Short sections with tables | +| **Security** | Auth method, encryption, input validation, secrets management, and a chosen security compliance framework with controls documented per category | Several subsections | +| **Testing** | Test types, coverage targets, tooling, CI/CD gates | Tables | +| **Example Code** | Template code showing canonical patterns for endpoints, functions, tests, and infrastructure | Working code files in an `examples/` directory | + +**Key principle**: Be explicit about what is allowed and what is not. Allow lists and disallow lists prevent the AI from making assumptions. + +Full guide: [technical-environment-guide.md](technical-environment-guide.md) +Worked example: [example-tech-env-scientific-calculator-api.md](example-tech-env-scientific-calculator-api.md) + +--- + +## Minimum Viable Input + +If you want to start fast and fill in details later, provide at least this: + +### Vision (minimum) + +```text +1. One paragraph saying what you are building and for whom +2. A list of MVP features (what is IN scope) +3. A list of what is NOT in the MVP +4. Open questions -- things you already know are uncertain or unresolved +``` + +Open questions are optional but valuable. 
They feed directly into Requirements Analysis as pre-declared ambiguities, so AI-DLC addresses them early rather than surfacing them as surprises mid-design. + +See [example-minimal-vision-scientific-calculator-api.md](example-minimal-vision-scientific-calculator-api.md) for a worked example. + +### Technical Environment (minimum) + +```text +1. Language and version +2. Package manager +3. Web framework (if applicable) +4. Cloud provider and deployment model (or "local only") +5. Test framework +6. Prohibited libraries and services -- use a table: prohibited | reason | use instead +7. Security basics (auth method, input validation approach, secrets management) +8. Example code patterns -- one short example each for a typical endpoint, function, and test +``` + +**On item 6**: including the reason and the recommended alternative is important. Without them, AI-DLC may honor the prohibition but not understand the intent well enough to make good substitution decisions. + +**On item 8**: even one or two short examples give AI-DLC a concrete pattern to follow during code generation rather than inventing its own. This is the single highest-leverage addition beyond the basics. + +See [example-minimal-tech-env-scientific-calculator-api.md](example-minimal-tech-env-scientific-calculator-api.md) for a worked example of both. + +Everything else can be answered through AI-DLC's clarifying questions during the Inception phase. The more you provide up front, the fewer questions the AI will need to ask. + +--- + +## Brownfield Projects + +If you are adding to or modifying an existing codebase, your inputs need to answer a different set of questions. The full guides cover brownfield in detail, but the minimum is: + +### Vision (brownfield minimum) + +```text +1. Current state -- one paragraph describing what the system does today +2. What we are adding or changing -- a clear description of the change +3. Features IN scope for this iteration +4. 
Features OUT of scope for this iteration +5. What must NOT change -- existing components, APIs, or data the new work must not touch +6. Open questions +``` + +The "what must not change" section is critical. AI-DLC will run a Reverse Engineering stage to analyze your existing codebase, but being explicit about boundaries prevents it from proposing changes that would destabilize working parts of the system. + +See [example-minimal-vision-brownfield.md](example-minimal-vision-brownfield.md) for a worked example. + +### Technical Environment (brownfield minimum) + +```text +1. Existing stack -- language, framework, database, infra -- with versions +2. What to add (new services, tables, components) +3. What must stay unchanged -- services, schemas, contracts, configs not to touch +4. Prohibited patterns -- libraries or approaches that conflict with the existing codebase +5. Security basics -- how auth and secrets work in the existing system +6. Example code patterns from the existing codebase +``` + +The example code patterns are especially important for brownfield. AI-DLC should generate code that looks like it belongs in the existing codebase, not code that introduces new conventions alongside old ones. Pull your examples from actual existing files. + +See [example-minimal-tech-env-brownfield.md](example-minimal-tech-env-brownfield.md) for a worked example. + +--- + +## What Happens After You Provide These Documents + +AI-DLC runs through two main phases: + +**Inception** -- understand and plan + +1. Detects your workspace (new project or existing code) +2. Analyzes requirements (asks clarifying questions if anything is unclear) +3. Creates user stories (if the project warrants them) +4. Builds an execution plan (which stages to run, which to skip) +5. Designs components and units of work (if complexity warrants it) + +**Construction** -- design and build (per unit of work) + +1. Functional design (business logic, domain models) +2. 
NFR requirements and design (performance, security, scalability) +3. Infrastructure design (maps to actual cloud services) +4. Code generation (writes the code, tests, and deployment artifacts) +5. Build and test (build instructions, test execution, verification) + +Every stage requires your approval before proceeding. You can request changes, add skipped stages, or redirect at any gate. + +--- + +## File Overview + +```text +docs/writing-inputs/ + inputs-quickstart.md <-- You are here + vision-document-guide.md <-- How to write a vision document + technical-environment-guide.md <-- How to write a tech environment document + + -- Greenfield examples (new project from scratch) -- + example-vision-scientific-calculator-api.md <-- Full example: CalcEngine vision + example-tech-env-scientific-calculator-api.md <-- Full example: CalcEngine tech env + example-minimal-vision-scientific-calculator-api.md <-- Minimal example: CalcEngine vision + example-minimal-tech-env-scientific-calculator-api.md <-- Minimal example: CalcEngine tech env + + -- Brownfield examples (adding to an existing system) -- + example-minimal-vision-brownfield.md <-- Minimal example: returns module on existing platform + example-minimal-tech-env-brownfield.md <-- Minimal example: returns module on existing platform +``` diff --git a/docs/writing-inputs/technical-environment-guide.md b/docs/writing-inputs/technical-environment-guide.md new file mode 100644 index 00000000..22342dad --- /dev/null +++ b/docs/writing-inputs/technical-environment-guide.md @@ -0,0 +1,549 @@ +# Technical Environment Document Guide + +## Purpose + +A Technical Environment Document defines the **technical tooling, standards, constraints, and preferences** that govern how a project is built. It is the technical counterpart to the Vision Document and serves as a binding reference during the Construction Phase of AI-DLC. 
+ +This document ensures that code generation, infrastructure design, and NFR decisions align with organizational standards, security policies, and team capabilities. Without it, AI-DLC stages will ask extensive clarifying questions to fill in these gaps, or worse, make assumptions that require rework. + +## When to Write a Technical Environment Document + +- Before starting any new project (greenfield) +- Before modifying an existing project where technical constraints have changed (brownfield) +- When organizational technology standards have been updated +- When migrating between cloud providers, frameworks, or deployment models + +## Document Applicability + +A Technical Environment Document can target one of two project contexts: + +| Context | Definition | Key Differences | +| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------- | +| **Greenfield** | No existing code. Building from scratch. | All choices are open. Document defines the starting point. | +| **Brownfield** | Existing codebase. Adding, modifying, or migrating. | Choices are constrained by what exists. Document defines what to keep, change, or avoid. | + +Structure your document for the applicable context. Sections below are marked with **(Greenfield)**, **(Brownfield)**, or **(Both)** to indicate where they apply. + +--- + +## Document Structure + +### 1. Project Technical Summary (Both) + +```markdown +## Project Technical Summary + +- **Project Name**: [Name] +- **Project Type**: [Greenfield / Brownfield] +- **Primary Runtime Environment**: [Cloud / On-Premises / Hybrid] +- **Cloud Provider**: [AWS / Azure / GCP / Multi-cloud / None] +- **Target Deployment Model**: [Serverless / Containers / VMs / Hybrid] +- **Team Size**: [Number of developers] +- **Team Experience**: [Key skills and experience levels relevant to tech choices] +``` + +--- + +### 2. 
Programming Languages (Both) + +Define the languages the project must use, may use, and must not use. + +```markdown +## Programming Languages + +### Required Languages +[Languages that must be used for specific purposes.] + +| Language | Version | Purpose | Rationale | +|----------|---------|---------|-----------| +| TypeScript | 5.x | Backend services, CDK infrastructure | Team expertise, type safety | +| Python | 3.12+ | Data processing, Lambda functions | ML library ecosystem | + +### Permitted Languages +[Languages that may be used if justified, but are not required.] + +| Language | Conditions for Use | +|----------|-------------------| +| Go | Approved for high-throughput microservices where latency is critical | +| Rust | Approved for systems-level components only with tech lead approval | + +### Prohibited Languages +[Languages that must not be used, with reasoning.] + +| Language | Reason | +|----------|--------| +| PHP | No team expertise, not aligned with platform direction | +| Ruby | Organizational standard prohibits new Ruby services | +``` + +**Brownfield addition:** + +```markdown +### Existing Language Inventory +[Languages currently in the codebase that must be maintained or migrated.] + +| Language | Current Usage | Direction | +|----------|--------------|-----------| +| Java 11 | Core backend services | Maintain (upgrade to Java 21 in Phase 2) | +| JavaScript | Legacy frontend | Migrate to TypeScript | +``` + +--- + +### 3. Frameworks and Libraries (Both) + +```markdown +## Frameworks and Libraries + +### Required Frameworks +[Frameworks that must be used for their respective domains.] 
+ +| Framework/Library | Version | Domain | Rationale | +|-------------------|---------|--------|-----------| +| React | 18.x | Frontend UI | Organizational standard | +| Express | 4.x | API layer | Lightweight, team familiarity | +| AWS CDK | 2.x | Infrastructure as Code | AWS deployment target | +| Jest | 29.x | Unit testing | Consistent test runner across projects | + +### Preferred Libraries +[Libraries that should be used when their capability is needed, but are not +mandatory if the capability is not required.] + +| Library | Purpose | Use When | +|---------|---------|----------| +| Zod | Runtime type validation | Any external data ingestion or API input | +| Pino | Structured logging | All services that emit logs | +| Axios | HTTP client | Outbound HTTP calls from services | + +### Prohibited Libraries +[Libraries that must not be used. Include the preferred alternative.] + +| Library | Reason | Alternative | +|---------|--------|-------------| +| Moment.js | Deprecated, large bundle size | date-fns or Luxon | +| Lodash (full) | Bundle size | Native JS or lodash-es for specific imports | +| Request | Deprecated | Axios or native fetch | + +### Library Approval Process +[How does a developer get approval to use a library not on the required +or preferred lists?] + +- [Describe approval process, e.g., "Submit a tech review request to the + architecture team with justification, license check, and maintenance + status assessment."] +``` + +--- + +### 4. Cloud Environment and Services (Both) + +```markdown +## Cloud Environment + +### Cloud Provider +- **Primary Provider**: [AWS / Azure / GCP] +- **Account Structure**: [Single account / Multi-account / Organization] +- **Regions**: [Primary region(s) and disaster recovery region(s)] + +### Service Allow List +[Services that are approved for use. Only services on this list may be used +without additional approval.] 
+ +| Service | Approved Use Cases | Constraints | +|---------|-------------------|-------------| +| AWS Lambda | Event-driven compute, API handlers | Max 15 min timeout, 10GB memory | +| Amazon DynamoDB | Key-value and document storage | On-demand capacity for dev, provisioned for prod | +| Amazon S3 | Object storage, static assets | Must enable versioning and encryption | +| Amazon SQS | Asynchronous message queuing | Standard queues preferred; FIFO only when ordering required | +| Amazon CloudWatch | Monitoring, logging, alarms | All services must emit structured logs | +| AWS Secrets Manager | Secrets storage | All credentials and API keys | +| Amazon API Gateway | REST and HTTP API exposure | HTTP APIs preferred over REST for new services | +| Amazon ECR | Container image registry | Required for all container-based services | +| AWS ECS Fargate | Container compute | Preferred over EC2-based ECS | +| Amazon RDS PostgreSQL | Relational data storage | Aurora Serverless v2 for variable workloads | + +### Service Disallow List +[Services that must not be used, with reasoning and approved alternatives.] + +| Service | Reason | Alternative | +|---------|--------|-------------| +| Amazon EC2 (direct) | Prefer managed/serverless compute | Lambda or ECS Fargate | +| Amazon ElastiCache | Cost and operational overhead for current scale | DynamoDB DAX or application-level caching | +| AWS Elastic Beanstalk | Does not fit IaC workflow | CDK with ECS or Lambda | +| Amazon Kinesis | Complexity exceeds current needs | SQS or EventBridge | + +### Service Approval Process +[How does a developer get approval to use a service not on the allow list?] + +- [Describe process, e.g., "Submit a Cloud Service Request with business + justification, cost estimate, security review, and operational plan. + Requires architecture team approval."] +``` + +--- + +### 5. 
Preferred Technologies and Patterns (Both) + +```markdown +## Preferred Technologies and Patterns + +### Architecture Patterns +| Pattern | When to Use | When Not to Use | +|---------|-------------|-----------------| +| Serverless-first | Default for all new services | Workloads requiring persistent connections or >15 min processing | +| Event-driven | Asynchronous workflows, decoupled services | Simple CRUD with no downstream effects | +| Microservices | Independently deployable domains | Small projects with single-team ownership | +| Monolith (modular) | Single-team projects, early-stage MVPs | Multi-team or independently scalable domains | + +### API Design Standards +- **Style**: [REST / GraphQL / gRPC] - [When to use each] +- **Versioning**: [URL path versioning (v1/v2) / Header-based] +- **Documentation**: [OpenAPI 3.x spec required for all REST APIs] +- **Naming Convention**: [kebab-case for URLs, camelCase for JSON fields] +- **Pagination**: [Cursor-based preferred, offset-based acceptable for admin APIs] +- **Error Format**: [Standard error response structure] + +### Data Patterns +- **Primary Data Store**: [DynamoDB for service-owned data] +- **Relational Data**: [RDS PostgreSQL when relational queries are required] +- **Caching Strategy**: [Describe caching approach] +- **Data Ownership**: [Each service owns its data; no shared databases] + +### Messaging and Events +- **Synchronous**: [HTTP/REST between services for request-response] +- **Asynchronous**: [SQS for task queuing, EventBridge for event distribution] +- **Event Schema**: [Describe event schema standards, e.g., CloudEvents format] + +### Frontend Patterns (if applicable) +- **Component Library**: [e.g., Internal design system, Material UI, Shadcn] +- **State Management**: [e.g., React Context for local, Zustand for global] +- **Routing**: [e.g., React Router v6] +- **Build Tool**: [e.g., Vite] +``` + +--- + +### 6. 
Security Requirements (Both) + +```markdown +## Security Requirements + +### Authentication and Authorization +- **Authentication Method**: [e.g., Amazon Cognito, OIDC, SAML] +- **Authorization Model**: [e.g., RBAC, ABAC, custom policy engine] +- **Token Format**: [e.g., JWT with RS256 signing] +- **Session Management**: [e.g., Token expiry, refresh token rotation] + +### Data Protection +- **Encryption at Rest**: [Required for all data stores. Specify KMS key management.] +- **Encryption in Transit**: [TLS 1.2+ required for all communications] +- **PII Handling**: [Identify PII fields, masking requirements, retention policies] +- **Data Classification**: [Public / Internal / Confidential / Restricted] + +### Network Security +- **VPC Requirements**: [Services that must run in VPC] +- **Security Groups**: [Least-privilege rules, no 0.0.0.0/0 ingress] +- **WAF**: [Required for all public-facing endpoints] +- **Private Endpoints**: [Use VPC endpoints for AWS service access where available] + +### Secrets Management +- **Secrets Storage**: [AWS Secrets Manager / Parameter Store] +- **Rotation Policy**: [Automatic rotation every N days] +- **Access Policy**: [Least-privilege IAM policies per service] +- **Prohibited Practices**: + - No secrets in source code, environment variables at build time, or config files + - No shared credentials across services + - No long-lived access keys + +### Compliance Requirements +- **Standards**: [SOC 2, HIPAA, PCI-DSS, GDPR, FedRAMP, or "None specific"] +- **Audit Logging**: [All API calls logged, CloudTrail enabled, log retention period] +- **Vulnerability Scanning**: [Container image scanning, dependency scanning tools] + +### Dependency Security +- **Dependency Scanning**: [Tool and frequency, e.g., Dependabot weekly, Snyk on PR] +- **License Policy**: [Allowed licenses: MIT, Apache 2.0, BSD. 
Prohibited: GPL, AGPL] +- **Update Policy**: [Critical vulnerabilities patched within N days] + +### Security Compliance Framework + +Every project must adopt a security risk framework and document how the +project addresses each risk category in that framework. The choice of +framework depends on the project's domain, regulatory environment, and +organizational standards. + +**Select one or more frameworks and document compliance per category:** + +- **Framework chosen**: [Name and version, e.g., OWASP Top 10 (2021), + NIST 800-53, CIS Controls v8, AWS Well-Architected Security Pillar, + SANS Top 25, or an internal organizational framework] +- **Rationale**: [Why this framework was selected. Reference regulatory + requirements, customer contracts, or organizational policy if applicable.] + +**Common frameworks by context:** + +| Context | Common Framework Choices | +|---------|------------------------| +| Web applications and APIs | OWASP Top 10, OWASP API Security Top 10 | +| Cloud-native infrastructure | AWS/Azure/GCP Well-Architected Security Pillar, CIS Benchmarks | +| Government / regulated | NIST 800-53, FedRAMP, ISO 27001 | +| General software | CIS Controls v8, SANS Top 25 | +| Internal / low-risk | Organizational security checklist (document it here) | + +**For each risk category in the chosen framework, document:** + +1. **How the project addresses it** - Specific controls, patterns, and + tooling that mitigate the risk +2. **Not Applicable justifications** - If a category does not apply, + state why explicitly. Do not leave categories blank. +3. **Deferred items** - If a control is planned for a later phase, + document the current gap and the target phase for remediation + +**Where to put the detailed compliance matrix:** + +For small frameworks (10 or fewer categories), include the full matrix +in this document under this heading. 
+ +For large frameworks (NIST 800-53, ISO 27001), create a separate file +and reference it here: +- `security/[framework-name]-compliance.md` + +See the CalcEngine example for a complete worked example using +OWASP Top 10 (2021) as the chosen framework. +``` + +--- + +### 7. Testing Requirements (Both) + +```markdown +## Testing Requirements + +### Test Strategy Overview +| Test Type | Required | Coverage Target | Tooling | +|-----------|----------|----------------|---------| +| Unit Tests | Yes | 80% line coverage minimum | Jest / pytest | +| Integration Tests | Yes | All service-to-service interactions | Jest + Testcontainers / pytest | +| End-to-End Tests | Conditional | Critical user journeys | Playwright / Cypress | +| Contract Tests | Conditional | All inter-service APIs | Pact | +| Performance Tests | Conditional | When SLA targets defined | k6 / Artillery | +| Security Tests | Yes | All public endpoints | OWASP ZAP / Snyk | + +### Unit Testing Standards +- **Coverage Minimum**: [80% line coverage, 70% branch coverage] +- **Mocking Policy**: [Mock external dependencies, do not mock internal business logic] +- **Naming Convention**: [describe/it pattern, e.g., "describe('OrderService') > it('should calculate total with tax')"] +- **Test Location**: [Co-located with source (e.g., `__tests__/`) or separate tree (e.g., `tests/unit/`)] + +### Integration Testing Standards +- **Scope**: [Test actual service interactions, database queries, and API contracts] +- **Environment**: [Local containers via Docker Compose / Testcontainers] +- **Data Management**: [Test fixtures, database seeding and cleanup approach] + +### End-to-End Testing Standards +- **Scope**: [Critical user journeys only, not comprehensive UI testing] +- **Environment**: [Deployed staging environment] +- **Data-testid Requirements**: [All interactive elements must have stable data-testid attributes] + +### Performance Testing Standards +- **Baseline Requirements**: [Define SLA targets: response 
time, throughput, error rate] +- **Test Scenarios**: [Load test, stress test, soak test] +- **Tooling**: [k6 / Artillery / JMeter] + +### CI/CD Testing Gates +[Define which tests must pass at each stage of the pipeline.] + +| Pipeline Stage | Required Tests | Failure Action | +|---------------|---------------|----------------| +| Pre-commit | Linting, type checking | Block commit | +| Pull Request | Unit tests, integration tests | Block merge | +| Pre-deploy (staging) | E2E tests, contract tests | Block deploy | +| Post-deploy (production) | Smoke tests, health checks | Auto-rollback | +``` + +--- + +### 8. Example and Template Code Guidance (Both) + +This section tells AI-DLC and the development team how to provide, use, and maintain example or template code that establishes project conventions. + +````markdown +## Example and Template Code Guidance + +### Purpose of Example Code +Example code establishes the **canonical patterns** for the project. When AI-DLC +generates code, it should follow these patterns rather than inventing new ones. +When developers write code, they reference these examples for consistency. 
+ +### When to Provide Example Code +Provide example or template code for any of the following: + +- **Project structure setup** - Directory layout, file naming, module organization +- **API endpoint pattern** - How a standard endpoint is structured from route to response +- **Database access pattern** - How queries, transactions, and connections are handled +- **Error handling pattern** - Standard error types, error response format, logging +- **Authentication/authorization integration** - How auth is applied to endpoints +- **Testing pattern** - How a standard unit test and integration test are structured +- **Logging pattern** - Structured log format, what to log at each level +- **Configuration pattern** - How environment-specific configuration is loaded +- **Infrastructure as Code pattern** - How a standard CDK construct or Terraform module looks + +### How to Structure Example Code + +#### Location +Store example code in a dedicated directory that AI-DLC and developers can reference: + +``` +project-root/ + examples/ # Or "templates/" if preferred + api-endpoint/ + handler.ts # Example API handler + handler.test.ts # Corresponding test + README.md # Explains the pattern and when to use it + database-access/ + repository.ts # Example repository pattern + repository.test.ts + README.md + infrastructure/ + standard-lambda-stack.ts # Example CDK stack + README.md +``` + +#### Structure of Each Example +Every example should include: + +1. **Working code** - Not pseudocode. It must compile/run. +2. **Corresponding test** - Shows how to test the pattern. +3. **README.md** - Explains: + - What pattern this demonstrates + - When to use it + - When NOT to use it + - What to customize vs what to keep as-is + - References to relevant standards from this Technical Environment Document + +#### Example README Template + +``` +# [Pattern Name] Example + +## What This Demonstrates +[One paragraph describing the pattern.] 
+ +## When to Use +- [Condition 1] +- [Condition 2] + +## When Not to Use +- [Condition 1 - with alternative reference] + +## File Inventory +| File | Purpose | +| --------------- | ---------------------- | +| handler.ts | Example implementation | +| handler.test.ts | Test pattern | + +## Customization Guide +| Element | Customize? | Notes | +| ------------------------ | ----------- | -------------------------------- | +| Error handling structure | No | Must follow project standard | +| Business logic | Yes | Replace with actual domain logic | +| Route path | Yes | Follow API naming conventions | +| Logging calls | No | Keep structured logging format | + +## Related Standards +- [Link to API Design Standards section] +- [Link to Error Handling pattern] +``` + +### How AI-DLC Uses Example Code + +During Code Generation, AI-DLC should: + +1. **Read examples first** - Before generating any code, read relevant examples + from the examples/ directory +2. **Follow established patterns** - Match the structure, naming, error handling, + and testing patterns shown in examples +3. **Do not invent alternatives** - If an example exists for a pattern, use it. + Do not create a different approach unless the example explicitly does not apply. +4. **Reference examples in plans** - Code Generation Plans should reference which + examples apply to each step + +### Maintaining Example Code + +- **Update examples when standards change** - Examples must stay current with this + Technical Environment Document +- **Review examples during onboarding** - New team members should read all examples + before contributing code +- **Version examples with the project** - Examples live in the same repository and + go through the same review process as production code +- **Mark deprecated examples** - If a pattern is superseded, rename the directory + with a "deprecated-" prefix and add a note pointing to the replacement +```` + +--- + +### 9. 
Brownfield-Specific Sections + +Include these sections only for brownfield projects. + +```markdown +## Brownfield: Existing Technical Inventory + +### Current State Assessment +[Reference the Reverse Engineering artifacts if available, or provide +a summary of the current technical state.] + +- **Current Languages**: [List with versions] +- **Current Frameworks**: [List with versions] +- **Current Infrastructure**: [Cloud services, deployment model] +- **Current Test Coverage**: [Percentage or qualitative assessment] +- **Known Technical Debt**: [Key items] + +### Migration and Modernization Rules + +#### What to Keep +[Technologies and patterns that should remain unchanged.] + +| Technology | Reason to Keep | +|-----------|---------------| +| [Tech] | [Rationale] | + +#### What to Migrate +[Technologies that should be replaced, with target and timeline.] + +| Current | Target | Priority | Approach | +|---------|--------|----------|----------| +| JavaScript | TypeScript | High | Incremental file-by-file migration | +| REST API v1 | REST API v2 | Medium | New endpoints use v2, migrate existing in Phase 2 | + +#### What to Remove +[Technologies, patterns, or dependencies that should be eliminated.] + +| Item | Reason | Removal Timeline | +|------|--------|-----------------| +| [Deprecated library] | [Security/maintenance concern] | [When] | + +### Coexistence Rules +[When old and new patterns must coexist, define the rules.] 
+ +- **API versioning during migration**: [How v1 and v2 coexist] +- **Database schema migration**: [How schema changes are managed alongside existing data] +- **Feature flags**: [How new functionality is gated during transition] +- **Dependency conflicts**: [How conflicting library versions are managed] +``` + +--- + +## How This Document Feeds Into AI-DLC + +| Technical Environment Section | AI-DLC Stage | How It Is Used | +| ----------------------------------- | -------------------------------------- | -------------------------------------------------- | +| Project Technical Summary | Workspace Detection | Context for project classification | +| Programming Languages | Code Generation | Language selection and version constraints | +| Frameworks and Libraries | Code Generation, NFR Design | Dependency selection and prohibited library checks | +| Cloud Services Allow/Disallow Lists | Infrastructure Design | Service selection boundaries | +| Preferred Patterns | Application Design, Functional Design | Architecture and design pattern decisions | +| Security Requirements | NFR Requirements, NFR Design | Security pattern selection and compliance checks | +| Testing Requirements | Code Generation, Build and Test | Test strategy, tooling, and coverage targets | +| Example Code | Code Generation | Pattern reference during code generation | +| Brownfield Inventory | Reverse Engineering, Workflow Planning | Migration decisions and coexistence rules | diff --git a/docs/writing-inputs/vision-document-guide.md b/docs/writing-inputs/vision-document-guide.md new file mode 100644 index 00000000..ad18c0c0 --- /dev/null +++ b/docs/writing-inputs/vision-document-guide.md @@ -0,0 +1,269 @@ +# Vision Document Guide + +## Purpose + +A Vision Document defines the **business goals**, **target outcomes**, and **scope boundaries** for a project before entering the AI-DLC workflow. 
It serves as the primary input to the Inception Phase, giving the AI model and the team a shared understanding of what the project aims to achieve and why it matters. + +A well-written Vision Document reduces ambiguity during Requirements Analysis, improves User Story quality, and prevents scope creep during Construction. + +## When to Write a Vision Document + +- Before starting any new project or major initiative +- When proposing a new product, feature set, or platform +- When pivoting an existing product in a new direction +- When multiple stakeholders need alignment on goals before development begins + +## Document Structure + +### 1. Executive Summary + +A brief paragraph (3-5 sentences) that captures the essence of the project. Anyone reading only this section should understand what the project is, who it serves, and why it exists. + +**Template:** + +```markdown +## Executive Summary + +[Project Name] is a [type of system/product] that enables [target users] to [core capability]. +It addresses [business problem or opportunity] by [approach or differentiation]. +The expected outcome is [measurable business result]. +``` + +**Example:** + +```markdown +## Executive Summary + +OrderFlow is a web-based order management platform that enables mid-size retailers to +track inventory, process customer orders, and manage supplier relationships in a single +interface. It addresses the fragmented tooling problem that causes fulfillment delays +and inventory mismatches. The expected outcome is a 30% reduction in order processing +time and elimination of manual inventory reconciliation. +``` + +--- + +### 2. Business Context + +Describe the business environment, the problem being solved, and why solving it matters now. + +**Sections to include:** + +```markdown +## Business Context + +### Problem Statement +[What specific business problem or pain point does this project address? +Be concrete. 
Avoid vague statements like "improve efficiency."] + +### Business Drivers +[Why is this project being pursued now? What market conditions, competitive +pressures, regulatory changes, or internal needs make this timely?] + +### Target Users and Stakeholders +[Who will use the system? Who has a stake in its success? +List user types with a brief description of each.] + +| User Type | Description | Primary Need | +|-----------|-------------|--------------| +| [Role] | [Who they are] | [What they need from this system] | + +### Business Constraints +[Budget limits, regulatory requirements, organizational policies, timeline +pressures, or other non-negotiable boundaries.] + +### Success Metrics +[How will the business measure whether this project succeeded? +Use specific, measurable criteria.] + +| Metric | Current State | Target State | Measurement Method | +|--------|--------------|--------------|-------------------| +| [Metric name] | [Baseline] | [Goal] | [How measured] | +``` + +--- + +### 3. Full Scope Vision + +This section describes the **complete long-term vision** for the product or system. It is deliberately aspirational and covers everything the project could become, not just what will be built first. + +**Sections to include:** + +```markdown +## Full Scope Vision + +### Product Vision Statement +[A single sentence or short paragraph that captures the long-term aspirational +state of the product. What does the world look like when this product is fully +realized?] + +### Feature Areas +[Organize the full feature set into logical groups. For each area, describe +what the system will do at full maturity.] 
+ +#### Feature Area 1: [Name] +- **Description**: [What this area covers] +- **Key Capabilities**: + - [Capability 1] + - [Capability 2] + - [Capability 3] +- **User Value**: [Why this matters to users] + +#### Feature Area 2: [Name] +[Same structure] + +### Integration Points +[What external systems, APIs, or data sources will the full system integrate +with at maturity?] + +- [System/Service] - [Purpose of integration] + +### User Journeys (Full Vision) +[Describe 2-3 end-to-end user journeys that represent the complete product +experience. These should reflect the full scope, not the MVP.] + +#### Journey 1: [Name] +1. [Step] +2. [Step] +3. [Step] +**Outcome**: [What the user achieves] + +### Scalability and Growth +[How is the product expected to grow? New markets, user types, geographies, +data volumes, or feature categories?] + +### Long-Term Roadmap (Optional) +[If known, outline the high-level phases or milestones beyond the MVP. +This is directional, not committal.] + +| Phase | Focus | Timeframe (if known) | +|-------|-------|---------------------| +| MVP | [Core scope] | [Target] | +| Phase 2 | [Expansion area] | [Target] | +| Phase 3 | [Further expansion] | [Target] | +``` + +--- + +### 4. MVP Scope + +This section defines the **minimum viable product**: the smallest set of functionality that delivers measurable value and validates the core business hypothesis. Everything listed here must be built before the product can launch or be evaluated. + +**Sections to include:** + +```markdown +## MVP Scope + +### MVP Objective +[What is the single most important thing the MVP must prove or deliver? +Keep this to 1-2 sentences.] + +### MVP Success Criteria +[How will you know the MVP succeeded? These should be testable and specific.] + +- [ ] [Criterion 1] +- [ ] [Criterion 2] +- [ ] [Criterion 3] + +### Features In Scope (MVP) +[List every feature that is included in the MVP. Be explicit. If it is not +listed here, it is not in the MVP.] 
+ +| Feature | Description | Priority | Rationale for Inclusion | +|---------|-------------|----------|------------------------| +| [Feature name] | [Brief description] | Must Have | [Why it cannot be deferred] | + +### Features Explicitly Out of Scope (MVP) +[List features from the Full Scope Vision that are deliberately excluded +from the MVP. State why each is deferred. This prevents scope creep.] + +| Feature | Reason for Deferral | Target Phase | +|---------|-------------------|--------------| +| [Feature name] | [Why it can wait] | [Phase 2/3/TBD] | + +### MVP User Journeys +[Describe the user journeys that the MVP must support. These are subsets +or simplified versions of the Full Vision journeys.] + +#### Journey 1: [Name] +1. [Step] +2. [Step] +3. [Step] +**Outcome**: [What the user achieves] +**Limitation vs Full Vision**: [What is simplified or missing compared to full scope] + +### MVP Constraints and Assumptions +[What assumptions is the MVP built on? What known limitations are accepted?] + +- **Assumption**: [Statement] - **Risk if wrong**: [Consequence] +- **Accepted Limitation**: [What is intentionally limited and why] + +### MVP Definition of Done +[What must be true for the MVP to be considered complete and ready for +evaluation or launch?] + +- [ ] All "Must Have" features implemented and tested +- [ ] [Additional criteria specific to this project] +- [ ] [Deployment or accessibility requirement] +- [ ] [Stakeholder sign-off requirement] +``` + +--- + +### 5. Risks and Dependencies + +```markdown +## Risks and Dependencies + +### Key Risks +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| [Risk description] | High/Medium/Low | High/Medium/Low | [Mitigation strategy] | + +### External Dependencies +[List anything outside the team's control that the project depends on.] + +- [Dependency] - [Owner] - [Status] + +### Open Questions +[List unresolved questions that need answers before or during development. 
+These feed directly into the Requirements Analysis clarifying questions.] + +- [ ] [Question] +- [ ] [Question] +``` + +--- + +## Writing Guidelines + +### Do + +- Be specific and measurable. "Reduce order processing time by 30%" is better than "make things faster." +- Clearly separate full vision from MVP. Mixing them causes scope creep. +- Include "out of scope" lists. They are as valuable as "in scope" lists. +- Write for the team, not for executives. Avoid marketing language. +- State assumptions explicitly so they can be challenged. +- Include success criteria that can actually be tested. + +### Do Not + +- Use vague language: "world-class," "seamless," "intuitive," "best-in-class." +- List technologies or implementation details. That belongs in the Technical Environment Document. +- Skip the MVP section. Every project needs a defined starting boundary. +- Combine features and user journeys. Features describe what the system does; journeys describe how users experience it. +- Assume readers know the business context. Write the Problem Statement even if it seems obvious. 
+ +--- + +## How This Document Feeds Into AI-DLC + +| Vision Document Section | AI-DLC Stage | How It Is Used | +| ------------------------ | -------------------------------- | -------------------------------------------------- | +| Executive Summary | Workspace Detection | Initial context for project classification | +| Business Context | Requirements Analysis | Drives clarifying questions and requirements depth | +| Full Scope Vision | User Stories, Application Design | Informs persona creation, component identification | +| MVP Scope | Workflow Planning | Determines which stages execute, scope boundaries | +| Features In/Out of Scope | Code Generation | Defines what gets built in this iteration | +| Risks and Dependencies | All stages | Informs risk assessment and error handling | +| Open Questions | Requirements Analysis | Become clarifying questions in the question files | diff --git a/scripts/aidlc-evaluator/.gitignore b/scripts/aidlc-evaluator/.gitignore new file mode 100644 index 00000000..62a365cd --- /dev/null +++ b/scripts/aidlc-evaluator/.gitignore @@ -0,0 +1,56 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +config.yaml + +# Virtual environments +venv/ +env/ +ENV/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Project specific +test_results/ +reports/ +*.log +.env +runs +ralph-coded +.venv/ +.ruff_cache/ +.claude/ diff --git a/scripts/aidlc-evaluator/ARCHITECTURE.md b/scripts/aidlc-evaluator/ARCHITECTURE.md new file mode 100644 index 00000000..1628f402 --- /dev/null +++ b/scripts/aidlc-evaluator/ARCHITECTURE.md @@ -0,0 +1,672 @@ +# AIDLC Evaluation Framework — Design Document + +## 1. 
Purpose + +This document describes the architecture, design decisions, data flows, and internal mechanics of the **AI-DLC Workflows Evaluation & Reporting Framework**. It is intended for developers who need to understand how the system works, extend it, or debug it. + +The framework validates changes to the [AI-DLC workflows](https://github.com/awslabs/aidlc-workflows) repository by running an AI-driven software development lifecycle end-to-end, then scoring the outputs across multiple quality dimensions: functional correctness, code quality, API contract conformance, and semantic similarity to a golden baseline. + +--- + +## 2. High-Level Architecture + +```text + ┌──────────────────────┐ + │ Entry Points (CLI) │ + └──────────┬───────────┘ + ┌───────────────────────┼──────────────────────┐ + │ │ │ + run_evaluation.py run_batch_evaluation.py run_ide_evaluation.py + (single model) (multi-model loop) (IDE adapter) + │ │ │ + └───────────┬───────────┘ │ + │ │ + ┌──────────────▼──────────────┐ ┌───────────────▼──────────┐ + │ 6-Stage Pipeline │ │ IDE Harness │ + │ ┌──────────────────────┐ │ │ ┌───────────────────┐ │ + │ │ 1. Execution │ │ │ │ Adapter (Cursor, │ │ + │ │ (Strands Swarm) │ │ │ │ Cline, Kiro, ...) │ │ + │ ├──────────────────────┤ │ │ └────────┬──────────┘ │ + │ │ 2. Post-Run Tests │ │ │ │ │ + │ ├──────────────────────┤ │ │ ┌────────▼──────────┐ │ + │ │ 3. Quantitative │ │ │ │ Output Normalizer │ │ + │ ├──────────────────────┤ │ │ └────────┬──────────┘ │ + │ │ 4. Contract Tests │ │ │ │ │ + │ ├──────────────────────┤ │ └───────────┼──────────────┘ + │ │ 5. Qualitative │ │ │ + │ ├──────────────────────┤ │ ┌──────────▼──────────┐ + │ │ 6. 
Report Generation │ │ │ --evaluate-only │ + │ └──────────────────────┘ │ │ (stages 2-6) │ + └─────────────────────────────┘ └─────────────────────┘ + │ + ┌──────────────▼───────────────┐ + │ runs/<timestamp>/ │ + │ ├── aidlc-docs/ │ + │ ├── workspace/ │ + │ ├── run-meta.yaml │ + │ ├── run-metrics.yaml │ + │ ├── test-results.yaml │ + │ ├── quality-report.yaml │ + │ ├── contract-test-results… │ + │ ├── qualitative-comparison…│ + │ ├── report.md / .html │ + │ └── evaluation-config.yaml │ + └─────────────────────────────┘ +``` + +--- + +## 3. Package Structure + +The project uses a **uv workspace** (defined in the root `pyproject.toml`) with eight internal packages. Each package is independently structured with its own `pyproject.toml`, `src/` layout, and `tests/` directory. + +| Package | PyPI Name | Purpose | +| ------------------------ | --------------------- | --------------------------------------------------- | +| `packages/execution` | `aidlc-runner` | Two-agent swarm that runs the AIDLC workflow | +| `packages/qualitative` | `aidlc-qualitative` | Semantic scoring of documents vs golden baseline | +| `packages/quantitative` | `aidlc-quantitative` | Static analysis: linting, security, duplication | +| `packages/contracttest` | `aidlc-contracttest` | API contract testing against OpenAPI specs | +| `packages/nonfunctional` | `aidlc-nonfunctional` | NFR evaluation (tokens, timing, consistency) | +| `packages/reporting` | `aidlc-reporting` | Consolidated report generation (Markdown + HTML) | +| `packages/ide-harness` | (not published) | IDE adapter framework for third-party AI assistants | +| `packages/shared` | `aidlc-shared` | Common utilities shared across packages | + +**Dependency graph** (simplified): + +```text +run_evaluation.py ──► execution (aidlc-runner) + ──► quantitative + ──► contracttest + ──► qualitative + ──► reporting ──► reporting.collector + ──► reporting.baseline + ──► reporting.render_md + ──► reporting.render_html +``` + +All packages 
communicate through **YAML files on disk**. There are no in-process library-level dependencies between the evaluation packages — the orchestrator (`run_evaluation.py`) invokes each package as a subprocess via `python -m <package>`, passing file paths as arguments. This design keeps packages independently testable and allows each to be run in isolation. + +--- + +## 4. Configuration System + +### 4.1 Layered Config Resolution + +Configuration follows a three-tier precedence model: + +```text +CLI flags > YAML config file > Built-in Python defaults +``` + +1. **Built-in defaults** are defined as dataclass field defaults in `packages/execution/src/aidlc_runner/config.py` (`RunnerConfig` and its nested dataclasses). +2. **YAML config** is loaded from `config/default.yaml` (or a custom path via `--config`). The `_merge_dict_into_dataclass()` function recursively overlays YAML values onto the dataclass tree. +3. **CLI flags** (e.g., `--executor-model`, `--profile`) are applied last, overriding both YAML and defaults. + +### 4.2 Config Dataclass Hierarchy + +```python +RunnerConfig + ├── aws: AwsConfig # profile, region + ├── models: ModelsConfig + │ ├── executor: ModelConfig # provider, model_id + │ └── simulator: ModelConfig + ├── aidlc: AidlcConfig # rules_source, rules_repo, rules_ref + ├── swarm: SwarmConfig # max_handoffs, max_iterations, timeouts + ├── runs: RunsConfig # output_dir + └── execution: ExecutionConfig # enabled, command_timeout, post_run_tests +``` + +### 4.3 Per-Model Config Files + +Files in `config/` (e.g., `config/sonnet-4-5.yaml`, `config/nova-pro.yaml`) override only the `models.executor.model_id` field. The batch runner (`run_batch_evaluation.py`) discovers these automatically by scanning `config/*.yaml` and excluding `default.yaml`. + +--- + +## 5. Stage-by-Stage Pipeline Design + +### 5.1 Stage 1: Execution (`packages/execution`) + +This is the core of the framework. 
It uses the **Strands SDK** multi-agent orchestration to run the full AIDLC workflow. + +#### Two-Agent Swarm Architecture + +```text + ┌──────────────────────┐ + │ Strands Swarm │ + │ │ + initial prompt ──►│ ┌────────────────┐ │ + │ │ Executor │ │ + │ │ Agent │◄─┤── handoff ──┐ + │ │ ├──┤── handoff ──│ + │ └────────────────┘ │ │ + │ │ ┌──────────▼─┐ + │ │ │ Simulator │ + │ │ │ Agent │ + │ │ └────────────┘ + └──────────────────────┘ +``` + +**Executor Agent** — Drives the AIDLC workflow through all phases (Inception → Construction). It: + +- Loads AIDLC rule files on demand via the `load_rule` tool (lazy loading keeps context window usage low) +- Reads/writes files in the run folder via sandboxed `read_file`, `write_file`, `list_files` tools +- Executes shell commands (dependency install, test runs) via the `run_command` tool +- Hands off to the Simulator when human input is needed (questions, approvals, reviews) + +**Simulator Agent** — Acts as a simulated human stakeholder. It: + +- Has the vision document (and optional tech-env document) embedded in its system prompt +- Answers clarifying questions, approves documents, reviews code +- Always hands back to the Executor to continue the workflow + +**Key design decisions:** + +- **Sandboxed file operations**: All file tools use `_resolve_safe()` to prevent path traversal outside the run folder +- **Sandboxed command execution**: `run_command` uses a restricted environment (only PATH, HOME, LANG) to isolate execution +- **Lazy rule loading**: Rules are loaded one-at-a-time as each stage begins, rather than pre-loading all rules into the system prompt +- **Progress streaming**: `AgentProgressHandler` logs tool invocations to stderr without printing full LLM output; `SwarmProgressHook` logs handoff timing +- **Metrics collection**: `MetricsCollector` records token usage, handoff timing, context size samples, and error events during execution + +#### AIDLC Workflow Stages + +The Executor drives this sequence (some 
stages are conditional based on project scope): + +| # | Stage | Phase | Conditional? | +| --- | --------------------- | ------------ | --------------- | +| 1 | Workspace Detection | Inception | Always | +| 2 | Reverse Engineering | Inception | Brownfield only | +| 3 | Requirements Analysis | Inception | Always | +| 4 | User Stories | Inception | If complex | +| 5 | Workflow Planning | Inception | Always | +| 6 | Application Design | Inception | If needed | +| 7 | Units Generation | Inception | If needed | +| 8 | Functional Design | Construction | If needed | +| 9 | NFR Requirements | Construction | If needed | +| 10 | NFR Design | Construction | If needed | +| 11 | Infrastructure Design | Construction | If needed | +| 12 | Code Generation | Construction | Always | +| 13 | Build and Test | Construction | Always | + +Each stage loads its corresponding rule file (e.g., `inception/requirements-analysis.md`) before execution. The Executor writes all documentation artifacts to `aidlc-docs/` and all generated code to `workspace/`. 
+ +#### Rules Setup + +The runner either: + +- **Git clones** the AIDLC rules repository (default: `awslabs/aidlc-workflows`, ref configurable) into the run folder, then extracts the `aidlc-rules/` content +- **Copies** from a local path when `rules_source: "local"` is configured + +#### Run Folder Layout + +```text +runs/<YYYYMMDDTHHMMSS>-<rules_slug>/ + ├── vision.md # Copied input + ├── tech-env.md # Copied input (if provided) + ├── aidlc-rules/ # AIDLC workflow rules + │ ├── aws-aidlc-rules/ # Core workflow definition + │ └── aws-aidlc-rule-details/ # Per-stage rule files + ├── aidlc-docs/ # Generated AIDLC documents + │ ├── inception/ # Requirements, user stories, design docs + │ ├── construction/ # Functional design, code-gen docs + │ ├── aidlc-state.md # Workflow state tracker + │ └── audit.md # Timestamped audit log + ├── workspace/ # Generated application code + └── run-meta.yaml # Run identity and config snapshot +``` + +#### Post-Run Test Evaluation + +After the swarm completes, `post_run.py` performs automatic testing: + +1. **Project detection**: BFS scan of `workspace/` for marker files (`pyproject.toml`, `package.json`, `Cargo.toml`, `go.mod`) up to 3 levels deep +2. **Dependency install**: Runs the appropriate install command (e.g., `uv pip install -e ".[dev]"`) +3. **Test execution**: Runs the appropriate test command (e.g., `uv run pytest`) +4. **Output parsing**: Language-specific parsers extract pass/fail counts from test output (pytest, Jest/Vitest, cargo test, go test) +5. **Results**: Written to `test-results.yaml` + +### 5.2 Stage 2: Post-Run Tests (Summary) + +This stage reads `test-results.yaml` written by Stage 1 and prints a human-readable summary. It is embedded in the execution stage — the orchestrator reads the file for its summary display. + +### 5.3 Stage 3: Quantitative Analysis (`packages/quantitative`) + +Runs static analysis tools against the generated code in `workspace/`. The analysis is language-aware. 
+ +#### Tool Selection by Project Type + +| Project Type | Linter | Security Scanner | Duplication | +| ------------- | -------- | ------------------- | ------------- | +| Python | ruff | bandit + semgrep | PMD CPD | +| Node.js | eslint | npm audit + semgrep | PMD CPD | + +#### Analysis Flow + +```text +scan_workspace(path) + ├── detect project type (pyproject.toml → Python, package.json → Node) + ├── run_ruff() or run_eslint() → LintFinding[] + ├── run_bandit() or run_npm_audit() → SecurityFinding[] + ├── run_semgrep() → SecurityFinding[] + ├── run_cpd() → DuplicationFinding[] + └── compute_summary() → QualityReport +``` + +Each tool runner: + +1. Checks if the tool is available (`shutil.which` or `uv run --version`) +2. Executes with JSON output format +3. Parses structured output into standardized finding models +4. Returns a `ToolResult` with findings and metadata + +**Graceful degradation**: If any tool is not installed, the analysis for that tool is skipped with a note — it never fails the evaluation. + +Output: `quality-report.yaml` + +### 5.4 Stage 4: Contract Tests (`packages/contracttest`) + +Validates the generated application's API endpoints against an OpenAPI 3.x specification. 
+ +#### Architecture + +```text +openapi.yaml ──► spec.py (parser) ──► ContractSpec + ├── AppConfig (module, port, framework) + └── TestCase[] (from x-test-cases extensions) + +workspace/ ──► server.py (ServerProcess) ──► uvicorn subprocess + │ + ▼ +ContractSpec ──► runner.py ──► HTTP requests ──► CaseResult[] + │ + ▼ + ContractTestResults +``` + +**Key mechanics:** + +- **Spec parsing**: The OpenAPI spec uses custom `x-app` (server configuration) and `x-test-cases` (per-operation test inputs/expected outputs) extensions +- **Server management**: `ServerProcess` creates an isolated venv for the workspace project, starts uvicorn, polls `/health` until ready, and cleanly shuts down after tests +- **Test execution**: Each test case sends an HTTP request and validates: status code matches, response body contains expected keys/values (recursive deep match with floating-point tolerance) +- **Abort conditions**: Testing stops early if the server process dies or after 3 consecutive connection errors + +Output: `contract-test-results.yaml` + +### 5.5 Stage 5: Qualitative Evaluation (`packages/qualitative`) + +Compares the generated AIDLC documents against a golden baseline using semantic similarity scoring. + +#### Document Matching + +```text +golden aidlc-docs/ candidate aidlc-docs/ + inception/ inception/ + requirements.md ◄──► requirements.md (paired) + user-stories.md ◄──► user-stories.md (paired) + construction/ construction/ + code-generation.md ◄──► code-generation.md (paired) + extra-doc.md (unmatched candidate) +``` + +Documents are paired by relative path. Internal workflow files (`aidlc-state.md`, `audit.md`) are excluded. 
+ +#### Scoring Dimensions + +Each document pair is scored on three dimensions (0.0 to 1.0): + +| Dimension | Weight | What It Measures | +| ----------------- | -------- | --------------------------------------- | +| Intent Similarity | 0.4 | Same goals, requirements, and purpose | +| Design Similarity | 0.4 | Same architecture, components, patterns | +| Completeness | 0.2 | Candidate covers all reference topics | + +**Overall per-document** = 0.4 × intent + 0.4 × design + 0.2 × completeness + +Scores are aggregated per-phase (inception, construction) then into an overall score. + +#### Two Scorer Implementations + +**HeuristicScorer** (offline, deterministic): + +- Intent: Term-frequency cosine similarity with stopword removal +- Design: Weighted blend of technical identifier Jaccard similarity (0.6) and heading structure Jaccard similarity (0.4) +- Completeness: Fraction of reference headings present in candidate + +**LlmScorer** (default, requires Bedrock): + +- Sends both documents to an LLM via the Bedrock `converse` API +- Prompt asks for JSON with the three dimension scores plus notes +- Uses temperature 0.0 for reproducibility +- Content truncated to 15K characters per document + +Output: `qualitative-comparison.yaml` + +### 5.6 Stage 6: Report Generation (`packages/reporting`) + +Generates consolidated reports by collecting all YAML artifacts from the run folder. 
+ +#### Data Collection + +`reporting.collector.collect(run_folder)` reads all YAML files and assembles a `ReportData` dataclass containing: + +- `RunMeta` — identity, timing, models, rules +- `RunMetrics` — tokens (total + per-agent), wall clock, handoff timeline, artifact counts, error counts, context size stats +- `TestResults` — unit test pass/fail/total with pass percentage +- `QualityReport` — lint, security, duplication findings +- `ContractResults` — per-endpoint test results +- `QualitativeResults` — per-document and per-phase semantic scores + +#### Baseline Comparison + +If a `golden.yaml` baseline file exists (auto-discovered next to the `--golden` directory), the report includes a regression comparison: + +1. `extract_baseline()` flattens `ReportData` into a `BaselineMetrics` with ~30 numeric fields +2. `compare()` computes deltas and classifies each metric as improved/regressed/unchanged +3. Classification respects directionality (e.g., fewer lint errors = improved, higher test pass% = improved) + +#### Output Formats + +- **Markdown**: `render_markdown()` produces GitHub-flavored Markdown with verdict banners, tables, delta indicators, and collapsible detail sections +- **HTML**: `render_html()` wraps the Markdown with CSS styling for standalone viewing + +--- + +## 6. Orchestrators + +### 6.1 Single-Model Pipeline (`run_evaluation.py`) + +The main entry point. 
Orchestrates all six stages sequentially: + +```text +parse CLI args + │ + ├── --test mode ──► run pytest on all packages ──► exit + │ + ├── --evaluate-only mode ──► skip Stage 1 + │ ├── Stage 3 (quantitative) + │ ├── Stage 4 (contract) + │ ├── Stage 5 (qualitative) + │ └── Stage 6 (report) + │ + └── full pipeline mode + ├── Stage 1 (execution) ──► creates timestamped run folder + ├── Save evaluation config and repo info + ├── Stage 2 (read test-results.yaml from Stage 1) + ├── Stage 3 (quantitative) + ├── Stage 4 (contract, if --openapi provided) + ├── Stage 5 (qualitative) + ├── Stage 6 (report) + └── Print summary, exit 0 if all pass +``` + +**Resilience**: If the Strands swarm exits non-zero but AIDLC documents were produced, evaluation continues (the swarm may fail on a late handoff after all documents are written). + +### 6.2 Batch Evaluation (`run_batch_evaluation.py`) + +Runs `run_evaluation.py` in a loop for each selected model config: + +```text +discover_models() ← scans config/*.yaml, excludes default.yaml + │ + for each model: + │ ├── build CLI command with --executor-model override + │ ├── run as subprocess, capture stdout/stderr to log file + │ ├── find new timestamped run folder + │ ├── rename folder: <timestamp>-<slug>-<model-name> + │ └── write per-model batch-summary.yaml + │ + write batch-summary.yaml with timing and pass/fail for all models +``` + +Each model run is fully isolated — a separate subprocess invocation with its own run folder. 
+ +### 6.3 Cross-Model Comparison (`run_comparison_report.py`) + +Generates a side-by-side comparison matrix after batch evaluation: + +```text +find_model_runs() ← discovers run folders by model name suffix + │ + for each model: + │ └── collect() + extract_baseline() → BaselineMetrics + │ + load golden baseline (golden.yaml) + │ + generate_comparison_markdown() → comparison-report.md + generate_comparison_yaml() → comparison-data.yaml +``` + +The comparison table includes ~30 metrics across unit tests, contract tests, code quality, qualitative scores, artifacts, execution cost, and context size — with delta indicators (^ better, v worse) relative to the golden baseline. + +### 6.4 IDE Evaluation (`run_ide_evaluation.py`) + +Runs the AIDLC workflow through third-party IDE AI assistants: + +```text +get_adapter(name) ← lazy import from registry + │ + ├── check_prerequisites() + ├── adapter.run(config) ──► IDE-specific automation + ├── normalize_output() ──► standard run folder layout + └── run_evaluation.py --evaluate-only ──► stages 2-6 +``` + +**Adapter pattern**: Each IDE is implemented as a subclass of `IDEAdapter` with three methods: + +- `check_prerequisites()` — verify the IDE is installed and configured +- `run(config)` — execute the AIDLC process through the IDE +- `name` — human-readable identifier + +**Output normalization**: `normalizer.py` converts IDE-specific output layouts into the standard run folder structure expected by the evaluation pipeline, generating synthetic `run-meta.yaml` and `run-metrics.yaml`. + +Supported adapters: Cursor, Cline, Copilot, Kiro, Windsurf, Antigravity. + +--- + +## 7. Data Flow: YAML Artifact Graph + +Every stage communicates through YAML files in the run folder. No in-memory state crosses stage boundaries. 
+ +```text +Stage 1 (execution) + ├── writes: run-meta.yaml, run-metrics.yaml, test-results.yaml + ├── writes: aidlc-docs/**/*.md, workspace/**/* + │ +Stage 3 (quantitative) reads: workspace/ + └── writes: quality-report.yaml + │ +Stage 4 (contract) reads: workspace/, openapi.yaml (test input) + └── writes: contract-test-results.yaml + │ +Stage 5 (qualitative) reads: aidlc-docs/, golden-aidlc-docs/ (test input) + └── writes: qualitative-comparison.yaml + │ +Stage 6 (report) reads: ALL of the above YAML files + golden.yaml + └── writes: report.md, report.html +``` + +The orchestrator also writes `evaluation-config.yaml` (full resolved config snapshot) and updates `run-meta.yaml` with evaluation-level fields. + +--- + +## 8. Key Data Models + +### 8.1 Execution Metrics (`run-metrics.yaml`) + +```yaml +tokens: + total: {input_tokens, output_tokens, total_tokens, cache_read_tokens, cache_write_tokens} + per_agent: + executor: {input_tokens, output_tokens, total_tokens} + simulator: {input_tokens, output_tokens, total_tokens} +timing: + total_wall_clock_ms: int + handoffs: [{handoff: int, node_id: str, duration_ms: int}, ...] +handoff_patterns: + total_handoffs: int + sequence: [str, ...] + per_agent: {agent: {turn_count, total_duration_ms, avg_turn_duration_ms}} +artifacts: + workspace: {source_files, test_files, config_files, total_files, total_lines_of_code} + aidlc_docs: {inception_files, construction_files, total_files} +errors: + throttle_events, timeout_events, failed_tool_calls, model_error_events, ... 
+context_size: + total: {min_tokens, max_tokens, avg_tokens, median_tokens, sample_count} + per_agent: {executor: {...}, simulator: {...}} +``` + +### 8.2 Qualitative Scores (`qualitative-comparison.yaml`) + +```yaml +overall_score: float # 0.0 to 1.0 +phases: + - phase: inception + avg_intent: float + avg_design: float + avg_completeness: float + avg_overall: float + documents: + - path: inception/requirements.md + intent_similarity: float + design_similarity: float + completeness: float + overall: float + notes: str +``` + +### 8.3 Golden Baseline (`golden.yaml`) + +A flat numeric snapshot of ~30 key metrics from a promoted run. Used as the regression comparison target. Fields span execution cost, artifacts, test results, code quality, and qualitative scores. + +--- + +## 9. Tool Integration + +### 9.1 Strands SDK (Multi-Agent) + +The execution package uses the [Strands Agents SDK](https://github.com/strands-agents/sdk-python) for: + +- `Agent` — wraps a Bedrock model with a system prompt and tool set +- `Swarm` — orchestrates handoffs between agents with configurable limits (max handoffs, max iterations, execution timeout, node timeout) +- `@tool` decorator — registers Python functions as callable tools for agents +- `BedrockModel` — Bedrock model provider with configurable retry policy +- Hook system — `BeforeNodeCallEvent` / `AfterNodeCallEvent` for progress tracking + +### 9.2 Amazon Bedrock + +All LLM calls go through Amazon Bedrock via boto3. 
Configuration: + +- Read timeout: 900s (15 min) for execution agents, 300s (5 min) for the qualitative scorer +- Connect timeout: 30s +- Retry policy: 10 attempts with adaptive mode +- Models: Configurable per role (executor, simulator, scorer) + +### 9.3 Static Analysis Tools + +| Tool | Purpose | Output Format | Graceful Degradation | +| --------- | ----------------------- | -------------- | ------------------------------ | +| ruff | Python linting | JSON | Skipped if not on PATH | +| bandit | Python security | JSON | Skipped if not on PATH | +| semgrep | Multi-language security | JSON | Skipped if not on PATH | +| eslint | JS/TS linting | JSON | Falls back to npx | +| npm audit | JS dependency security | JSON | Needs package-lock.json | +| PMD CPD | Code duplication | XML | Configurable path or PATH scan | + +--- + +## 10. Security Model + +### 10.1 File Sandboxing + +All file operations performed by AI agents are sandboxed to the run folder: + +- `_resolve_safe(run_folder, relative_path)` resolves the path and verifies it stays within the run folder boundary +- Path traversal attempts (e.g., `../../etc/passwd`) are rejected with a `ValueError` +- Applied to: `read_file`, `write_file`, `list_files`, `run_command` + +### 10.2 Command Sandboxing + +The `run_command` tool provides a restricted shell environment: + +- Only `PATH`, `HOME`, `LANG`, `TERM` are set (plus tool-specific vars like `UV_CACHE_DIR`) +- `HOME` is set to the run folder to prevent reading host user configuration +- Commands have a configurable timeout (default 120s) +- Output is truncated at 50K characters + +### 10.3 Server Isolation (Contract Tests) + +The contract test server runs in its own venv: + +- `ServerProcess._ensure_venv()` creates an isolated venv in the workspace project +- This prevents `uv run` from walking up the directory tree and resolving the parent project +- The server is started via the venv's own Python binary + +--- + +## 11. 
Test Cases + +Test cases live in `test_cases/` and follow a standard structure: + +```text +test_cases/<case-name>/ + ├── vision.md # Project vision and constraints + ├── tech-env.md # Technical environment requirements + ├── openapi.yaml # API contract spec with x-test-cases + ├── golden-aidlc-docs/ # Reference aidlc-docs output (golden baseline) + │ ├── inception/ + │ │ ├── requirements.md + │ │ └── ... + │ └── construction/ + │ ├── code-generation.md + │ └── ... + └── golden.yaml # Promoted baseline metrics +``` + +The default test case is `sci-calc` (a scientific calculator API). All CLI defaults point to this test case. + +--- + +## 12. Extension Points + +### Adding a New Model + +1. Create `config/<model-name>.yaml` with `models.executor.model_id` set to the Bedrock model ID +2. The batch runner will automatically discover it + +### Adding a New IDE Adapter + +1. Create `packages/ide-harness/src/ide_harness/adapters/<name>.py` +2. Implement the `IDEAdapter` abstract class (three methods: `name`, `check_prerequisites`, `run`) +3. Register in `_ADAPTER_MAP` in `packages/ide-harness/src/ide_harness/registry.py` + +### Adding a New Static Analysis Tool + +1. Add an analyzer function in `packages/quantitative/src/quantitative/analyzers.py` (follow the `run_ruff` pattern) +2. Define a finding model if needed in `models.py` +3. Call it from `scanner.py` based on project type detection + +### Adding a New Test Case + +1. Create a directory under `test_cases/<case-name>/` +2. Provide `vision.md`, `tech-env.md`, and optionally `openapi.yaml` +3. Run the full pipeline once to generate the golden baseline +4. Use `reporting.baseline.promote()` to create `golden.yaml` +5. Copy the run's `aidlc-docs/` as `golden-aidlc-docs/` + +--- + +## 13. 
Dependency Stack + +| Component | Technology | +| --------------------- | ------------------------ | +| Language | Python 3.13+ | +| Package manager | uv (workspace mode) | +| AI orchestration | Strands Agents SDK | +| LLM provider | Amazon Bedrock (boto3) | +| HTTP client | httpx (contract tests) | +| ASGI server | uvicorn (contract tests) | +| Test framework | pytest | +| Serialization | PyYAML | +| Linting | ruff | +| Security scanning | bandit, semgrep | +| Duplication detection | PMD CPD (external) | diff --git a/scripts/aidlc-evaluator/CONTRIBUTING.md b/scripts/aidlc-evaluator/CONTRIBUTING.md new file mode 100644 index 00000000..c8b1fb7f --- /dev/null +++ b/scripts/aidlc-evaluator/CONTRIBUTING.md @@ -0,0 +1,159 @@ +# Contributing to AI-DLC Evaluation Framework + +Thank you for contributing to the AI-DLC workflows evaluation and reporting framework! + +## Getting Started + +### Prerequisites + +- Python 3.13+ +- [uv](https://github.com/astral-sh/uv) package manager +- Git + +### Setup + +```bash +# Clone the repository +git clone <repository-url> +cd aidlc-evaluation-framework + +# Install dependencies +uv sync + +# Run tests to verify setup +uv run pytest +``` + +## Development Workflow + +### 1. Create a Branch + +```bash +git checkout -b feature/your-feature-name +``` + +### 2. Make Changes + +Work in the appropriate package: + +- `aidlc-runner/` - Execution Framework (two-agent AIDLC workflow runner) +- `packages/qualitative/` - Semantic Evaluation (intent & design similarity scoring) +- `packages/quantitative/` - Code Evaluation (linting, security, organization) +- `packages/nonfunctional/` - NFR Evaluation (tokens, timing, consistency) +- `packages/reporting/` - Report generation +- `packages/shared/` - Common utilities + +Or contribute to other work streams: + +- `test_cases/` - Golden Test Cases (baseline inputs) +- `docs/writing-inputs/` - Vision and tech-env document guides +- `.github/workflows/` - GitHub CI/CD Integration & Management + +### 3. 
Run Tests + +```bash +# Run all tests +uv run pytest + +# Run specific package tests +uv run pytest tests/test_qualitative.py + +# Run with coverage +uv run pytest --cov +``` + +### 4. Lint Your Code + +```bash +# Check code style +uv run ruff check . + +# Auto-fix issues +uv run ruff check --fix . + +# Format code +uv run ruff format . +``` + +### 5. Commit Changes + +Write clear, descriptive commit messages: + +```bash +git add . +git commit -m "Add token tracking to nonfunctional package" +``` + +### 6. Submit a Pull Request + +- Push your branch to the repository +- Open a PR with a clear description of changes +- Link to any related issues +- Wait for automated tests to pass +- Address review feedback + +## Work Streams + +The project is organized around six big rocks. Your changes will typically fall into one or more of these: + +| Work Stream | Description | Package / Area | +| ----------------------- | --------------------------------------------- | ------------------------- | +| **Golden Test Case** | Curated baseline test inputs | `test_cases/` | +| **Execution Framework** | Two-agent AIDLC workflow runner (Owner: Jeff) | `aidlc-runner/` | +| **Semantic Evaluation** | Intent & design similarity scoring | `packages/qualitative/` | +| **Code Evaluation** | Linting, security, organization | `packages/quantitative/` | +| **NFR Evaluation** | Tokens, timing, consistency | `packages/nonfunctional/` | +| **GitHub CI/CD** | Pipeline integration & management | `.github/workflows/` | + +## Code Standards + +### Python Style + +- Follow PEP 8 (enforced by Ruff) +- Use type hints +- Maximum line length: 100 characters +- Write docstrings for public functions and classes + +### Testing + +- Write tests for new functionality +- Maintain or improve code coverage +- Use descriptive test names: `test_<what>_<condition>_<expected>` + +### Documentation + +- Update README.md if adding new features +- Add docstrings to new modules and functions +- Update relevant docs in 
`docs/` directory + +## Package Dependencies + +When adding dependencies: + +1. Add to the appropriate `pyproject.toml` in `packages/<package>/` or `aidlc-runner/` +2. Run `uv sync` to update lock file +3. Document why the dependency is needed in your PR + +## Reporting Issues + +When reporting bugs or requesting features: + +- Use GitHub Issues +- Provide clear reproduction steps +- Include relevant logs or error messages +- Specify which package is affected + +## Questions? + +- Review [FAQ.md](./FAQ.md) for common questions +- Check [OPERATING_PRINCIPLES.md](./OPERATING_PRINCIPLES.md) for decision-making guidance +- Ask in PR comments or open a discussion + +## Code of Conduct + +- Be respectful and constructive +- Focus on the code, not the person +- Welcome diverse perspectives +- Help others learn and grow + +Thank you for helping improve the AI-DLC evaluation framework! diff --git a/scripts/aidlc-evaluator/FAQ.md b/scripts/aidlc-evaluator/FAQ.md new file mode 100644 index 00000000..7b52c2cf --- /dev/null +++ b/scripts/aidlc-evaluator/FAQ.md @@ -0,0 +1,115 @@ +# AI-DLC Workflows Evaluation & Reporting Framework - FAQ + +## What is this? + +A comprehensive testing and reporting framework that validates changes to the AI-DLC workflows repository. It automatically evaluates code quality, semantic correctness, and performance to ensure changes don't negatively impact the system. + +## Who is this for? + +- **Maintainers** who need confidence that changes are safe to merge +- **Contributors** who want to demonstrate their changes improve (or don't harm) the system +- **Users** who depend on consistent, high-quality AI-assisted development workflows + +## What are the major work streams? + +The framework is organized around six big rocks: + +**1. 
Golden Test Case** + +- Curated baseline test cases containing full AIDLC docs and code output +- Versioned reference inputs that all evaluations run against +- Ensures consistent, reproducible evaluation across changes + +**2. Execution Framework (Jeff)** + +- Core orchestration engine that runs golden test cases through each evaluation +- Manages the pipeline from test case input to structured results output +- Coordinates across all evaluation dimensions + +**3. Semantic Evaluation** + +- Uses AI to semantically evaluate outputs at major human review points +- Scores outputs for correctness, completeness, and appropriateness +- Validates that AI-generated content meets quality standards +- All semantic metrics are reported **@k** — each evaluation runs multiple trials to account for non-determinism in AI-based grading (see "What does @k mean?" below) + +**4. Code Evaluation** + +- **Linting:** Code style correctness +- **Security:** Semgrep analysis for vulnerabilities +- **Organization:** Code duplication detection, library usage patterns +- Produces numeric scores (e.g., "3 high-severity security issues") + +**5. NFR Evaluation** + +- Token consumption per workflow +- Execution time measurements +- Cross-model consistency checks +- Resource utilization metrics + +**6. GitHub CI/CD Integration & Management** + +- Automated pipelines triggering evaluations on PRs +- Human-readable report generation and attachment +- Versioned report archiving for historical comparison + +## How does it work? + +1. **Golden test cases** define the reference inputs (AIDLC docs + expected code output) +2. The **execution framework** runs these test cases through each evaluation dimension +3. **Semantic, code, and NFR evaluations** produce structured results +4. **Reports** are generated summarizing impact across all dimensions +5. **GitHub CI/CD** automates the entire pipeline on PRs and attaches reports for review +6. 
Versioned reports are archived for historical comparison + +## What environments are supported? + +Kiro is a first-class citizen for testing, but the framework supports multiple AI tools and environments to meet customers where they are. + +## What does @k mean for semantic metrics? + +AI-based evaluations are non-deterministic — the same input can produce different scores across runs. To get trustworthy results, the framework runs each semantic evaluation multiple times (*k* trials) and reports two complementary metrics (see [Anthropic: Demystifying Evals for AI Agents](https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents)): + +- **pass@k** — The probability of at least one success in *k* attempts. Answers: *"Can this workflow produce a correct result?"* Higher *k* increases the score, since more attempts mean higher odds of at least one success. +- **pass^k** — The probability that *all k* attempts succeed. Answers: *"Does this workflow consistently produce correct results?"* Higher *k* makes this harder to achieve, since every trial must pass. + +At *k*=1 the two metrics are identical (both equal the per-trial success rate). As *k* grows they diverge — pass@k approaches 100% while pass^k drops toward 0%. Together they tell you both the capability ceiling and the reliability floor of a workflow change. + +Code evaluation and NFR metrics are deterministic and do not require @k. + +## How do I interpret the reports? + +Reports include: + +- **Semantic scores @k:** AI-evaluated ratings with pass@k (capability) and pass^k (reliability) +- **Code scores:** Numeric metrics for linting, security, duplication (deterministic) +- **NFR metrics:** Token usage, execution time, consistency (deterministic) +- **Trend analysis:** Comparison to previous versions (against golden test cases) +- **Pass/fail gates:** Clear indicators of whether changes meet thresholds + +## What if my change shows a regression? 
+ +Regressions don't automatically block merges—they provide context. Work with maintainers to: + +- Understand if the regression is acceptable given the benefits +- Identify ways to mitigate the regression +- Document known trade-offs + +## How does this relate to the AI-DLC workflows repository? + +This framework monitors and validates the [AI-DLC workflows](https://github.com/awslabs/aidlc-workflows) to ensure changes maintain or improve quality. It's a testing layer on top of the workflows themselves. + +## Can I run tests locally before submitting a PR? + +Yes—the framework is designed to run in CI/CD but can also be executed locally to get early feedback. + +## How are reports versioned? + +Each test run produces a numbered/named version that includes: + +- Timestamp and commit SHA +- Full test results +- Comparison to baseline +- Human-readable summary + +Reports are stored for historical analysis and trend tracking. diff --git a/scripts/aidlc-evaluator/README.md b/scripts/aidlc-evaluator/README.md new file mode 100644 index 00000000..509638ae --- /dev/null +++ b/scripts/aidlc-evaluator/README.md @@ -0,0 +1,467 @@ +# AI-DLC Workflows Evaluation & Reporting Framework + +Automated testing and reporting framework for validating changes to the [AI-DLC workflows](https://github.com/awslabs/aidlc-workflows) repository. + +## Overview + +This framework is organized around six major work streams ("big rocks"): + +1. **Golden Test Case** — Curated baseline test cases (AIDLC docs + code output) used as reference inputs for all evaluations +2. **Execution Framework** — Core orchestration that runs test cases through the evaluation pipeline +3. **Semantic Evaluation** — AI-based evaluation of output correctness, completeness, and appropriateness (reported @k to account for non-determinism) +4. **Code Evaluation** — Static analysis of generated code (linting, security scanning, duplication detection) +5. 
**NFR Evaluation** — Non-functional requirements testing (token consumption, execution time, cross-model consistency) +6. **GitHub CI/CD Integration & Management** — Automated pipelines that trigger evaluations on PRs and attach reports + +## Quick Start + +```bash +# Install dependencies +uv sync + +# Run all unit tests +uv run python run.py test +# Note: On Windows, 7 tests in test_run_command.py are expected to fail +# because they use Unix shell commands (echo, exit, sleep, etc.) not available on Windows. + +# Build sandbox docker image +./docker/sandbox/build.sh + +# Full pipeline: execute AIDLC workflow + evaluate + report (requires Bedrock) with defaults +uv run python run.py full + +# Full pipeline: execute AIDLC workflow + evaluate + report (requires Bedrock) +uv run python run.py full \ + --vision test_cases/sci-calc/vision.md \ + --tech-env test_cases/sci-calc/tech-env.md \ + --golden test_cases/sci-calc/golden-aidlc-docs \ + --openapi test_cases/sci-calc/openapi.yaml + +# Evaluate an existing run (skip execution, just score via Bedrock) +uv run python run.py full \ + --evaluate-only runs/<run-folder>/aidlc-docs \ + --golden test_cases/sci-calc/golden-aidlc-docs \ + --openapi test_cases/sci-calc/openapi.yaml +``` + +## Evaluation Pipeline + +The evaluation pipeline (`run.py full` or `scripts/run_evaluation.py`) orchestrates six stages: + +| Stage | Package | Description | +| --------------- | ----------------------- | --------------------------------------------------------------- | +| 1. Execution | `packages/execution` | Runs the AIDLC two-agent workflow to produce docs + code | +| 2. Post-Run | (inside execution) | Installs deps and runs the generated project's tests | +| 3. Quantitative | `packages/quantitative` | Lints, security-scans, and duplication-checks generated code | +| 4. Contract | `packages/contracttest` | Spins up the generated app and validates API endpoints | +| 5. 
Qualitative | `packages/qualitative` | Compares generated docs against golden baseline via Bedrock LLM | +| 6. Report | `packages/reporting` | Generates consolidated Markdown + HTML reports | + +Output for each run is written to a timestamped folder under `runs/`: + +```txt +runs/<timestamp>/ + ├── aidlc-docs/ # AIDLC workflow documents + ├── workspace/ # Generated application code + ├── run-meta.yaml # Run identity + config + ├── run-metrics.yaml # Tokens, timing, artifacts, errors + ├── test-results.yaml # Post-run test output + ├── quality-report.yaml # Lint + security + duplication findings + ├── contract-test-results.yaml # API endpoint validation + ├── qualitative-comparison.yaml # Semantic scoring + ├── evaluation-config.yaml # Full resolved config snapshot + ├── report.md # Consolidated Markdown report + └── report.html # Consolidated HTML report +``` + +## Configuration + +### Config file (`config/default.yaml`) + +The main configuration file controls AWS settings, models, swarm parameters, timeouts, and tool paths. 
Edit this file to change defaults, or pass a custom config with `--config`: + +```yaml +aws: + profile: "default" + region: "us-east-1" + +models: + executor: + provider: "bedrock" + model_id: "global.anthropic.claude-opus-4-6-v1" + simulator: + provider: "bedrock" + model_id: "global.anthropic.claude-opus-4-6-v1" + scorer: + provider: "bedrock" + model_id: "global.anthropic.claude-opus-4-6-v1" + +aidlc: + rules_source: "git" # "git" or "local" + rules_repo: "https://github.com/awslabs/aidlc-workflows.git" + rules_ref: "main" + rules_local_path: null + +swarm: + max_handoffs: 200 + max_iterations: 200 + execution_timeout: 14400 + node_timeout: 3600 + +runs: + output_dir: "./runs" + +execution: + enabled: true + command_timeout: 120 + post_run_tests: true + post_run_timeout: 300 + sandbox: + enabled: true + image: aidlc-sandbox:latest + memory: 2g + cpus: 2 + +tools: + pmd_path: null # Path to PMD executable; if null, looks for 'pmd' on PATH +``` + +Precedence: `CLI flags > YAML config > built-in defaults` + +### Model-specific configs + +Per-model config files in `config/` override the executor model while inheriting everything else from `default.yaml`: + +| File | Model | +| ----------------------------- | ------------------------------------------- | +| `config/opus-4-6.yaml` | Claude Opus 4.6 | +| `config/opus-4-5.yaml` | Claude Opus 4.5 | +| `config/sonnet-4-6.yaml` | Claude Sonnet 4.6 | +| `config/sonnet-4-5.yaml` | Claude Sonnet 4.5 | +| `config/nova-premier.yaml` | Amazon Nova Premier | +| `config/nova-pro.yaml` | Amazon Nova Pro | +| `config/nova-lite.yaml` | Amazon Nova Lite | +| `config/mistral-large-3.yaml` | Mistral Large 3 (675B) | +| `config/devstral-2.yaml` | Mistral Devstral 2 (123B, code-specialized) | + +### Docker Sandbox + +The evaluation framework runs AI-generated code inside an isolated Docker container to prevent untrusted code from affecting the host system. 
The sandbox image includes Python 3.13 + uv, Node.js 22 + npm, and common build tools, running as a non-root user. + +#### Prerequisites + +Docker must be installed and running on the host machine. + +#### Building the sandbox image + +```bash +# Build the image (one-time setup, or after Dockerfile changes) +./docker/sandbox/build.sh + +# Or build manually +docker build -t aidlc-sandbox:latest docker/sandbox/ +``` + +This produces the `aidlc-sandbox:latest` image referenced by the default configuration. + +#### Configuration + +Sandbox settings are in `config/default.yaml` under `execution.sandbox`: + +```yaml +execution: + sandbox: + enabled: true # Set to false to run generated code directly on the host + image: aidlc-sandbox:latest # Docker image name (must be built first) + memory: 2g # Container memory limit + cpus: 2 # Container CPU limit +``` + +When sandbox is enabled, post-run tests (stage 2) and contract test servers (stage 4) execute inside the container. The generated `workspace/` directory is mounted into the container at `/workspace`. If Docker is not available or `enabled` is set to `false`, commands run directly on the host. + +### Tool configuration + +**PMD (code duplication detection):** PMD CPD is used for copy-paste detection in stage 3. Configure the path in `config/default.yaml`: + +```yaml +tools: + pmd_path: /path/to/pmd # Absolute path to PMD executable + # pmd_path: null # null = search PATH automatically +``` + +If PMD is not found, duplication analysis is skipped with a note — it does not fail the evaluation. 
+ +### Pipeline CLI flags + +```bash +uv run python run.py full \ + --vision test_cases/sci-calc/vision.md \ + --tech-env test_cases/sci-calc/tech-env.md \ + --golden test_cases/sci-calc/golden-aidlc-docs \ + --openapi test_cases/sci-calc/openapi.yaml \ + --config config/default.yaml \ + --profile my-aws-profile \ + --region us-west-2 \ + --executor-model global.anthropic.claude-opus-4-6-v1 \ + --scorer-model us.anthropic.claude-sonnet-4-5-20250929-v1:0 \ + --report-format both +``` + +Supported flags: + +- `--config` — path to YAML config file (default: `config/default.yaml`) +- `--test` — run unit tests only +- `--vision`, `--tech-env` — execution inputs +- `--evaluate-only` — score an existing `aidlc-docs` folder without re-running execution +- `--golden` — reference baseline docs directory +- `--openapi` — contract test spec +- `--report-format` — `markdown`, `html`, or `both` +- `--baseline` — override path to `golden.yaml` (otherwise auto-discovered next to `--golden`) +- `--output-dir` — override run output folder +- `--results` — write qualitative results YAML to custom path +- `--profile`, `--region` — AWS credentials/region for Bedrock +- `--executor-model` — execution model override +- `--scorer-model` — qualitative scoring model override +- `--rules-ref` — git ref (branch/tag/commit) for AIDLC rules + +## Batch Evaluation + +Run the evaluation pipeline across multiple Bedrock models sequentially, then generate a cross-model comparison report. 
+ +### List available models + +```bash +uv run python run.py batch --list +``` + +### Run batch evaluation + +```bash +# Run all configured models +uv run python run.py batch --models all + +# Run specific models (names match config file stems in config/) +uv run python run.py batch --models nova-pro,sonnet-4-5 + +# Override AWS settings +uv run python run.py batch --models all \ + --profile my-aws-profile \ + --region us-east-1 +``` + +Each model run is stored under `runs/<model-name>/` with full evaluation artifacts. A `batch-summary.yaml` is written to the runs directory with timing and pass/fail status for each model. + +### Generate cross-model comparison + +After batch evaluation completes, generate a comparison matrix: + +```bash +# Compare all model runs found under runs/ +uv run python run.py compare + +# Compare specific models against golden baseline +uv run python run.py compare \ + --models nova-pro,sonnet-4-5 \ + --baseline test_cases/sci-calc/golden.yaml +``` + +This produces `runs/comparison/comparison-report.md` and `runs/comparison/comparison-data.yaml` with side-by-side metrics across all models (unit tests, contract tests, code quality, qualitative scores, token usage, and timing). + +## CLI Evaluation + +Run the AIDLC evaluation through CLI-based AI assistants (Claude Code, Kiro CLI) using the CLI harness (`packages/cli-harness`). + +### List available adapters + +```bash +uv run python run.py cli --list +``` + +Supported adapters: `claude-code`, `kiro-cli`. 
+ +### Run CLI evaluation + +```bash +# Run evaluation through Claude Code +uv run python run.py cli --cli claude-code \ + --vision test_cases/sci-calc/vision.md \ + --golden test_cases/sci-calc/golden-aidlc-docs + +# Run through Kiro CLI with a specific model +uv run python run.py cli --cli kiro-cli \ + --vision test_cases/sci-calc/vision.md \ + --golden test_cases/sci-calc/golden-aidlc-docs \ + --model claude-sonnet-4 + +# Check prerequisites for an adapter +uv run python run.py cli --cli claude-code --check-only +``` + +Output is written to `runs/<cli-name>-<timestamp>-<uuid>/`. The CLI harness runs the adapter, then invokes `scripts/run_evaluation.py --evaluate-only` for scoring (stages 2–6). + +## IDE Evaluation + +Run the AIDLC evaluation through third-party IDE AI assistants using the IDE harness (`packages/ide-harness`). + +### List available adapters + +```bash +uv run python run.py ide --list +``` + +Supported adapters: Cursor, Cline, Copilot, Kiro, Windsurf, Antigravity. + +### Run IDE evaluation + +```bash +# Run evaluation through Cursor +uv run python run.py ide --ide cursor \ + --vision test_cases/sci-calc/vision.md \ + --golden test_cases/sci-calc/golden-aidlc-docs + +# Check prerequisites for an IDE adapter +uv run python run.py ide --ide kiro --check-only +``` + +Output is written to `runs/ide-<adapter-name>/`. + +## Extension Hook Testing + +Test the AIDLC workflow with different rules extension configurations. The extension hook feature allows progressive loading of extensions (security, performance, observability) based on opt-in questions. + +```bash +# List available extension configurations +uv run python run.py ext-test --list-configs + +# Run standard test (all extensions vs no extensions) +uv run python run.py ext-test --scenario sci-calc + +# Use specific rules branch with extension support +uv run python run.py ext-test --scenario sci-calc \ + --rules-ref feat/extension_hook_question_split +``` + +This runs the evaluation twice: + +1. 
With all extension opt-ins answered "YES" (maximum guidance) +2. With all extension opt-ins answered "NO" (baseline only) + +Results are saved to `runs/<scenario>/extension-test/` with a comparison report showing the impact of different extension configurations. + +See [Extension Hook Testing Guide](./docs/extension-hook-testing.md) for detailed documentation. + +## Trend Reporting + +Generate cross-release trend reports that track evaluation metrics over time. Fetches evaluation bundles from GitHub releases and Actions artifacts, then renders HTML, Markdown, and YAML reports. + +```bash +# Generate trend report (requires gh CLI authenticated) +uv run python run.py trend --baseline test_cases/sci-calc/golden.yaml + +# HTML only with verbose output +uv run python run.py trend --baseline test_cases/sci-calc/golden.yaml --format html -v + +# Include local evaluation bundles +uv run python run.py trend --baseline test_cases/sci-calc/golden.yaml \ + --local-bundle runs/my-run/report.zip + +# Gate mode (exit non-zero on regressions) +uv run python run.py trend --baseline test_cases/sci-calc/golden.yaml --gate +``` + +The HTML executive summary displays six metric cards: + +- **Qualitative Score** — semantic quality vs golden baseline (higher is better) +- **Contract Tests** — API pass rate as passed/total (higher is better) +- **Unit Tests** — pass rate shown as percentage (higher is better) +- **Lint Findings** — static analysis issues (lower is better) +- **Execution Time** — generation duration (lower is better) +- **Total Tokens** — LLM token consumption (lower is better) + +Output is written to a timestamped folder under the output directory (default: `runs/`). + +A sample HTML report is available at [`packages/trend-reports/examples/trend-report.html`](./packages/trend-reports/examples/trend-report.html). 
+ +## Running the Execution Component Directly + +For full execution-level controls you can run `aidlc-runner` directly: + +```bash +uv run aidlc-runner \ + --vision test_cases/sci-calc/vision.md \ + --tech-env test_cases/sci-calc/tech-env.md \ + --config config/default.yaml \ + --aws-profile my-aws-profile \ + --aws-region us-west-2 \ + --executor-model global.anthropic.claude-opus-4-6-v1 \ + --simulator-model us.anthropic.claude-sonnet-4-5-20250929-v1:0 \ + --output-dir ./runs +``` + +Execution-specific toggles: + +- `--rules-path <local-rules-dir>` — forces local rules source +- `--no-exec` — disable in-workflow command execution +- `--no-post-tests` — disable post-run tests + +## Repository Structure + +```txt +. +├── run.py # Master entry point — dispatches to evaluation modes +├── scripts/ # Specialized run scripts +│ ├── run_evaluation.py # Single-model evaluation pipeline +│ ├── run_batch_evaluation.py # Multi-model batch evaluation +│ ├── run_comparison_report.py # Cross-model comparison report generator +│ ├── run_cli_evaluation.py # CLI adapter evaluation runner +│ ├── run_ide_evaluation.py # IDE adapter evaluation runner +│ ├── run_extension_test.py # Extension hook testing (opt-in configurations) +│ ├── run_trend_report.py # Cross-release trend report generation +│ └── README.md # Scripts documentation +├── config/ +│ ├── default.yaml # Default configuration (models, AWS, timeouts, tools) +│ ├── nova-premier.yaml # Amazon Nova Premier executor override +│ ├── nova-pro.yaml # Amazon Nova Pro executor override +│ ├── sonnet-4-5.yaml # Claude Sonnet 4.5 executor override +│ └── sonnet-4-6.yaml # Claude Sonnet 4.6 executor override +├── packages/ +│ ├── execution/ # AIDLC workflow runner (two-agent Strands orchestrator) +│ ├── qualitative/ # Semantic evaluation — intent & design similarity via Bedrock +│ ├── quantitative/ # Code evaluation — linting, security, duplication (PMD CPD) +│ ├── contracttest/ # API contract testing against OpenAPI specs +│ ├── 
nonfunctional/ # NFR evaluation — tokens, timing, consistency +│ ├── reporting/ # Consolidated report generation (Markdown + HTML) +│ ├── trend-reports/ # Cross-release trend reporting (HTML, Markdown, YAML) +│ ├── cli-harness/ # CLI adapter framework (Claude Code, Kiro CLI) +│ ├── ide-harness/ # IDE adapter framework (Cursor, Cline, Kiro, etc.) +│ └── shared/ # Common utilities +├── test_cases/ # Golden test cases (vision + tech-env + golden aidlc-docs) +├── runs/ # Run output folders (one per evaluation run) +├── docker/ +│ └── sandbox/ # Dockerfile + build script for isolated execution +├── docs/ # Additional documentation +│ ├── extension-hook-testing.md # Extension hook testing guide +│ ├── ide-harness-design.md # IDE adapter architecture +│ └── file-structure.md # Project file organization reference +├── pyproject.toml # Workspace configuration +└── uv.lock # Dependency lock file +``` + +## Documentation + +- [FAQ](./FAQ.md) — Common questions and answers +- [Contributing](./CONTRIBUTING.md) — Guidelines for submitting changes +- [Architecture](./ARCHITECTURE.md) — System design and implementation details +- [Extension Hook Testing](./docs/extension-hook-testing.md) — Testing AIDLC with different extension configurations +- [IDE Harness Design](./docs/ide-harness-design.md) — Architecture of the IDE adapter framework +- [File Structure](./docs/file-structure.md) — Project file organization reference + +## Contributing + +See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines on submitting changes. 
+ +## License + +[License information to be added] diff --git a/scripts/aidlc-evaluator/config/default.yaml b/scripts/aidlc-evaluator/config/default.yaml new file mode 100644 index 00000000..28992087 --- /dev/null +++ b/scripts/aidlc-evaluator/config/default.yaml @@ -0,0 +1,43 @@ +# aws: +# profile: "default" +# region: "us-east-1" + +models: + executor: + provider: "bedrock" + model_id: "global.anthropic.claude-opus-4-6-v1" + simulator: + provider: "bedrock" + model_id: "global.anthropic.claude-opus-4-6-v1" + scorer: + provider: "bedrock" + model_id: "global.anthropic.claude-opus-4-6-v1" + +aidlc: + rules_source: "git" + rules_repo: "https://github.com/awslabs/aidlc-workflows" + rules_ref: "main" + rules_local_path: null + +swarm: + max_handoffs: 200 + max_iterations: 200 + execution_timeout: 14400 + node_timeout: 3600 + +runs: + output_dir: "./runs" + +execution: + enabled: true + command_timeout: 120 + post_run_tests: true + post_run_timeout: 300 + sandbox: + enabled: true + image: aidlc-sandbox:latest + memory: 2g + cpus: 2 + +tools: + pmd_path: null # Path to PMD executable; if null, looks for 'pmd' on PATH diff --git a/scripts/aidlc-evaluator/config/devstral-2.yaml b/scripts/aidlc-evaluator/config/devstral-2.yaml new file mode 100644 index 00000000..34fe62d1 --- /dev/null +++ b/scripts/aidlc-evaluator/config/devstral-2.yaml @@ -0,0 +1,5 @@ +# Devstral 2 (123B, code-specialized) — override executor model only; all other settings inherit from default.yaml +# No global or cross-region inference profile available; region-specific only +models: + executor: + model_id: "mistral.devstral-2-123b" diff --git a/scripts/aidlc-evaluator/config/mistral-large-3.yaml b/scripts/aidlc-evaluator/config/mistral-large-3.yaml new file mode 100644 index 00000000..8a3e7744 --- /dev/null +++ b/scripts/aidlc-evaluator/config/mistral-large-3.yaml @@ -0,0 +1,5 @@ +# Mistral Large 3 (675B) — override executor model only; all other settings inherit from default.yaml +# No global or 
cross-region inference profile available; region-specific only +models: + executor: + model_id: "mistral.mistral-large-3-675b-instruct" diff --git a/scripts/aidlc-evaluator/config/nova-lite.yaml b/scripts/aidlc-evaluator/config/nova-lite.yaml new file mode 100644 index 00000000..a19aa434 --- /dev/null +++ b/scripts/aidlc-evaluator/config/nova-lite.yaml @@ -0,0 +1,4 @@ +# Nova Lite — override executor model only; all other settings inherit from default.yaml +models: + executor: + model_id: "us.amazon.nova-lite-v1:0" diff --git a/scripts/aidlc-evaluator/config/nova-premier.yaml b/scripts/aidlc-evaluator/config/nova-premier.yaml new file mode 100644 index 00000000..39d05c7b --- /dev/null +++ b/scripts/aidlc-evaluator/config/nova-premier.yaml @@ -0,0 +1,4 @@ +# Nova Premier — override executor model only; all other settings inherit from default.yaml +models: + executor: + model_id: "us.amazon.nova-premier-v1:0" diff --git a/scripts/aidlc-evaluator/config/nova-pro.yaml b/scripts/aidlc-evaluator/config/nova-pro.yaml new file mode 100644 index 00000000..0e5c0ff5 --- /dev/null +++ b/scripts/aidlc-evaluator/config/nova-pro.yaml @@ -0,0 +1,4 @@ +# Nova Pro — override executor model only; all other settings inherit from default.yaml +models: + executor: + model_id: "us.amazon.nova-pro-v1:0" diff --git a/scripts/aidlc-evaluator/config/opus-4-5.yaml b/scripts/aidlc-evaluator/config/opus-4-5.yaml new file mode 100644 index 00000000..dcd8d964 --- /dev/null +++ b/scripts/aidlc-evaluator/config/opus-4-5.yaml @@ -0,0 +1,4 @@ +# Claude Opus 4.5 — override executor model only; all other settings inherit from default.yaml +models: + executor: + model_id: "global.anthropic.claude-opus-4-5-20251101-v1:0" diff --git a/scripts/aidlc-evaluator/config/opus-4-6.yaml b/scripts/aidlc-evaluator/config/opus-4-6.yaml new file mode 100644 index 00000000..8aa8c892 --- /dev/null +++ b/scripts/aidlc-evaluator/config/opus-4-6.yaml @@ -0,0 +1,4 @@ +# Claude Opus 4.6 — override executor model only; all 
other settings inherit from default.yaml +models: + executor: + model_id: "global.anthropic.claude-opus-4-6-v1" diff --git a/scripts/aidlc-evaluator/config/sonnet-4-5.yaml b/scripts/aidlc-evaluator/config/sonnet-4-5.yaml new file mode 100644 index 00000000..273354b4 --- /dev/null +++ b/scripts/aidlc-evaluator/config/sonnet-4-5.yaml @@ -0,0 +1,4 @@ +# Claude Sonnet 4.5 — override executor model only; all other settings inherit from default.yaml +models: + executor: + model_id: "global.anthropic.claude-sonnet-4-5-20250929-v1:0" diff --git a/scripts/aidlc-evaluator/config/sonnet-4-6.yaml b/scripts/aidlc-evaluator/config/sonnet-4-6.yaml new file mode 100644 index 00000000..a146eacd --- /dev/null +++ b/scripts/aidlc-evaluator/config/sonnet-4-6.yaml @@ -0,0 +1,4 @@ +# Claude Sonnet 4.6 — override executor model only; all other settings inherit from default.yaml +models: + executor: + model_id: "global.anthropic.claude-sonnet-4-6" diff --git a/scripts/aidlc-evaluator/docker/sandbox/Dockerfile b/scripts/aidlc-evaluator/docker/sandbox/Dockerfile new file mode 100644 index 00000000..6606c8f9 --- /dev/null +++ b/scripts/aidlc-evaluator/docker/sandbox/Dockerfile @@ -0,0 +1,52 @@ +# Multi-language sandbox image for running AI-generated code in isolation. +# +# Includes Python 3.13 + uv, Node.js 22 + npm, and common build tools. +# Runs as a non-root user with no credentials or host tools. 
+# +# Security notes: +# - Base image is intentionally not pinned to a hash to receive security updates +# - HEALTHCHECK is omitted as this is an ephemeral test sandbox, not a service +# - RUN commands use pipes without pipefail, acceptable for dependency installation + +# checkov:skip=CKV_DOCKER_2:HEALTHCHECK not needed for ephemeral test sandbox +# nosemgrep: dockerfile-source-not-pinned +FROM public.ecr.aws/docker/library/python:3.13-slim AS base + +# Install system dependencies and Node.js 22 +# nosemgrep: set-pipefail +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + gcc \ + g++ \ + make \ + git \ + ca-certificates \ + gnupg \ + && mkdir -p /etc/apt/keyrings \ + && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \ + | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \ + && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_22.x nodistro main" \ + > /etc/apt/sources.list.d/nodesource.list \ + && apt-get update && apt-get install -y --no-install-recommends nodejs \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# Install uv (Python package manager) +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +# Create non-root sandbox user (UID 1000) +RUN groupadd -g 1000 sandbox \ + && useradd -u 1000 -g 1000 -m -s /bin/bash sandbox + +# Set up workspace directory +RUN mkdir /workspace && chown sandbox:sandbox /workspace + +# Pre-configure uv and npm for the sandbox user +ENV UV_CACHE_DIR=/home/sandbox/.cache/uv +ENV NPM_CONFIG_CACHE=/home/sandbox/.cache/npm +RUN mkdir -p /home/sandbox/.cache/uv /home/sandbox/.cache/npm \ + && chown -R sandbox:sandbox /home/sandbox/.cache + +USER sandbox +WORKDIR /workspace + +CMD ["bash"] diff --git a/scripts/aidlc-evaluator/docker/sandbox/build.sh b/scripts/aidlc-evaluator/docker/sandbox/build.sh new file mode 100755 index 00000000..3d588049 --- /dev/null +++ b/scripts/aidlc-evaluator/docker/sandbox/build.sh @@ -0,0 +1,6 @@ 
+#!/usr/bin/env bash +# Build the aidlc-sandbox Docker image. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +docker build -t aidlc-sandbox:latest "$SCRIPT_DIR" diff --git a/scripts/aidlc-evaluator/docs/extension-hook-testing.md b/scripts/aidlc-evaluator/docs/extension-hook-testing.md new file mode 100644 index 00000000..d121a9a5 --- /dev/null +++ b/scripts/aidlc-evaluator/docs/extension-hook-testing.md @@ -0,0 +1,246 @@ +# Extension Hook Testing + +## Overview + +The extension hook testing framework validates the AIDLC workflow's progressive loading of rules extensions with opt-in questions. This feature allows testing how different extension configurations impact the quality and characteristics of generated outputs. + +## Background + +The AIDLC workflows repository includes an extension hook feature (branch: `feat/extension_hook_question_split`) that introduces opt-in questions for rules extensions. For example: + +- **Security Baseline**: `security-baseline.opt-in.md` - Security best practices extension +- **Performance**: Performance optimization guidelines +- **Observability**: Monitoring and logging patterns + +Each extension can be optionally loaded based on user responses to opt-in questions, allowing tailored AIDLC guidance. + +## Extension Test Script + +The `run_extension_test.py` script automates testing of different extension configurations by: + +1. Running the AIDLC evaluation multiple times with different opt-in configurations +2. Comparing results across configurations +3. Generating reports showing the impact of extension choices + +### Default Configurations + +Two default configurations are provided: + +1. **all-extensions**: All extension opt-ins answered "YES" + - Tests maximum AIDLC guidance with all extensions loaded + - Expected to produce more comprehensive outputs + +2. 
**no-extensions**: All extension opt-ins answered "NO" + - Tests baseline AIDLC guidance without extensions + - Provides a minimal baseline for comparison + +## Usage + +### Basic Usage + +Run a standard comparison (all yes vs all no): + +```bash +# Using master run.py (recommended) +python run.py ext-test --scenario sci-calc + +# Direct invocation +python scripts/run_extension_test.py --scenario sci-calc +``` + +### List Available Configurations + +```bash +python run.py ext-test --list-configs +``` + +Output: + +```text +Available extension test configurations: + + all-extensions All Extensions Enabled + All extension opt-ins answered YES + + no-extensions No Extensions + All extension opt-ins answered NO (baseline only) +``` + +### Specify Custom Configurations + +```bash +python run.py ext-test --scenario sci-calc \ + --configs all-extensions,no-extensions +``` + +### Override Rules Branch + +By default, the script uses the `feat/extension_hook_question_split` branch. To use a different branch: + +```bash +python run.py ext-test --scenario sci-calc \ + --rules-ref main +``` + +### Full Configuration + +```bash +python run.py ext-test --scenario sci-calc \ + --configs all-extensions,no-extensions \ + --rules-ref feat/extension_hook_question_split \ + --profile my-aws-profile \ + --region us-east-1 \ + --executor-model global.anthropic.claude-opus-4-6-v1 \ + --scorer-model global.anthropic.claude-opus-4-6-v1 +``` + +## Output Structure + +The extension test creates a structured output directory: + +```text +runs/<scenario>/extension-test/ +├── 20260309T151234-ext-all-extensions/ # Run with all extensions +│ ├── aidlc-docs/ # Generated docs +│ ├── workspace/ # Generated code +│ ├── run-meta.yaml # Run metadata +│ ├── extension-test-config.yaml # Extension config used +│ ├── test-results.yaml # Post-run test results +│ ├── quality-report.yaml # Code quality metrics +│ ├── contract-test-results.yaml # API contract test results +│ ├── qualitative-comparison.yaml # 
Semantic evaluation +│ └── extension-test.log # Run log +├── 20260309T153456-ext-no-extensions/ # Run without extensions +│ └── ... (same structure) +└── extension-comparison/ + ├── extension-test-summary.yaml # Comparison summary + └── extension-test-report.md # Human-readable report +``` + +## Extension Test Report + +The generated report includes: + +### Test Configuration Summary + +Shows each configuration that was tested: + +- Configuration name and description +- Pass/fail status +- Duration +- Output directory path + +### Detailed Comparison Instructions + +Provides commands to run detailed cross-run comparisons: + +```bash +python run.py compare --runs-dir runs/<scenario>/extension-test \ + --scenario <scenario> +``` + +### Analysis Guidance + +Suggests areas to examine: + +- Qualitative scores comparison +- Differences in generated artifacts +- Impact on code quality metrics +- Test pass rates +- Token usage differences + +## Interpreting Results + +### Expected Differences + +When comparing "all extensions" vs "no extensions", you may observe: + +1. **Code Quality** + - All extensions: More comprehensive error handling, security measures + - No extensions: Simpler, baseline implementation + +2. **Test Coverage** + - All extensions: Potentially more test cases + - No extensions: Basic test coverage + +3. **Documentation** + - All extensions: More detailed docs with security/performance notes + - No extensions: Essential documentation only + +4. **Token Usage** + - All extensions: Higher token consumption (more context loaded) + - No extensions: Lower token usage + +5. 
**Qualitative Scores** + - Compare alignment with golden baseline + - Extensions may improve specific dimensions + +## Integration with CI/CD + +The extension test can be integrated into continuous integration: + +```yaml +# Example GitLab CI job +extension-test: + script: + - python run.py ext-test --scenario sci-calc + artifacts: + paths: + - runs/sci-calc/extension-test/ + expire_in: 1 week +``` + +## Implementation Notes + +### Current Status + +The extension opt-in mechanism is still under active development. The test script includes placeholders for controlling opt-in answers. Once the mechanism is finalized, the script will be updated to support: + +- Environment variables (e.g., `AIDLC_EXTENSION_OPT_IN=yes|no`) +- Config file fields (e.g., `aidlc.extension_opt_in_default`) +- CLI flags (e.g., `--extension-opt-in yes|no|prompt`) +- Answer files (e.g., `--extension-answers answers.yaml`) + +### Extension Metadata + +Each test run includes an `extension-test-config.yaml` file documenting: + +- Which configuration was used +- The opt-in settings applied +- The rules reference (branch/tag/commit) +- Timestamp of the run + +## Future Enhancements + +Planned improvements to extension testing: + +1. **Custom Configuration Files** + - Define arbitrary extension combinations + - YAML-based configuration format + +2. **Extension-Specific Comparisons** + - Test individual extensions in isolation + - Measure incremental impact of each extension + +3. **Automated Regression Detection** + - Flag when extension changes degrade quality + - Track extension impact over time + +4. 
**Matrix Testing** + - Test all combinations of N extensions + - Generate comprehensive comparison matrices + +## References + +- [Extension Hook Feature Branch](https://github.com/awslabs/aidlc-workflows/tree/feat/extension_hook_question_split) +- [Security Baseline Opt-in Example](https://github.com/awslabs/aidlc-workflows/blob/feat/extension_hook_question_split/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.opt-in.md) +- [AIDLC Workflows Repository](https://github.com/awslabs/aidlc-workflows) + +## Support + +For questions or issues with extension testing: + +1. Check the extension test logs in the run output directory +2. Review the extension-test-config.yaml for configuration details +3. Compare against the extension-test-report.md for high-level analysis +4. File issues at the aidlc-regression repository diff --git a/scripts/aidlc-evaluator/docs/file-structure.md b/scripts/aidlc-evaluator/docs/file-structure.md new file mode 100644 index 00000000..1818575b --- /dev/null +++ b/scripts/aidlc-evaluator/docs/file-structure.md @@ -0,0 +1,132 @@ +# AI-DLC Evaluation Framework - File Structure + +```text +aidlc-regression/ +├── README.md # Project overview +├── VISION.md # Project vision and goals +├── FAQ.md # Frequently asked questions +├── OPERATING_PRINCIPLES.md # Decision-making guidelines +├── CONTRIBUTING.md # Contribution guidelines +├── pyproject.toml # Workspace configuration +├── uv.lock # Dependency lock file +│ +├── aidlc-runner/ # Execution framework (two-agent AIDLC runner) +│ ├── pyproject.toml +│ ├── config/ +│ │ └── default.yaml +│ ├── src/ +│ │ └── aidlc_runner/ +│ │ ├── cli.py # CLI entry point +│ │ ├── config.py # Configuration loading +│ │ ├── runner.py # Orchestration core +│ │ ├── metrics.py # NFR metrics collection +│ │ ├── post_run.py # Post-run test evaluation +│ │ ├── progress.py # Progress handlers +│ │ ├── agents/ # Executor and simulator agent factories +│ │ └── tools/ # Sandboxed file ops, rule 
loader, run_command +│ ├── tests/ +│ └── planning/ # Phase plans and backlog +│ +├── packages/ # Evaluation packages (monorepo) +│ ├── qualitative/ # Semantic evaluation +│ │ ├── pyproject.toml +│ │ ├── src/ +│ │ │ └── qualitative/ +│ │ │ ├── __init__.py +│ │ │ ├── comparator.py # Comparison orchestration +│ │ │ ├── document.py # Document loading and phase mapping +│ │ │ ├── scorer.py # Scoring protocol + implementations +│ │ │ └── models.py # Result data models +│ │ └── tests/ +│ │ +│ ├── quantitative/ # Code evaluation +│ │ ├── pyproject.toml +│ │ └── src/ +│ │ └── quantitative/ +│ │ ├── __init__.py +│ │ ├── linting.py # Ruff/eslint checks +│ │ ├── security.py # Semgrep/bandit integration +│ │ └── organization.py # Code duplication, structure +│ │ +│ ├── nonfunctional/ # NFR evaluation +│ │ ├── pyproject.toml +│ │ └── src/ +│ │ └── nonfunctional/ +│ │ ├── __init__.py +│ │ ├── tokens.py # Token consumption tracking +│ │ ├── timing.py # Execution time measurement +│ │ └── consistency.py # Cross-model consistency +│ │ +│ ├── reporting/ # Report generation +│ │ ├── pyproject.toml +│ │ └── src/ +│ │ └── reporting/ +│ │ ├── __init__.py +│ │ └── generate.py # Main report generator +│ │ +│ └── shared/ # Common utilities +│ ├── pyproject.toml +│ └── src/ +│ └── shared/ +│ └── __init__.py +│ +├── test_cases/ # Golden test cases (AIDLC inputs) +│ ├── instructions.md +│ └── sci-calc/ +│ ├── vision.md +│ └── tech-env.md +│ +├── runs/ # Evaluation run outputs +│ └── {timestamp}-{uuid}/ +│ ├── run-meta.yaml +│ ├── run-metrics.yaml +│ ├── test-results.yaml +│ ├── vision.md +│ ├── tech-env.md +│ ├── aidlc-docs/ # Generated AIDLC documentation +│ └── workspace/ # Generated application code +│ +├── overall_project/ # Broader project tenets and strategy +│ +└── docs/ # Additional documentation + └── writing-inputs/ # Guides for writing vision/tech-env docs +``` + +## Big Rocks → Package Mapping + +```text +1. Golden Test Case → test_cases/ +2. Execution Framework → aidlc-runner/ +3. 
Semantic Evaluation → packages/qualitative/ +4. Code Evaluation → packages/quantitative/ +5. NFR Evaluation → packages/nonfunctional/ +6. GitHub CI/CD → .github/workflows/ (planned) +``` + +## Package Dependencies + +```text +aidlc-runner (standalone — runs the AIDLC workflow and produces run folders) + +qualitative +├── shared +quantitative +├── shared +nonfunctional +├── shared +reporting +├── shared +├── qualitative (reads semantic evaluation results) +├── quantitative (reads code evaluation results) +└── nonfunctional (reads NFR results) +``` + +## Key Design Decisions + +1. **Monorepo with uv workspace:** Simplifies dependency management and cross-package development +2. **Python 3.13:** Latest stable Python with modern features +3. **Separate packages by evaluation type:** Clear separation of concerns, independent evolution +4. **aidlc-runner as execution engine:** Produces run folders that evaluation packages consume +5. **Golden test cases as versioned inputs:** Reproducible, curated baselines for consistent evaluation +6. **Shared utilities package:** Common code reused across all evaluation packages +7. **Reporting aggregates all:** Single entry point for generating comprehensive reports diff --git a/scripts/aidlc-evaluator/docs/ide-harness-design.md b/scripts/aidlc-evaluator/docs/ide-harness-design.md new file mode 100644 index 00000000..528ab261 --- /dev/null +++ b/scripts/aidlc-evaluator/docs/ide-harness-design.md @@ -0,0 +1,231 @@ +# IDE Test Harness — Architecture Design + +## Problem + +The AIDLC evaluation framework runs via a two-agent Strands swarm on Bedrock. To evaluate +IDE-based AI coding assistants, we need to drive the same AIDLC process through each IDE's +AI chat interface and capture the outputs in a format compatible with the existing evaluation +pipeline (stages 2–6). 
+ +## Input/Output Contract + +### Inputs (provided to each IDE adapter) + +- `vision.md` — the application vision document +- `tech-env.md` — technical environment specification +- AIDLC rules — the full AIDLC workflow rules (from `aidlc-workflows` repo) +- Initial prompt template — instructions for the IDE AI to follow the AIDLC process + +### Outputs (captured from each IDE adapter) + +- `aidlc-docs/` — generated AIDLC documentation (same structure as Strands runs) + - `inception/requirements/`, `inception/plans/`, `inception/application-design/` + - `construction/plans/`, `construction/build-and-test/` + - `aidlc-state.md`, `audit.md` +- `workspace/` — generated application source code and tests +- `run-meta.yaml` — run metadata (adapter-generated, matches collector schema) +- `test-results.yaml` — post-run test results (adapter runs tests after IDE completes) + +### Output Normalization + +IDE outputs will not match the Strands run folder layout exactly. Each adapter must +normalize its output to match the expected structure: + +```text +<run-folder>/ + run-meta.yaml # adapter generates this + run-metrics.yaml # adapter generates (tokens if available, timing always) + test-results.yaml # adapter runs tests post-generation + aidlc-docs/ # extracted/copied from IDE workspace + workspace/ # extracted/copied from IDE workspace +``` + +This allows `run_evaluation.py --evaluate-only <run-folder>/aidlc-docs` to score the output. 
+ +## Adapter Interface + +```python +from abc import ABC, abstractmethod +from dataclasses import dataclass +from pathlib import Path + +@dataclass +class AdapterConfig: + """Configuration for an IDE adapter run.""" + vision_path: Path # path to vision.md + tech_env_path: Path | None # path to tech-env.md (optional) + rules_path: Path # path to cloned aidlc-workflows rules + output_dir: Path # where to write normalized output + prompt_template: str # initial prompt to send to IDE AI + timeout_seconds: int = 7200 # max time to wait for IDE completion + +@dataclass +class AdapterResult: + """Result from an IDE adapter run.""" + success: bool + output_dir: Path + aidlc_docs_dir: Path | None + workspace_dir: Path | None + error: str | None = None + elapsed_seconds: float = 0.0 + token_estimate: int | None = None # if IDE reports token usage + +class IDEAdapter(ABC): + """Abstract base for IDE-specific automation adapters.""" + + @property + @abstractmethod + def name(self) -> str: + """Human-readable IDE name.""" + ... + + @abstractmethod + def check_prerequisites(self) -> tuple[bool, str]: + """Verify IDE is installed, configured, and accessible. + + Returns (ok, message). + """ + ... + + @abstractmethod + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the AIDLC process through the IDE and capture outputs. + + Steps: + 1. Set up a clean workspace directory + 2. Copy/symlink vision.md, tech-env.md, and rules into the workspace + 3. Launch IDE (or connect to running instance) + 4. Send the initial prompt to the IDE's AI chat + 5. Monitor for completion (all AIDLC phases done) + 6. Extract aidlc-docs/ and workspace/ from IDE output + 7. Generate run-meta.yaml with timing and adapter info + """ + ... +``` + +## Run Orchestration + +```text +run_ide_evaluation.py + ├── parse args (--ide <name>, --vision, --golden, etc.) 
+ ├── load adapter by name + ├── adapter.check_prerequisites() + ├── adapter.run(config) → AdapterResult + ├── post_run_tests(result.workspace_dir) → test-results.yaml + └── run_evaluation.py --evaluate-only <result.aidlc_docs_dir> --golden <golden> +``` + +The orchestrator script: + +1. Instantiates the adapter for the target IDE +2. Runs the adapter to generate outputs +3. Runs post-generation tests (install deps + pytest/npm test) +4. Invokes the existing evaluation pipeline in evaluate-only mode + +## Adapter Implementation Strategy + +### Category A: CLI-scriptable IDEs + +IDEs with CLI or API support for sending prompts and receiving responses. + +- **Cursor** — Has CLI (`cursor` command). May support `--chat` or similar. +- **Kiro** — AWS IDE, likely has Bedrock integration. Check for CLI. + +Approach: Subprocess invocation, parse stdout/stderr, monitor workspace for output files. + +### Category B: VS Code extension IDEs + +IDEs that run as VS Code extensions with no independent CLI. + +- **Cline** — VS Code extension. Must automate VS Code. +- **GitHub CoPilot** — VS Code extension. Chat panel automation needed. + +Approach: Use `@vscode/test-electron` or Playwright-based VS Code automation. + +### Category C: VS Code fork IDEs + +Standalone IDE forks of VS Code with built-in AI. + +- **Windsurf** — Codeium's fork. Electron app, VS Code internals. +- **Antigravity** — AI coding assistant. + +Approach: Electron automation via Playwright or native extension API. + +### Common Post-Run Steps (all adapters) + +1. Scan workspace for `aidlc-docs/` directory structure +2. Identify generated source code under `workspace/` or project root +3. Normalize file layout to match expected schema +4. Detect project type (Python/Node/Rust/Go) +5. Install dependencies and run tests +6. 
Generate `run-meta.yaml` and `run-metrics.yaml` + +## Package Structure + +```text +packages/ide-harness/ + pyproject.toml + src/ide_harness/ + __init__.py + adapter.py # Abstract adapter interface + AdapterConfig/Result + orchestrator.py # Run orchestration (invoke adapter + evaluation) + normalizer.py # Output normalization utilities + post_run.py # Reuse/adapt execution package's post-run test logic + prompt_template.py # Standard AIDLC prompt template for IDE AI + adapters/ + __init__.py + kiro.py + cursor.py + cline.py + copilot.py + windsurf.py + antigravity.py + tests/ + test_normalizer.py + test_orchestrator.py +``` + +## Prompt Template + +The prompt sent to each IDE AI must instruct it to follow the AIDLC process: + +```text +You are tasked with building an application following the AIDLC (AI Development +Life Cycle) process. The AIDLC rules are provided in the `aidlc-rules/` directory. + +Please read the vision document at `vision.md` and follow the complete AIDLC process: + +1. INCEPTION PHASE: + - Read the AIDLC rules for the inception phase + - Create requirements, plans, and application design documents + - Output these to `aidlc-docs/inception/` + +2. CONSTRUCTION PHASE: + - Read the AIDLC rules for the construction phase + - Create build plans and test instructions + - Generate the application source code and tests + - Output documents to `aidlc-docs/construction/` + - Output code to the project root (which becomes `workspace/`) + +3. Generate `aidlc-docs/aidlc-state.md` tracking your progress through each phase. + +Follow every AIDLC rule precisely. Do not skip phases or documents. +``` + +## Open Questions + +1. **Completion detection**: How to detect when the IDE AI has finished all AIDLC phases? + - File-based: watch for `aidlc-state.md` indicating construction complete + - Time-based: timeout after N minutes + - Prompt-based: ask the IDE AI to signal completion + +2. 
**Multi-turn interaction**: The AIDLC process involves human simulator handoffs. + For IDEs, should we: + - Send a single comprehensive prompt and let the IDE handle everything? + - Script multi-turn interaction (approve each phase transition)? + - Use a semi-automated approach (human monitors, scripts capture)? + +3. **Token tracking**: Most IDEs don't expose token usage. Options: + - Estimate from output size + - Capture Bedrock CloudWatch metrics (if IDE uses Bedrock) + - Accept "N/A" for token metrics on IDE runs diff --git a/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml b/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml new file mode 100644 index 00000000..c5e30217 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/pyproject.toml @@ -0,0 +1,20 @@ +[project] +name = "aidlc-cli-harness" +version = "0.1.0" +description = "CLI-based harness for testing AIDLC workflows via kiro-cli" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/cli_harness"] diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/__init__.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/__init__.py new file mode 100644 index 00000000..16e8785b --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/__init__.py @@ -0,0 +1,16 @@ +"""CLI-based harness for testing AIDLC workflows via kiro-cli. + +Provides a common adapter interface for driving AIDLC workflows through +CLI-based AI coding assistants and capturing evaluation-compatible output. 
+""" + +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter +from cli_harness.registry import get_adapter, list_adapters + +__all__ = [ + "AdapterConfig", + "AdapterResult", + "CLIAdapter", + "get_adapter", + "list_adapters", +] diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py new file mode 100644 index 00000000..95f0f0c0 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapter.py @@ -0,0 +1,69 @@ +"""Abstract adapter interface for CLI-based automation.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class AdapterConfig: + """Configuration for a CLI adapter run.""" + + vision_path: Path + output_dir: Path + rules_path: Path + tech_env_path: Path | None = None + prompt_template: str | None = None + model: str | None = None + aws_profile: str | None = None + timeout_seconds: int = 7200 # 2 hours max + + +@dataclass +class AdapterResult: + """Result from a CLI adapter run.""" + + success: bool + output_dir: Path + aidlc_docs_dir: Path | None = None + workspace_dir: Path | None = None + error: str | None = None + elapsed_seconds: float = 0.0 + token_estimate: int | None = None + extra: dict = field(default_factory=dict) + + +class CLIAdapter(ABC): + """Abstract base for CLI-specific automation adapters.""" + + @property + @abstractmethod + def name(self) -> str: + """Human-readable CLI tool name (e.g., 'kiro-cli').""" + ... + + @abstractmethod + def check_prerequisites(self) -> tuple[bool, str]: + """Verify the CLI tool is installed, configured, and accessible. + + Returns: + (ok, message) — True with a success message, or False with + a description of what's missing. + """ + ... 
+ + @abstractmethod + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the AIDLC process through the CLI tool and capture outputs. + + The implementation should: + 1. Set up a clean workspace with vision.md, tech-env.md, and rules + 2. Launch the CLI tool or connect to a running instance + 3. Send the AIDLC prompt to the CLI tool + 4. Monitor for completion (all AIDLC phases done) + 5. Extract aidlc-docs/ and workspace/ from the output + 6. Generate run-meta.yaml with timing and adapter info + """ + ... diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/__init__.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/__init__.py new file mode 100644 index 00000000..0b56009b --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/__init__.py @@ -0,0 +1 @@ +"""CLI-specific adapter implementations.""" diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py new file mode 100644 index 00000000..1bb25d94 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/claude_code.py @@ -0,0 +1,321 @@ +"""Claude Code adapter — drives AIDLC workflows via claude CLI with Bedrock. + +Uses ``claude`` CLI in print mode (``-p``) with ``--dangerously-skip-permissions`` +for fully headless, non-interactive execution. + +AIDLC rules are injected via ``--system-prompt`` or written to the workspace +as steering context. 
+""" + +from __future__ import annotations + +import json +import logging +import os +import selectors +import shutil +import subprocess +import sys +import time +from pathlib import Path + +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter +from cli_harness.normalizer import normalize_output +from cli_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + +_CLAUDE_CLI = "claude" + + +def _log(msg: str) -> None: + """Print a progress message to stderr.""" + print(f" [claude-code] {msg}", file=sys.stderr, flush=True) + + +def _parse_stream_result(log_path: Path) -> dict: + """Parse the final ``{"type":"result",...}`` line from stream-json output. + + Scans every JSON line in the log and keeps the last object whose + ``type`` field equals ``"result"``. Claude's stream-json format + emits a single summary result line at the end, so the last match + is the complete summary. + + Returns a dict with token usage, cost, timing, and model breakdown, + or an empty dict if no result line is found. + """ + result_data: dict = {} + try: + with open(log_path, "r", encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + except (json.JSONDecodeError, ValueError): + continue + if isinstance(obj, dict) and obj.get("type") == "result": + result_data = obj + except OSError: + pass + return result_data + + +class ClaudeCodeAdapter(CLIAdapter): + """Adapter for Claude Code CLI with Amazon Bedrock. + + Uses ``claude -p --dangerously-skip-permissions`` for headless execution. + """ + + def __init__(self, verbose: bool = False): + self.verbose = verbose + + @property + def name(self) -> str: + return "claude-code" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify that ``claude`` is on PATH.""" + if not shutil.which(_CLAUDE_CLI): + return False, ( + f"'{_CLAUDE_CLI}' not found in PATH. 
" + "Install Claude Code: npm install -g @anthropic-ai/claude-code" + ) + return True, f"Claude Code ('{_CLAUDE_CLI}') found" + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC workflow through Claude Code CLI. + + Runs directly in ``<output_dir>/workspace/`` — no temp dir or copy step. + """ + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=f"Prerequisites not met: {msg}", + ) + + start_time = time.monotonic() + + # Work directly in the final output location + config.output_dir.mkdir(parents=True, exist_ok=True) + workspace = config.output_dir / "workspace" + workspace.mkdir(exist_ok=True) + _log(f"Workspace: {workspace}") + + try: + # Copy input documents + shutil.copy2(config.vision_path, workspace / "vision.md") + _log(f"Copied vision: {config.vision_path}") + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace / "tech-env.md") + _log(f"Copied tech-env: {config.tech_env_path}") + + # Copy AIDLC rules into workspace + rules_dir = workspace / "aidlc-rules" + rules_dir.mkdir(parents=True, exist_ok=True) + rules_path = config.rules_path + if rules_path.is_dir(): + for rule_file in sorted(rules_path.rglob("*.md")): + rel = rule_file.relative_to(rules_path) + dst = rules_dir / rel + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(rule_file, dst) + _log(f"Copied AIDLC rules directory ({sum(1 for _ in rules_dir.rglob('*.md'))} files)") + else: + shutil.copy2(rules_path, rules_dir / rules_path.name) + _log(f"Copied AIDLC rules file: {rules_path.name}") + + # Build the prompt + prompt = config.prompt_template or render_prompt() + + # Build command — claude -p for non-interactive print mode + cmd = [ + _CLAUDE_CLI, + "-p", + "--dangerously-skip-permissions", + "--verbose", + "--output-format", "stream-json", + ] + if config.model: + cmd += ["--model", config.model] + + cmd.append(prompt) 
+ + _log(f"Running: claude -p --dangerously-skip-permissions ...") + _log(f"Model: {config.model or 'default'}") + _log(f"Prompt length: {len(prompt)} chars") + + # Set up environment with AWS_PROFILE if specified + env = os.environ.copy() + if config.aws_profile: + env["AWS_PROFILE"] = config.aws_profile + _log(f"AWS_PROFILE: {config.aws_profile}") + + # Run claude as a subprocess, streaming output + log_path = config.output_dir / "claude-session.log" + _log(f"Session log: {log_path}") + + with open(log_path, "w", encoding="utf-8") as log_file: + # nosec B603 - Executing user's Claude Code CLI with validated configuration + # nosemgrep: dangerous-subprocess-use-audit + process = subprocess.Popen( + cmd, + cwd=str(workspace), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) + + sel = selectors.DefaultSelector() + sel.register(process.stdout, selectors.EVENT_READ) + sel.register(process.stderr, selectors.EVENT_READ) + + open_streams = 2 + while open_streams > 0: + for key, _ in sel.select(timeout=1): + stream = key.fileobj + # read1() is only available on BufferedReader; + # fall back to os.read() on unbuffered streams. 
+ if hasattr(stream, "read1"): + chunk = stream.read1(4096) + else: + chunk = os.read(stream.fileno(), 4096) + if not chunk: + sel.unregister(key.fileobj) + open_streams -= 1 + continue + text = chunk.decode("utf-8", errors="replace") + log_file.write(text) + log_file.flush() + if self.verbose: + sys.stderr.write(text) + sys.stderr.flush() + + process.wait(timeout=config.timeout_seconds) + + elapsed_seconds = time.monotonic() - start_time + _log(f"\nclaude exited with code {process.returncode} after {elapsed_seconds:.0f}s") + + # Parse stream-json result for token usage and cost + stream_result = _parse_stream_result(log_path) + usage_extra: dict = {} + if stream_result: + usage = stream_result.get("usage", {}) + model_usage = stream_result.get("modelUsage", {}) + + input_tokens = usage.get("input_tokens", 0) + output_tokens = usage.get("output_tokens", 0) + cache_read = usage.get("cache_read_input_tokens", 0) + cache_write = usage.get("cache_creation_input_tokens", 0) + total_tokens = input_tokens + output_tokens + cache_read + cache_write + + num_turns = stream_result.get("num_turns", 0) + duration_ms = stream_result.get("duration_ms", 0) + duration_api_ms = stream_result.get("duration_api_ms", 0) + + # Build per-model info for model_params + model_params: dict = {} + for model_id, info in model_usage.items(): + model_params[model_id] = { + "input_tokens": info.get("inputTokens", 0), + "output_tokens": info.get("outputTokens", 0), + "cache_read_tokens": info.get("cacheReadInputTokens", 0), + "cache_write_tokens": info.get("cacheCreationInputTokens", 0), + "cost_usd": info.get("costUSD", 0.0), + } + + usage_extra = { + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + "cache_read_tokens": cache_read, + "cache_write_tokens": cache_write, + "total_cost_usd": stream_result.get("total_cost_usd", 0.0), + "duration_ms": duration_ms, + "duration_api_ms": duration_api_ms, + "num_turns": num_turns, + "session_id": 
stream_result.get("session_id", ""), + "model_usage": model_params, + "model": config.model or "", + } + _log( + f"Tokens: {input_tokens + cache_read + cache_write:,} in / " + f"{output_tokens:,} out | Cost: ${usage_extra['total_cost_usd']:.4f}" + ) + + # List workspace contents for debugging + _log("Workspace contents:") + for item in sorted(workspace.iterdir()): + _log(f" {item.name}/" if item.is_dir() else f" {item.name}") + + # Move aidlc-docs up from workspace/ to output_dir/ (sibling of workspace/) + src_docs = workspace / "aidlc-docs" + dst_docs = config.output_dir / "aidlc-docs" + if src_docs.is_dir(): + if dst_docs.exists(): + shutil.rmtree(dst_docs) + shutil.move(str(src_docs), str(dst_docs)) + + # Write run-meta.yaml and run-metrics.yaml + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name, + elapsed_seconds=elapsed_seconds, + token_usage=usage_extra if usage_extra else None, + ) + + has_docs = dst_docs.is_dir() and any(dst_docs.iterdir()) + + if process.returncode == 0 and has_docs: + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs, + workspace_dir=workspace, + elapsed_seconds=elapsed_seconds, + extra=usage_extra, + ) + + error_detail = ( + f"claude exited with code {process.returncode}, " + "no aidlc-docs/ output was produced." + if not has_docs + else f"claude exited with code {process.returncode} " + "but aidlc-docs/ may be incomplete." 
+ ) + return AdapterResult( + success=has_docs, + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs if has_docs else None, + workspace_dir=workspace, + error=error_detail if not has_docs else None, + elapsed_seconds=elapsed_seconds, + extra=usage_extra, + ) + + except subprocess.TimeoutExpired: + elapsed_seconds = time.monotonic() - start_time + process.kill() + _log(f"Timeout after {elapsed_seconds:.0f}s — killed process") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"claude timed out after {config.timeout_seconds}s", + elapsed_seconds=elapsed_seconds, + ) + + except Exception as exc: + elapsed_seconds = time.monotonic() - start_time + logger.exception("claude-code adapter run failed") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"claude-code adapter error: {exc}", + elapsed_seconds=elapsed_seconds, + ) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py new file mode 100644 index 00000000..ae0fc23e --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/adapters/kiro_cli.py @@ -0,0 +1,279 @@ +"""Kiro CLI adapter — drives AIDLC workflows via kiro-cli subprocess. + +Uses ``kiro-cli chat`` with ``--no-interactive`` and ``--trust-all-tools`` +flags for fully headless execution. + +AIDLC rules are injected through Kiro's steering-file mechanism by writing +them to ``.kiro/steering/aidlc-rules.md`` inside the workspace. 
+""" + +from __future__ import annotations + +import logging +import re +import shutil +import subprocess +import sys +import time +from pathlib import Path + +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter +from cli_harness.normalizer import normalize_output +from cli_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + +_KIRO_CLI = "kiro-cli" + +# Matches ANSI escape sequences: CSI sequences (\x1b[...X), OSC sequences (\x1b]...\x07), +# and simple two-byte escapes (\x1b followed by one char). +_ANSI_RE = re.compile(r"\x1b\[[0-9;?]*[A-Za-z]|\x1b\][^\x07]*\x07|\x1b.") + + +def _strip_ansi(text: str) -> str: + """Remove ANSI escape sequences from text.""" + return _ANSI_RE.sub("", text) + + +def _log(msg: str) -> None: + """Print a progress message to stderr.""" + print(f" [kiro-cli] {msg}", file=sys.stderr, flush=True) + + +class KiroCLIAdapter(CLIAdapter): + """Adapter for kiro-cli. + + Uses ``kiro-cli chat --no-interactive --trust-all-tools`` for headless + execution via subprocess. + """ + + def __init__(self, verbose: bool = False): + self.verbose = verbose + + @property + def name(self) -> str: + return "kiro-cli" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify that ``kiro-cli`` is on PATH.""" + if not shutil.which(_KIRO_CLI): + return False, ( + f"'{_KIRO_CLI}' not found in PATH. " + "Install the Kiro CLI first (https://kiro.dev)." + ) + return True, f"Kiro CLI ('{_KIRO_CLI}') found" + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC workflow through kiro-cli. + + Runs directly in ``<output_dir>/workspace/`` — no temp dir or copy step. 
+ """ + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=f"Prerequisites not met: {msg}", + ) + + start_time = time.monotonic() + + # Work directly in the final output location + config.output_dir.mkdir(parents=True, exist_ok=True) + workspace = config.output_dir / "workspace" + workspace.mkdir(exist_ok=True) + _log(f"Workspace: {workspace}") + + try: + # Copy input documents + shutil.copy2(config.vision_path, workspace / "vision.md") + _log(f"Copied vision: {config.vision_path}") + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace / "tech-env.md") + _log(f"Copied tech-env: {config.tech_env_path}") + + # Inject AIDLC rules via steering files + steering_dir = workspace / ".kiro" / "steering" + steering_dir.mkdir(parents=True, exist_ok=True) + + rules_path = config.rules_path + if rules_path.is_dir(): + parts = [] + for rule_file in sorted(rules_path.rglob("*.md")): + parts.append(rule_file.read_text(encoding="utf-8")) + rules_content = "\n\n".join(parts) + else: + rules_content = rules_path.read_text(encoding="utf-8") + + (steering_dir / "aidlc-rules.md").write_text( + rules_content, encoding="utf-8" + ) + _log(f"Injected AIDLC rules ({len(rules_content)} chars)") + + # Build the prompt + prompt = config.prompt_template or render_prompt() + + # Base command flags + base_flags = [ + "--no-interactive", + "--trust-all-tools", + ] + if config.model: + base_flags += ["--model", config.model] + + # Run kiro-cli in a loop to handle AIDLC review gates. + # The workflow pauses at gates (e.g. "Approve & Continue"). + # With --no-interactive, kiro-cli exits at each gate. + # We resume the session with an approval message each time. 
+ log_path = config.output_dir / "kiro-session.log" + _log(f"Session log: {log_path}") + + turn = 0 + max_turns = 20 # safety limit + total_rc = 0 + + with open(log_path, "w", encoding="utf-8") as log_file: + while turn < max_turns: + turn += 1 + + if turn == 1: + cmd = [_KIRO_CLI, "chat"] + base_flags + [prompt] + _log(f"Turn {turn}: initial prompt ({len(prompt)} chars)") + else: + approval = "Approve & Continue. Proceed to the next phase." + cmd = [_KIRO_CLI, "chat"] + base_flags + ["--resume", approval] + _log(f"Turn {turn}: resuming with approval") + + log_file.write(f"\n{'='*60}\n") + log_file.write(f"TURN {turn}\n") + log_file.write(f"{'='*60}\n") + log_file.flush() + + # nosec B603 - Executing user's Kiro CLI with validated configuration + # nosemgrep: dangerous-subprocess-use-audit + process = subprocess.Popen( + cmd, + cwd=str(workspace), + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, + ) + + for line in process.stdout: + log_file.write(_strip_ansi(line)) + log_file.flush() + if self.verbose: + sys.stderr.write(line) + sys.stderr.flush() + + remaining = config.timeout_seconds - (time.monotonic() - start_time) + if remaining <= 0: + process.kill() + _log(f"Timeout reached at turn {turn}") + break + process.wait(timeout=max(remaining, 10)) + total_rc = process.returncode + + _log(f"Turn {turn} exited with code {process.returncode}") + + # Check if aidlc-docs looks complete (has construction phase files) + aidlc_docs_dir = workspace / "aidlc-docs" + if aidlc_docs_dir.is_dir(): + has_construction = any( + (aidlc_docs_dir / "construction").rglob("*.md") + ) if (aidlc_docs_dir / "construction").is_dir() else False + file_count = sum(1 for _ in aidlc_docs_dir.rglob("*") if _.is_file()) + _log(f" aidlc-docs: {file_count} files, construction={'yes' if has_construction else 'no'}") + + if has_construction: + _log("Construction phase detected — workflow complete") + break + else: + _log(" aidlc-docs/ not yet created") + + elapsed = 
time.monotonic() - start_time + if elapsed >= config.timeout_seconds: + _log("Timeout reached") + break + + elapsed_seconds = time.monotonic() - start_time + _log(f"Completed {turn} turn(s) in {elapsed_seconds:.0f}s") + + # List workspace contents for debugging + _log("Workspace contents:") + for item in sorted(workspace.iterdir()): + _log(f" {item.name}/") if item.is_dir() else _log(f" {item.name}") + + # Move aidlc-docs up from workspace/ to output_dir/ (sibling of workspace/) + src_docs = workspace / "aidlc-docs" + dst_docs = config.output_dir / "aidlc-docs" + if src_docs.is_dir(): + if dst_docs.exists(): + shutil.rmtree(dst_docs) + shutil.move(str(src_docs), str(dst_docs)) + + # Write run-meta.yaml and run-metrics.yaml + # Kiro CLI does not expose token usage; pass turn count + # so downstream reports show "data unavailable" rather than + # silently reporting zeros that look like infinite efficiency. + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name, + elapsed_seconds=elapsed_seconds, + token_usage={ + "num_turns": turn, + "model": config.model or "", + }, + ) + + has_docs = dst_docs.is_dir() and any(dst_docs.iterdir()) + + if total_rc == 0 and has_docs: + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs, + workspace_dir=workspace, + elapsed_seconds=elapsed_seconds, + ) + + error_detail = ( + f"kiro-cli completed {turn} turn(s), " + "no aidlc-docs/ output was produced." + if not has_docs + else f"kiro-cli completed {turn} turn(s) " + "but aidlc-docs/ may be incomplete." 
+ ) + return AdapterResult( + success=has_docs, + output_dir=config.output_dir, + aidlc_docs_dir=dst_docs if has_docs else None, + workspace_dir=workspace, + error=error_detail if not has_docs else None, + elapsed_seconds=elapsed_seconds, + ) + + except subprocess.TimeoutExpired: + elapsed_seconds = time.monotonic() - start_time + process.kill() + _log(f"Timeout after {elapsed_seconds:.0f}s — killed process") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"kiro-cli timed out after {config.timeout_seconds}s", + elapsed_seconds=elapsed_seconds, + ) + + except Exception as exc: + elapsed_seconds = time.monotonic() - start_time + logger.exception("kiro-cli adapter run failed") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"kiro-cli adapter error: {exc}", + elapsed_seconds=elapsed_seconds, + ) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/normalizer.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/normalizer.py new file mode 100644 index 00000000..724a333a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/normalizer.py @@ -0,0 +1,234 @@ +"""Output normalization — map CLI workspace output to evaluation-compatible layout.""" + +from __future__ import annotations + +from datetime import UTC, datetime +from pathlib import Path + +import yaml + + +def normalize_output( + source_dir: Path, + output_dir: Path, + adapter_name: str, + model_hint: str = "", + elapsed_seconds: float = 0.0, + token_usage: dict | None = None, +) -> Path: + """Write run-meta.yaml and run-metrics.yaml for a completed CLI run. + + Adapters now work directly in ``<output_dir>/workspace/`` and move + ``aidlc-docs/`` up to ``<output_dir>/aidlc-docs/`` themselves, so this + function only generates the metadata files. + + Args: + source_dir: The workspace directory (``<output_dir>/workspace/``). 
+ output_dir: The run output directory. + adapter_name: Name of the CLI adapter (e.g., "kiro-cli"). + model_hint: Optional model identifier for run-meta. + elapsed_seconds: Wall clock time for the run. + token_usage: Optional dict with token counts, cost, and model breakdown + (from stream-json result parsing). + + Returns: + Path to the output_dir. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + dst_workspace = output_dir / "workspace" + dst_docs = output_dir / "aidlc-docs" + + # Generate run-meta.yaml + now = datetime.now(UTC).isoformat(timespec="seconds") + meta = { + "run_folder": str(output_dir), + "started_at": now, + "completed_at": now, + "status": "completed", + "execution_time_ms": int(elapsed_seconds * 1000), + "total_handoffs": 0, + "node_history": [], + "config": { + "executor_model": model_hint or f"cli:{adapter_name}", + "simulator_model": "human", + "aws_region": "", + }, + } + meta_path = output_dir / "run-meta.yaml" + with open(meta_path, "w", encoding="utf-8") as f: + yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False) + + # Generate run-metrics.yaml matching the execution pipeline schema + tu = token_usage or {} + input_tokens = tu.get("input_tokens", 0) + output_tokens = tu.get("output_tokens", 0) + cache_read = tu.get("cache_read_tokens", 0) + cache_write = tu.get("cache_write_tokens", 0) + total_tokens = tu.get("total_tokens", input_tokens + output_tokens + cache_read + cache_write) + num_turns = tu.get("num_turns", 0) + duration_ms = int(elapsed_seconds * 1000) + duration_api_ms = tu.get("duration_api_ms", 0) + model_id = tu.get("model", f"cli:{adapter_name}") + + # tokens section — CLI adapters have a single "executor" agent, no simulator + # No repeated context since CLI adapters are single-session + tokens_section: dict = { + "total": { + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + "cache_read_tokens": cache_read, + "cache_write_tokens": cache_write, + }, + 
"per_agent": { + "executor": { + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + "cache_read_tokens": cache_read, + "cache_write_tokens": cache_write, + }, + }, + "repeated_context": { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + "cache_read_tokens": 0, + "cache_write_tokens": 0, + }, + "api_total": { + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "total_tokens": total_tokens, + "cache_read_tokens": cache_read, + "cache_write_tokens": cache_write, + }, + } + + # timing section — CLI adapters run as a single executor session + # Emit one handoff entry for the whole run (not per-turn, to avoid noise) + handoffs = [{ + "handoff": 1, + "node_id": "executor", + "duration_ms": duration_api_ms or duration_ms, + }] + + timing_section: dict = { + "total_wall_clock_ms": duration_ms, + "handoffs": handoffs, + } + + # handoff_patterns section + handoff_patterns: dict = { + "total_handoffs": 1, + "sequence": ["executor"], + "per_agent": { + "executor": { + "turn_count": num_turns, + "total_duration_ms": duration_api_ms or duration_ms, + "avg_turn_duration_ms": (duration_api_ms or duration_ms) // max(num_turns, 1), + }, + }, + } + + # errors section + errors_section: dict = { + "throttle_events": 0, + "timeout_events": 0, + "failed_tool_calls": 0, + "model_error_events": 0, + "service_unavailable_events": 0, + "validation_error_events": 0, + "details": [], + } + + # model_params section + model_params_section: dict = { + "executor": { + "model_id": model_id, + "provider": "bedrock", + }, + "aws_region": "", + } + + metrics = { + "tokens": tokens_section, + "timing": timing_section, + "handoff_patterns": handoff_patterns, + "artifacts": { + "workspace": _count_workspace_files(dst_workspace), + "aidlc_docs": _count_doc_files(dst_docs) if dst_docs.is_dir() else {}, + }, + "errors": errors_section, + "model_params": model_params_section, + } + # Add cost if available (not in the reference 
schema but useful) + if tu.get("total_cost_usd"): + metrics["cost_usd"] = tu["total_cost_usd"] + metrics_path = output_dir / "run-metrics.yaml" + with open(metrics_path, "w", encoding="utf-8") as f: + yaml.safe_dump(metrics, f, default_flow_style=False, sort_keys=False) + + return output_dir + + +def _count_workspace_files(workspace: Path) -> dict: + """Count files in the workspace by category.""" + if not workspace.is_dir(): + return {} + + source_exts = {".py", ".js", ".ts", ".jsx", ".tsx", ".rs", ".go", ".java"} + test_patterns = {"test_", "_test.", ".test.", ".spec."} + config_exts = {".yaml", ".yml", ".json", ".toml", ".cfg", ".ini"} + + source = test = config = other = 0 + total_lines = 0 + + for f in workspace.rglob("*"): + if not f.is_file(): + continue + ext = f.suffix.lower() + name = f.name.lower() + + is_test = any(p in name for p in test_patterns) + if is_test and ext in source_exts: + test += 1 + elif ext in source_exts: + source += 1 + try: + total_lines += len(f.read_text(errors="replace").splitlines()) + except OSError: + pass + elif ext in config_exts: + config += 1 + else: + other += 1 + + return { + "source_files": source, + "test_files": test, + "config_files": config, + "other_files": other, + "total_files": source + test + config + other, + "total_lines_of_code": total_lines, + } + + +def _count_doc_files(docs_dir: Path) -> dict: + """Count AIDLC doc files by phase.""" + inception = construction = other = 0 + for f in docs_dir.rglob("*.md"): + rel = str(f.relative_to(docs_dir)) + if rel.startswith("inception"): + inception += 1 + elif rel.startswith("construction"): + construction += 1 + else: + other += 1 + return { + "inception_files": inception, + "construction_files": construction, + "other_files": other, + "total_files": inception + construction + other, + } diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py new file mode 
100644 index 00000000..a837a4f8 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/orchestrator.py @@ -0,0 +1,218 @@ +"""Run orchestration — invoke a CLI adapter then run the evaluation pipeline.""" + +from __future__ import annotations + +import shutil +import subprocess +import sys +from pathlib import Path + +import yaml + +from cli_harness.adapter import AdapterConfig, AdapterResult, CLIAdapter +from cli_harness.normalizer import normalize_output, _count_workspace_files, _count_doc_files + + +REPO_ROOT = Path(__file__).resolve().parents[4] # packages/cli-harness/src/cli_harness -> repo root + +# Input files that adapters copy into workspace for the CLI tool to read. +# These should be cleaned out after the run so workspace only has generated code. +_WORKSPACE_INPUT_FILES = {"vision.md", "tech-env.md"} +_WORKSPACE_INPUT_DIRS = {"aidlc-rules", ".kiro"} + + +def _normalize_run_folder( + output_dir: Path, + *, + vision_path: Path, + tech_env_path: Path | None, + adapter_name: str, + profile: str, + region: str, + rules_source: str, + rules_ref: str, + rules_repo: str, +) -> None: + """Normalize the run folder layout to match the execution pipeline. + + After the adapter runs, the workspace contains input files (vision.md, + tech-env.md, aidlc-rules/) mixed with generated code. This function: + + 1. Copies vision.md and tech-env.md to the run root (like the execution runner). + 2. Removes input files and adapter scaffolding from workspace/. + 3. Enriches run-meta.yaml with rules config and relative paths. + 4. Recounts workspace files and updates run-metrics.yaml artifacts section. + """ + workspace = output_dir / "workspace" + + # 1. Copy input docs to run root (matching execution runner layout) + if vision_path.is_file(): + shutil.copy2(vision_path, output_dir / "vision.md") + if tech_env_path and tech_env_path.is_file(): + shutil.copy2(tech_env_path, output_dir / "tech-env.md") + + # 2. 
Remove input files from workspace so it only has generated code + if workspace.is_dir(): + for name in _WORKSPACE_INPUT_FILES: + p = workspace / name + if p.is_file(): + p.unlink() + for name in _WORKSPACE_INPUT_DIRS: + p = workspace / name + if p.is_dir(): + shutil.rmtree(p) + + # 3. Enrich run-meta.yaml with rules config and relative run_folder + meta_path = output_dir / "run-meta.yaml" + if meta_path.exists(): + with open(meta_path, encoding="utf-8") as f: + meta = yaml.safe_load(f) or {} + else: + meta = {} + + # Use relative path like the normal run + try: + meta["run_folder"] = str(output_dir.relative_to(Path.cwd())) + except ValueError: + meta["run_folder"] = str(output_dir) + + # Add rules config fields that the normal run includes + config_section = meta.get("config", {}) + config_section["aws_profile"] = profile + config_section["rules_source"] = rules_source + config_section["rules_ref"] = rules_ref + config_section["rules_repo"] = rules_repo + meta["config"] = config_section + + with open(meta_path, "w", encoding="utf-8") as f: + yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False) + + # 4. 
Recount workspace files (now that inputs are removed) and update metrics + metrics_path = output_dir / "run-metrics.yaml" + if metrics_path.exists(): + with open(metrics_path, encoding="utf-8") as f: + metrics = yaml.safe_load(f) or {} + + dst_docs = output_dir / "aidlc-docs" + metrics.setdefault("artifacts", {})["workspace"] = _count_workspace_files(workspace) + if dst_docs.is_dir(): + metrics["artifacts"]["aidlc_docs"] = _count_doc_files(dst_docs) + + with open(metrics_path, "w", encoding="utf-8") as f: + yaml.safe_dump(metrics, f, default_flow_style=False, sort_keys=False) + + +def run_cli_evaluation( + adapter: CLIAdapter, + vision_path: Path, + output_dir: Path, + golden_docs: Path, + rules_path: Path, + tech_env_path: Path | None = None, + openapi_path: Path | None = None, + baseline_path: Path | None = None, + profile: str | None = None, + region: str | None = None, + scorer_model: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0", + report_format: str = "both", + prompt_template: str | None = None, + model: str | None = None, + timeout_seconds: int = 7200, + rules_source: str = "git", + rules_ref: str = "main", + rules_repo: str = "https://github.com/awslabs/aidlc-workflows.git", +) -> tuple[AdapterResult, int]: + """Run the full CLI evaluation pipeline. + + Steps: + 1. Check adapter prerequisites + 2. Run the adapter to generate AIDLC outputs + 3. Normalize the run folder layout (clean workspace, copy inputs to root) + 4. Invoke run_evaluation.py --evaluate-only to score the output + + Returns: + (adapter_result, eval_exit_code) + """ + # 1. Check prerequisites + ok, msg = adapter.check_prerequisites() + if not ok: + print(f"[ERROR] {adapter.name} prerequisites not met: {msg}", file=sys.stderr) + return AdapterResult( + success=False, + output_dir=output_dir, + error=f"Prerequisites not met: {msg}", + ), 1 + + print(f"[OK] {adapter.name} prerequisites met: {msg}") + + # 2. 
Run the adapter + config = AdapterConfig( + vision_path=vision_path, + tech_env_path=tech_env_path, + rules_path=rules_path, + output_dir=output_dir, + prompt_template=prompt_template, + model=model, + aws_profile=profile, + timeout_seconds=timeout_seconds, + ) + + print(f"\nRunning {adapter.name} adapter...") + result = adapter.run(config) + + if not result.success: + print(f"[FAILED] {adapter.name}: {result.error}", file=sys.stderr) + return result, 1 + + print(f"[OK] {adapter.name} completed in {result.elapsed_seconds:.0f}s") + + # 3. Normalize run folder layout to match the execution pipeline + _normalize_run_folder( + output_dir, + vision_path=vision_path, + tech_env_path=tech_env_path, + adapter_name=adapter.name, + profile=profile, + region=region, + rules_source=rules_source, + rules_ref=rules_ref, + rules_repo=rules_repo, + ) + + # 4. Verify aidlc-docs were produced + aidlc_docs = result.aidlc_docs_dir or output_dir / "aidlc-docs" + if not aidlc_docs.is_dir(): + print(f"[ERROR] No aidlc-docs directory found at {aidlc_docs}", file=sys.stderr) + result.success = False + result.error = "No aidlc-docs produced" + return result, 1 + + doc_files = [f for f in aidlc_docs.rglob("*.md") + if f.name not in ("aidlc-state.md", "audit.md")] + if not doc_files: + print("[WARN] aidlc-docs exists but contains no substantive documents") + + # 5. 
Run evaluation pipeline (stages 2-6) + eval_cmd = [ + sys.executable, str(REPO_ROOT / "run_evaluation.py"), + "--evaluate-only", str(aidlc_docs), + "--golden", str(golden_docs), + "--results", str(output_dir / "qualitative-comparison.yaml"), + "--scorer-model", scorer_model, + "--report-format", report_format, + ] + if profile: + eval_cmd += ["--profile", profile] + if region: + eval_cmd += ["--region", region] + if openapi_path and openapi_path.is_file(): + eval_cmd += ["--openapi", str(openapi_path)] + if baseline_path and baseline_path.is_file(): + eval_cmd += ["--baseline", str(baseline_path)] + + print(f"\nRunning evaluation pipeline on {adapter.name} output...") + # nosec B603 - Executing trusted framework evaluation script with validated args + # nosemgrep: dangerous-subprocess-use-audit + eval_result = subprocess.run(eval_cmd) + + return result, eval_result.returncode diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py new file mode 100644 index 00000000..3ef18884 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/prompt_template.py @@ -0,0 +1,123 @@ +"""Standard AIDLC prompt template for CLI-based AI assistants. + +Adapted from the EXECUTOR_SYSTEM_PROMPT used by the Strands-based execution +pipeline, but tailored for single-agent CLI tools (kiro-cli, claude-code) +that don't have a separate simulator agent for handoffs. +""" + +EXECUTOR_SYSTEM_PROMPT = """\ +You are the AIDLC Executor. Your job is to drive the COMPLETE AI-DLC (AI-Driven \ +Development Life Cycle) workflow for a software project from start to finish, including \ +generating all application code. + +## CRITICAL RULE: YOU MUST COMPLETE THE ENTIRE WORKFLOW + +You must execute ALL phases and ALL stages of the AIDLC workflow. You are NOT done until \ +the Construction phase is complete and working code has been generated. 
\ +NEVER stop in the middle of the workflow. + +## Input documents + +- Vision document: `vision.md` +- Technical environment: `tech-env.md` (if present) +- AIDLC rules: `aidlc-rules/` directory — read each rule file BEFORE executing its stage + +## Complete stage sequence + +Execute these stages in order. Load each rule file BEFORE executing its stage. + +### INCEPTION PHASE — "What to build and why" + +1. **Workspace Detection** (ALWAYS) — read `aidlc-rules/inception/workspace-detection.md` + - Scan the workspace, classify as greenfield or brownfield + - Proceed immediately to next stage + +2. **Reverse Engineering** (CONDITIONAL: brownfield only) \ +— read `aidlc-rules/inception/reverse-engineering.md` + - Skip for greenfield projects + +3. **Requirements Analysis** (ALWAYS) — read `aidlc-rules/inception/requirements-analysis.md` + - Read the vision file, analyze requirements + - Generate `aidlc-docs/inception/requirements/requirements.md` + - Generate `aidlc-docs/inception/requirements/requirement-verification-questions.md` + - Self-approve and CONTINUE to next stage + +4. **User Stories** (CONDITIONAL) — read `aidlc-rules/inception/user-stories.md` + - Generate user stories if project complexity warrants it + +5. **Workflow Planning** (ALWAYS) — read `aidlc-rules/inception/workflow-planning.md` + - Create `aidlc-docs/inception/plans/execution-plan.md` + - Create `aidlc-docs/inception/plans/application-design-plan.md` + +6. **Application Design** (CONDITIONAL) — read `aidlc-rules/inception/application-design.md` + - Create `aidlc-docs/inception/application-design/components.md` + - Create `aidlc-docs/inception/application-design/component-methods.md` + - Create `aidlc-docs/inception/application-design/component-dependency.md` + - Create `aidlc-docs/inception/application-design/services.md` + +7. 
**Units Generation** (CONDITIONAL) — read `aidlc-rules/inception/units-generation.md` + - Break system into units of work + +### CONSTRUCTION PHASE — "How to build it" + +For each unit of work (or the whole project if no units were defined): + +8. **Functional Design** (CONDITIONAL) — read `aidlc-rules/construction/functional-design.md` + +9. **NFR Requirements** (CONDITIONAL) — read `aidlc-rules/construction/nfr-requirements.md` + +10. **NFR Design** (CONDITIONAL) — read `aidlc-rules/construction/nfr-design.md` + +11. **Infrastructure Design** (CONDITIONAL) — read `aidlc-rules/construction/infrastructure-design.md` + +12. **Code Generation** (ALWAYS) — read `aidlc-rules/construction/code-generation.md` + - Create a detailed code generation plan in `aidlc-docs/construction/plans/` + - Generate ALL application code with proper package structure \ +(src/, tests/, pyproject.toml, etc.) + - Write every source file, test file, and configuration file + - Write COMPLETE, WORKING files — not stubs or placeholders + +13. 
**Build and Test** (ALWAYS) — read `aidlc-rules/construction/build-and-test.md` + - Create `aidlc-docs/construction/build-and-test/build-instructions.md` + - Create `aidlc-docs/construction/build-and-test/unit-test-instructions.md` + - Create `aidlc-docs/construction/build-and-test/integration-test-instructions.md` + - Install dependencies and run the test suite + - If tests fail, read the error output, fix the code, and re-run until tests pass + - Create `aidlc-docs/construction/build-and-test/build-and-test-summary.md` + +## File organization + +- All documentation and workflow artifacts: `aidlc-docs/` +- All generated application code: project root (alongside vision.md) +- NEVER mix documentation and code locations + +## Tracking + +- Create and maintain `aidlc-docs/aidlc-state.md` tracking progress through each phase +- Append to `aidlc-docs/audit.md` with ISO 8601 timestamps for each action + +## Important rules + +- Since you are running autonomously without a human reviewer, self-approve all stages \ +and continue immediately to the next one. Do NOT pause or wait for approval. +- Read the relevant rule file BEFORE starting each stage. +- Read common rules as needed (e.g. `aidlc-rules/common/content-validation.md` before \ +writing files, `aidlc-rules/common/question-format-guide.md` before creating questions). +- For CONDITIONAL stages, evaluate based on project scope and skip with justification if \ +not needed, but always continue to the next stage. +- When generating code, write COMPLETE, WORKING files — not stubs or placeholders. +- Generate complete, working code with full test coverage. +""" + + +def render_prompt(vision_path: str = "vision.md", tech_env_path: str = "tech-env.md") -> str: + r"""Render the AIDLC prompt with customized file paths. + + Only replaces backtick-delimited references (``\`vision.md\```) so that + prose mentions like "alongside vision.md" are left intact. 
+ """ + return ( + EXECUTOR_SYSTEM_PROMPT + .replace("`vision.md`", f"`{vision_path}`") + .replace("`tech-env.md`", f"`{tech_env_path}`") + ) diff --git a/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py new file mode 100644 index 00000000..025d6374 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/src/cli_harness/registry.py @@ -0,0 +1,39 @@ +"""Adapter registry — discover and instantiate CLI adapters by name.""" + +from __future__ import annotations + +from cli_harness.adapter import CLIAdapter + + +# Lazy imports to avoid pulling in adapter-specific deps at import time +_ADAPTER_MAP: dict[str, str] = { + "kiro-cli": "cli_harness.adapters.kiro_cli.KiroCLIAdapter", + "claude-code": "cli_harness.adapters.claude_code.ClaudeCodeAdapter", +} + + +def list_adapters() -> list[str]: + """Return sorted list of registered adapter names.""" + return sorted(_ADAPTER_MAP.keys()) + + +def get_adapter(name: str) -> CLIAdapter: + """Instantiate an adapter by name. + + Raises KeyError if the adapter is not registered. + Raises ImportError if the adapter module cannot be loaded. + """ + key = name.lower().strip() + if key not in _ADAPTER_MAP: + raise KeyError( + f"Unknown adapter '{name}'. Available: {', '.join(list_adapters())}" + ) + + fqn = _ADAPTER_MAP[key] + module_path, class_name = fqn.rsplit(".", 1) + + import importlib + # nosemgrep: non-literal-import - module_path validated against _ADAPTER_MAP whitelist + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + return cls() diff --git a/scripts/aidlc-evaluator/packages/cli-harness/tests/test_normalizer.py b/scripts/aidlc-evaluator/packages/cli-harness/tests/test_normalizer.py new file mode 100644 index 00000000..2ade5b32 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/cli-harness/tests/test_normalizer.py @@ -0,0 +1,94 @@ +"""Tests for output normalization. 
+ +The normalizer now expects adapters to work directly in ``<output_dir>/workspace/`` +and move ``aidlc-docs/`` up to ``<output_dir>/aidlc-docs/`` themselves. +The normalizer only writes ``run-meta.yaml`` and ``run-metrics.yaml``. +""" + +from pathlib import Path + +import yaml + +from cli_harness.normalizer import normalize_output + + +def test_normalize_creates_run_meta(tmp_path: Path) -> None: + """normalize_output should create run-meta.yaml.""" + output = tmp_path / "output" + workspace = output / "workspace" + workspace.mkdir(parents=True) + + normalize_output(workspace, output, adapter_name="test", elapsed_seconds=120.5) + + meta_path = output / "run-meta.yaml" + assert meta_path.exists() + meta = yaml.safe_load(meta_path.read_text()) + assert meta["status"] == "completed" + assert meta["execution_time_ms"] == 120500 + assert meta["config"]["executor_model"] == "cli:test" + + +def test_normalize_creates_metrics_with_workspace(tmp_path: Path) -> None: + """normalize_output should create run-metrics.yaml counting workspace files.""" + output = tmp_path / "output" + workspace = output / "workspace" + workspace.mkdir(parents=True) + (workspace / "app.py").write_text("x = 1\ny = 2\n") + (workspace / "tests").mkdir() + (workspace / "tests" / "test_app.py").write_text("def test_it(): pass") + + normalize_output(workspace, output, adapter_name="test", elapsed_seconds=60) + + metrics_path = output / "run-metrics.yaml" + assert metrics_path.exists() + metrics = yaml.safe_load(metrics_path.read_text()) + assert metrics["timing"]["total_wall_clock_ms"] == 60000 + assert metrics["artifacts"]["workspace"]["source_files"] == 1 + assert metrics["artifacts"]["workspace"]["test_files"] == 1 + + +def test_normalize_counts_aidlc_docs(tmp_path: Path) -> None: + """normalize_output should count aidlc-docs when present at output level.""" + output = tmp_path / "output" + workspace = output / "workspace" + workspace.mkdir(parents=True) + # aidlc-docs already moved to output_dir by 
the adapter + (output / "aidlc-docs" / "inception").mkdir(parents=True) + (output / "aidlc-docs" / "inception" / "requirements.md").write_text("# Reqs") + (output / "aidlc-docs" / "construction").mkdir(parents=True) + (output / "aidlc-docs" / "construction" / "plan.md").write_text("# Plan") + + normalize_output(workspace, output, adapter_name="test") + + metrics = yaml.safe_load((output / "run-metrics.yaml").read_text()) + assert metrics["artifacts"]["aidlc_docs"]["inception_files"] == 1 + assert metrics["artifacts"]["aidlc_docs"]["construction_files"] == 1 + assert metrics["artifacts"]["aidlc_docs"]["total_files"] == 2 + + +def test_normalize_with_token_usage(tmp_path: Path) -> None: + """normalize_output should populate token data from token_usage dict.""" + output = tmp_path / "output" + workspace = output / "workspace" + workspace.mkdir(parents=True) + + token_usage = { + "input_tokens": 1000, + "output_tokens": 500, + "total_tokens": 1500, + "cache_read_tokens": 0, + "cache_write_tokens": 0, + "num_turns": 5, + "duration_api_ms": 50000, + "model": "test-model", + } + + normalize_output(workspace, output, adapter_name="test", + elapsed_seconds=60, token_usage=token_usage) + + metrics = yaml.safe_load((output / "run-metrics.yaml").read_text()) + assert metrics["tokens"]["total"]["input_tokens"] == 1000 + assert metrics["tokens"]["total"]["output_tokens"] == 500 + assert metrics["tokens"]["per_agent"]["executor"]["total_tokens"] == 1500 + assert metrics["handoff_patterns"]["per_agent"]["executor"]["turn_count"] == 5 + assert metrics["model_params"]["executor"]["model_id"] == "test-model" diff --git a/scripts/aidlc-evaluator/packages/contracttest/pyproject.toml b/scripts/aidlc-evaluator/packages/contracttest/pyproject.toml new file mode 100644 index 00000000..41a02682 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/pyproject.toml @@ -0,0 +1,16 @@ +[project] +name = "aidlc-contracttest" +version = "0.1.0" +description = "API contract test harness 
— validate generated code against an API spec" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", + "httpx>=0.28.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/contracttest"] diff --git a/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/__init__.py b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/__init__.py new file mode 100644 index 00000000..dcdfe645 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/__init__.py @@ -0,0 +1,27 @@ +"""API contract test harness — validate generated code against an OpenAPI spec. + +The OpenAPI spec is a first-class project input (alongside vision.md and +tech-env.md). Each operation may include ``x-test-cases`` extensions that +carry request bodies and expected responses. + +Usage: + from contracttest import load_spec, run_contract_tests, write_results + spec = load_spec(Path("openapi.yaml")) + results = run_contract_tests(spec, workspace=Path("runs/.../workspace")) + write_results(results, Path("contract-test-results.yaml")) + +CLI: + python -m contracttest run <workspace> --openapi openapi.yaml [-o results.yaml] +""" + +from contracttest.runner import ContractTestResults, run_contract_tests, write_results, print_results +from contracttest.spec import ContractSpec, load_spec + +__all__ = [ + "ContractSpec", + "ContractTestResults", + "load_spec", + "print_results", + "run_contract_tests", + "write_results", +] diff --git a/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/__main__.py b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/__main__.py new file mode 100644 index 00000000..041925d9 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/__main__.py @@ -0,0 +1,89 @@ +"""CLI entry point: python -m contracttest run <workspace> --openapi <file>.""" + +from __future__ import annotations + +import argparse 
+import io +import sys +from pathlib import Path + +if sys.stdout.encoding and sys.stdout.encoding.lower().replace("-", "") != "utf8": + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace") + +from contracttest.runner import print_results, run_contract_tests, write_results +from contracttest.spec import load_spec + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="contracttest", + description="Run API contract tests derived from an OpenAPI specification", + ) + sub = parser.add_subparsers(dest="command") + + run_cmd = sub.add_parser("run", help="Run contract tests") + run_cmd.add_argument("workspace", type=Path, help="Path to workspace directory") + run_cmd.add_argument( + "--openapi", type=Path, required=True, + help="Path to OpenAPI 3.x YAML spec with x-test-cases extensions", + ) + run_cmd.add_argument( + "--output", "-o", type=Path, default=None, + help="Write contract-test-results.yaml to this path", + ) + + # Sandbox options + sandbox_group = run_cmd.add_mutually_exclusive_group() + sandbox_group.add_argument( + "--sandbox", action="store_true", default=True, + help="Run the generated server inside a Docker container (default)", + ) + sandbox_group.add_argument( + "--no-sandbox", action="store_false", dest="sandbox", + help="Run the generated server directly on the host", + ) + run_cmd.add_argument( + "--sandbox-image", default="aidlc-sandbox:latest", + help="Docker image for sandbox execution (default: aidlc-sandbox:latest)", + ) + + args = parser.parse_args() + if args.command != "run": + parser.print_help() + sys.exit(1) + + if not args.workspace.is_dir(): + print(f"Error: workspace not found: {args.workspace}", file=sys.stderr) + sys.exit(1) + if not args.openapi.is_file(): + print(f"Error: OpenAPI spec not found: {args.openapi}", file=sys.stderr) + sys.exit(1) + + spec = load_spec(args.openapi) + print(f"OpenAPI spec: 
{args.openapi}") + if spec.title: + print(f"API title: {spec.title} v{spec.version}") + print(f"Workspace: {args.workspace}") + print(f"App module: {spec.app.module}") + print(f"Test cases: {len(spec.test_cases)}") + print(f"Sandbox: {'enabled' if args.sandbox else 'disabled'}") + + results = run_contract_tests( + spec, + workspace=args.workspace, + use_sandbox=args.sandbox, + sandbox_image=args.sandbox_image, + ) + + if args.output: + write_results(results, args.output) + print(f"\nResults written to: {args.output}") + + print_results(results) + + sys.exit(0 if results.failed == 0 and results.errors == 0 else 1) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/runner.py b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/runner.py new file mode 100644 index 00000000..4198efa2 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/runner.py @@ -0,0 +1,231 @@ +"""Execute contract tests against a running server and produce results.""" + +from __future__ import annotations + +import math +import time +from dataclasses import asdict, dataclass, field +from pathlib import Path +from typing import Any + +import httpx +import yaml + +from contracttest.server import ServerProcess +from contracttest.spec import ContractSpec, TestCase + + +@dataclass +class CaseResult: + name: str + path: str + method: str + passed: bool + expected_status: int + actual_status: int | None = None + failures: list[str] = field(default_factory=list) + latency_ms: float | None = None + error: str | None = None + skipped: bool = False + + +@dataclass +class ContractTestResults: + total: int = 0 + passed: int = 0 + failed: int = 0 + errors: int = 0 + skipped: int = 0 + cases: list[CaseResult] = field(default_factory=list) + server_started: bool = False + server_error: str | None = None + + +def _match_body(expected: dict[str, Any], actual: dict[str, Any], prefix: str = "") -> list[str]: + 
"""Recursively check that expected keys/values exist in actual.
+
+    Only checks keys present in expected — extra keys in actual are fine.
+    Numeric values compare via math.isclose(rel_tol=1e-6, abs_tol=1e-9).
+    """
+    failures: list[str] = []
+    for key, exp_val in expected.items():
+        path = f"{prefix}.{key}" if prefix else key
+        if key not in actual:
+            failures.append(f"missing key '{path}'")
+            continue
+        act_val = actual[key]
+        if isinstance(exp_val, dict) and isinstance(act_val, dict):
+            failures.extend(_match_body(exp_val, act_val, prefix=path))
+        elif isinstance(exp_val, (int, float)) and isinstance(act_val, (int, float)):
+            if not math.isclose(exp_val, act_val, rel_tol=1e-6, abs_tol=1e-9):
+                failures.append(f"'{path}': expected {exp_val}, got {act_val}")
+        elif exp_val != act_val:
+            failures.append(f"'{path}': expected {exp_val!r}, got {act_val!r}")
+    return failures
+
+
+def _run_case(client: httpx.Client, base_url: str, case: TestCase) -> CaseResult:
+    """Execute a single test case and return the result."""
+    url = f"{base_url}{case.path}"
+    start = time.monotonic()
+    try:
+        if case.method == "GET":
+            resp = client.get(url, timeout=5.0)
+        elif case.method == "POST":
+            resp = client.post(url, json=case.body, timeout=5.0)
+        else:
+            resp = client.request(case.method, url, json=case.body, timeout=5.0)
+
+        latency = (time.monotonic() - start) * 1000
+    except (httpx.ConnectError, httpx.ReadError, httpx.TimeoutException) as e:
+        return CaseResult(
+            name=case.name, path=case.path, method=case.method,
+            passed=False, expected_status=case.expected_status,
+            error=str(e),
+        )
+
+    failures: list[str] = []
+
+    if resp.status_code != case.expected_status:
+        failures.append(
+            f"status: expected {case.expected_status}, got {resp.status_code}"
+        )
+
+    if case.expected_body is not None:
+        try:
+            actual_body = resp.json()
+        except Exception:
+            failures.append("response is not valid JSON")
+            actual_body = None
+        if actual_body is not None:
+            
failures.extend(_match_body(case.expected_body, actual_body)) + + return CaseResult( + name=case.name, + path=case.path, + method=case.method, + passed=len(failures) == 0, + expected_status=case.expected_status, + actual_status=resp.status_code, + failures=failures, + latency_ms=round(latency, 1), + ) + + +MAX_CONSECUTIVE_ERRORS = 3 + + +def run_contract_tests( + spec: ContractSpec, + workspace: Path, + use_sandbox: bool = False, + sandbox_image: str = "aidlc-sandbox:latest", + sandbox_memory: str = "2g", + sandbox_cpus: int = 2, +) -> ContractTestResults: + """Start the server, execute all test cases, and return results. + + When *use_sandbox* is ``True``, the generated server runs inside a + Docker container while the test client remains on the host. + + Aborts early if the server process dies or if MAX_CONSECUTIVE_ERRORS + consecutive requests fail with connection/timeout errors. + """ + results = ContractTestResults(total=len(spec.test_cases)) + + try: + server = ServerProcess( + workspace=workspace, + module=spec.app.module, + port=spec.app.port, + startup_timeout=spec.app.startup_timeout, + use_sandbox=use_sandbox, + sandbox_image=sandbox_image, + sandbox_memory=sandbox_memory, + sandbox_cpus=sandbox_cpus, + ) + except Exception as e: + results.server_error = str(e) + results.errors = results.total + return results + + try: + with server: + results.server_started = True + consecutive_errors = 0 + with httpx.Client(follow_redirects=True) as client: + for case in spec.test_cases: + if case.skip: + results.skipped += 1 + results.cases.append(CaseResult( + name=case.name, path=case.path, method=case.method, + passed=False, expected_status=case.expected_status, + skipped=True, + )) + continue + + # nosemgrep: is-function-without-parentheses - is_running is a @property, not a method + if not server.is_running: + remaining = results.total - results.passed - results.failed - results.errors - results.skipped + results.server_error = ( + f"server died " + f"after 
{results.passed + results.failed + results.errors} tests; " + f"{remaining} skipped" + ) + results.errors += remaining + break + + result = _run_case(client, server.base_url, case) + results.cases.append(result) + if result.error: + results.errors += 1 + consecutive_errors += 1 + if consecutive_errors >= MAX_CONSECUTIVE_ERRORS: + remaining = results.total - results.passed - results.failed - results.errors - results.skipped + results.server_error = ( + f"server unresponsive ({consecutive_errors} consecutive errors); " + f"{remaining} tests skipped" + ) + results.errors += remaining + break + else: + consecutive_errors = 0 + if result.passed: + results.passed += 1 + else: + results.failed += 1 + except (RuntimeError, TimeoutError) as e: + results.server_error = str(e) + results.errors = results.total - results.passed - results.failed - results.skipped + + return results + + +def write_results(results: ContractTestResults, output_path: Path) -> None: + """Write contract test results to YAML.""" + data = asdict(results) + with open(output_path, "w", encoding="utf-8") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False) + + +def print_results(results: ContractTestResults) -> None: + """Print a human-readable summary.""" + if results.server_error: + print(f"\n Server error: {results.server_error}") + print(f"\n Total: {results.total} Passed: {results.passed} " + f"Failed: {results.failed} Errors: {results.errors} " + f"Skipped: {results.skipped}") + + for case in results.cases: + if case.skipped: + mark = "SKIP" + elif case.passed: + mark = "PASS" + else: + mark = "FAIL" + status_info = f"[{case.actual_status}]" if case.actual_status else "[---]" + print(f" {mark} {case.method} {case.path} {status_info} — {case.name}") + if case.error: + print(f" error: {case.error}") + for f in case.failures: + print(f" {f}") diff --git a/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py 
b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py new file mode 100644 index 00000000..7bca45ff --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/server.py @@ -0,0 +1,277 @@ +"""Launch and manage the generated app as a subprocess (or Docker container).""" + +from __future__ import annotations + +import os +import shutil +import signal +import socket +import subprocess +import sys +import time +from pathlib import Path + +import httpx + +from shared.sandbox import ( + is_docker_available, + sandbox_logs, + sandbox_is_running, + sandbox_run, + sandbox_run_detached, + sandbox_stop, +) + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +class ServerProcess: + """Manages a uvicorn subprocess for contract testing. + + When *use_sandbox* is ``True`` and Docker is available the server + runs inside a container with the workspace bind-mounted. The test + client on the host connects via a port-forwarded localhost port. 
+ """ + + def __init__( + self, + workspace: Path, + module: str, + port: int = 0, + startup_timeout: int = 15, + use_sandbox: bool = False, + sandbox_image: str = "aidlc-sandbox:latest", + sandbox_memory: str = "2g", + sandbox_cpus: int = 2, + ) -> None: + self.workspace = workspace + self.project_root = self._find_project_root(workspace) + self.module = module + self.port = port if port != 0 else _find_free_port() + self.startup_timeout = startup_timeout + self._process: subprocess.Popen | None = None + self._container_id: str | None = None + self.base_url = f"http://127.0.0.1:{self.port}" + + # Sandbox settings + self.use_sandbox = use_sandbox and is_docker_available() + if use_sandbox and not self.use_sandbox: + print( + "[WARN] Docker not available — running server on host without sandbox", + file=sys.stderr, + ) + self.sandbox_image = sandbox_image + self.sandbox_memory = sandbox_memory + self.sandbox_cpus = sandbox_cpus + + @staticmethod + def _find_project_root(workspace: Path) -> Path: + """Locate the directory containing pyproject.toml. + + The executor may place the project directly in workspace/ or in a + subdirectory like workspace/sci-calc/. Walk one level deep to find it. 
+ """ + if (workspace / "pyproject.toml").exists(): + return workspace + for child in workspace.iterdir(): + if child.is_dir() and (child / "pyproject.toml").exists(): + return child + return workspace + + def _venv_python(self) -> Path | None: + """Return the project's venv Python if it exists.""" + venv = self.project_root / ".venv" + if not venv.is_dir(): + return None + if sys.platform == "win32": + py = venv / "Scripts" / "python.exe" + else: + py = venv / "bin" / "python" + return py if py.is_file() else None + + def _ensure_venv_host(self) -> Path: + """Ensure the project has its own venv (host execution path).""" + py = self._venv_python() + if py is not None: + return py + + root = str(self.project_root) + env = {**os.environ} + + if shutil.which("uv") is not None: + # nosec B603, B607 - Static uv venv command for isolated environment setup + # nosemgrep: dangerous-subprocess-use-audit + subprocess.run( + ["uv", "venv"], + cwd=root, env=env, capture_output=True, check=True, + ) + # nosec B603, B607 - Static uv pip install for dependency setup + # nosemgrep: dangerous-subprocess-use-audit + subprocess.run( + ["uv", "pip", "install", "-e", ".[dev]"], + cwd=root, env=env, capture_output=True, check=True, + ) + else: + # nosec B603, B607 - Static python venv command using sys.executable + subprocess.run( + [sys.executable, "-m", "venv", ".venv"], + cwd=root, env=env, capture_output=True, check=True, + ) + + py = self._venv_python() + if py is None: + raise RuntimeError(f"Failed to create venv in {self.project_root}") + return py + + def _ensure_venv_sandbox(self) -> None: + """Set up the venv inside a Docker container.""" + # Remove any host-created .venv before sandbox setup. + # The host venv contains symlinks to the host Python interpreter + # which are broken inside the container. 
+ stale_venv = self.project_root / ".venv" + if stale_venv.is_dir(): + shutil.rmtree(stale_venv) + + setup_cmd = "uv sync --all-extras" + result = sandbox_run( + setup_cmd, + workspace=self.project_root, + image=self.sandbox_image, + timeout=120, + network=True, + memory=self.sandbox_memory, + cpus=self.sandbox_cpus, + ) + if result.exit_code != 0: + raise RuntimeError( + f"Sandbox venv setup failed (exit {result.exit_code}):\n" + f"{(result.stdout + result.stderr)[:2000]}" + ) + + def start(self) -> None: + """Start the server and wait for it to accept connections.""" + if self.use_sandbox: + self._start_sandbox() + else: + self._start_host() + self._wait_for_ready() + + def _start_host(self) -> None: + """Start the server as a host subprocess.""" + venv_python = self._ensure_venv_host() + + cmd = [ + str(venv_python), "-m", "uvicorn", + self.module, + "--host", "127.0.0.1", + "--port", str(self.port), + "--no-access-log", + ] + + env = {**os.environ, "VIRTUAL_ENV": str(venv_python.parent.parent)} + + # nosec B603 - cmd built from validated venv python and uvicorn parameters (localhost-only) + # nosemgrep: dangerous-subprocess-use-audit + self._process = subprocess.Popen( + cmd, + cwd=str(self.project_root), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) + + def _start_sandbox(self) -> None: + """Start the server inside a Docker container (detached).""" + self._ensure_venv_sandbox() + + server_cmd = ( + f".venv/bin/python -m uvicorn {self.module} " + f"--host 0.0.0.0 --port 8000 --no-access-log" + ) + + self._container_id = sandbox_run_detached( + server_cmd, + workspace=self.project_root, + image=self.sandbox_image, + network=True, + ports={self.port: 8000}, + memory=self.sandbox_memory, + cpus=self.sandbox_cpus, + ) + + def _wait_for_ready(self) -> None: + """Poll the health endpoint until the server responds or timeout.""" + deadline = time.monotonic() + self.startup_timeout + last_error: Exception | None = None + + while time.monotonic() < 
deadline: + # Check if the process/container has died + if self.use_sandbox: + if self._container_id and not sandbox_is_running(self._container_id): + stdout, stderr = sandbox_logs(self._container_id) + raise RuntimeError( + f"Server container exited early:\n{stderr[:2000]}" + ) + else: + if self._process and self._process.poll() is not None: + stderr = self._process.stderr.read().decode("utf-8", errors="replace") if self._process.stderr else "" + raise RuntimeError( + f"Server process exited early (code {self._process.returncode}):\n{stderr[:2000]}" + ) + try: + resp = httpx.get(f"{self.base_url}/health", timeout=2.0) + if resp.status_code == 200: + return + except (httpx.ConnectError, httpx.ReadError, httpx.RemoteProtocolError, httpx.TimeoutException) as e: + last_error = e + # nosemgrep: arbitrary-sleep - Intentional delay for server startup polling + time.sleep(0.5) + + self.stop() + raise TimeoutError( + f"Server did not become ready within {self.startup_timeout}s " + f"(last error: {last_error})" + ) + + def stop(self) -> None: + """Terminate the server process or container.""" + if self.use_sandbox and self._container_id: + sandbox_stop(self._container_id) + self._container_id = None + elif self._process is not None: + try: + if sys.platform == "win32": + self._process.terminate() + else: + self._process.send_signal(signal.SIGTERM) + self._process.wait(timeout=5) + except (subprocess.TimeoutExpired, OSError): + self._process.kill() + self._process.wait(timeout=5) + finally: + self._process = None + + @property + def is_running(self) -> bool: + """Check whether the server is still alive.""" + if self.use_sandbox: + return self._container_id is not None and sandbox_is_running(self._container_id) + return self._process is not None and self._process.poll() is None + + @property + def returncode(self) -> int | None: + """Return the exit code of the server process (host mode only).""" + if self._process is not None: + return self._process.poll() + return None + + 
def __enter__(self) -> ServerProcess: + self.start() + return self + + def __exit__(self, *args) -> None: + self.stop() diff --git a/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/spec.py b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/spec.py new file mode 100644 index 00000000..609cb6ac --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/src/contracttest/spec.py @@ -0,0 +1,97 @@ +"""Load an OpenAPI 3.x specification and derive executable test cases. + +The spec is a first-class project input (alongside vision.md and tech-env.md). +Each path/operation may contain an ``x-test-cases`` extension that carries +explicit inputs and expected outputs. The loader walks every path + method, +collects those extensions, and returns a flat list of ``TestCase`` objects +ready for the runner. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import yaml + + +HTTP_METHODS = ("get", "post", "put", "patch", "delete", "head", "options") + + +@dataclass +class AppConfig: + """How to start the generated server.""" + module: str + framework: str = "fastapi" + startup_timeout: int = 15 + port: int = 0 + + +@dataclass +class TestCase: + """A single request → expected response assertion.""" + name: str + method: str + path: str + expected_status: int + body: dict[str, Any] | None = None + expected_body: dict[str, Any] | None = None + operation_id: str | None = None + skip: bool = False + + +@dataclass +class ContractSpec: + """Parsed OpenAPI spec ready for the contract test runner.""" + app: AppConfig + test_cases: list[TestCase] = field(default_factory=list) + title: str = "" + version: str = "" + + +def load_spec(path: Path) -> ContractSpec: + """Load an OpenAPI YAML spec and return a ContractSpec. + + Reads the ``x-app`` top-level extension for server configuration and + walks every ``paths`` entry to collect ``x-test-cases`` extensions. 
+ """ + with open(path, encoding="utf-8") as f: + doc = yaml.safe_load(f) or {} + + # ── app config (x-app extension or sensible defaults) ────────── + x_app = doc.get("x-app", {}) + app = AppConfig( + module=x_app.get("module", ""), + framework=x_app.get("framework", "fastapi"), + startup_timeout=x_app.get("startup_timeout", 15), + port=x_app.get("port", 0), + ) + + info = doc.get("info", {}) + title = info.get("title", "") + version = info.get("version", "") + + # ── walk paths and collect test cases ─────────────────────────── + cases: list[TestCase] = [] + for path_str, path_item in (doc.get("paths") or {}).items(): + if not isinstance(path_item, dict): + continue + for method in HTTP_METHODS: + operation = path_item.get(method) + if not isinstance(operation, dict): + continue + op_id = operation.get("operationId") + for tc in operation.get("x-test-cases", []): + cases.append(TestCase( + name=tc.get("name", f"{method.upper()} {path_str}"), + method=method.upper(), + path=path_str, + expected_status=tc.get("expected_status", 200), + body=tc.get("body"), + expected_body=tc.get("expected_body"), + operation_id=op_id, + skip=bool(tc.get("skip", False)), + )) + + return ContractSpec(app=app, test_cases=cases, title=title, version=version) diff --git a/scripts/aidlc-evaluator/packages/contracttest/tests/test_runner.py b/scripts/aidlc-evaluator/packages/contracttest/tests/test_runner.py new file mode 100644 index 00000000..4038142a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/tests/test_runner.py @@ -0,0 +1,133 @@ +"""Tests for contract test runner logic. + +These tests validate the body matching and case execution without +requiring a real server. 
"""Tests for contract test runner logic.

These tests validate the body matching and case execution without
requiring a real server: HTTP responses come from MagicMock clients,
so no network or subprocess is involved.
"""

from unittest.mock import MagicMock

import httpx
import yaml

from contracttest.runner import ContractTestResults, _match_body, _run_case, write_results
from contracttest.spec import TestCase


class TestMatchBody:
    """_match_body returns a list of failure strings; [] means full match."""

    def test_exact_match(self):
        # Extra keys in the actual body are ignored — expected is a subset match.
        expected = {"status": "ok", "result": 42}
        actual = {"status": "ok", "result": 42, "extra": "ignored"}
        assert _match_body(expected, actual) == []

    def test_missing_key(self):
        expected = {"status": "ok", "result": 42}
        actual = {"status": "ok"}
        failures = _match_body(expected, actual)
        assert len(failures) == 1
        assert "missing key 'result'" in failures[0]

    def test_wrong_value(self):
        expected = {"status": "ok"}
        actual = {"status": "error"}
        failures = _match_body(expected, actual)
        assert len(failures) == 1
        assert "'status'" in failures[0]

    def test_nested_match(self):
        # Subset matching recurses into nested objects.
        expected = {"error": {"code": "DOMAIN_ERROR"}}
        actual = {"error": {"code": "DOMAIN_ERROR", "message": "sqrt of negative"}}
        assert _match_body(expected, actual) == []

    def test_nested_mismatch(self):
        # Failure messages carry the dotted path to the offending key.
        expected = {"error": {"code": "DOMAIN_ERROR"}}
        actual = {"error": {"code": "OVERFLOW"}}
        failures = _match_body(expected, actual)
        assert len(failures) == 1
        assert "error.code" in failures[0]

    def test_float_tolerance(self):
        # Floats are compared with a tolerance, not exact equality.
        expected = {"result": 3.0}
        actual = {"result": 3.0000000001}
        assert _match_body(expected, actual) == []

    def test_float_mismatch(self):
        expected = {"result": 3.0}
        actual = {"result": 5.0}
        failures = _match_body(expected, actual)
        assert len(failures) == 1


class TestRunCase:
    """_run_case drives one TestCase against a (mocked) httpx client."""

    def test_get_success(self):
        case = TestCase(name="health", method="GET", path="/health",
                        expected_status=200, expected_body={"status": "ok"})
        mock_resp = MagicMock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"status": "ok", "version": "0.1.0"}

        client = MagicMock()
        client.get.return_value = mock_resp

        result = _run_case(client, "http://localhost:8000", case)
        assert result.passed
        assert result.actual_status == 200
        assert result.failures == []
        assert result.latency_ms is not None

    def test_wrong_status(self):
        case = TestCase(name="not found", method="GET", path="/missing",
                        expected_status=404)
        mock_resp = MagicMock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {}

        client = MagicMock()
        client.get.return_value = mock_resp

        result = _run_case(client, "http://localhost:8000", case)
        assert not result.passed
        assert any("status" in f for f in result.failures)

    def test_post_body_mismatch(self):
        case = TestCase(name="add", method="POST", path="/api/v1/arithmetic/add",
                        expected_status=200, body={"a": 1, "b": 2},
                        expected_body={"status": "ok", "result": 3})
        mock_resp = MagicMock()
        mock_resp.status_code = 200
        mock_resp.json.return_value = {"status": "ok", "result": 99}

        client = MagicMock()
        client.post.return_value = mock_resp

        result = _run_case(client, "http://localhost:8000", case)
        assert not result.passed
        assert any("result" in f for f in result.failures)

    def test_connection_error(self):
        # Transport-level errors are captured as result.error, not raised.
        case = TestCase(name="health", method="GET", path="/health",
                        expected_status=200)
        client = MagicMock()
        client.get.side_effect = httpx.ConnectError("refused")

        result = _run_case(client, "http://localhost:9999", case)
        assert not result.passed
        assert result.error is not None


class TestWriteResults:
    """write_results serializes a ContractTestResults to YAML on disk."""

    def test_roundtrip(self, tmp_path):
        results = ContractTestResults(
            total=3, passed=2, failed=1, errors=0,
            server_started=True,
        )
        out = tmp_path / "results.yaml"
        write_results(results, out)

        with open(out) as f:
            data = yaml.safe_load(f)
        assert data["total"] == 3
        assert data["passed"] == 2
        assert data["failed"] == 1
        assert data["server_started"] is True
b/scripts/aidlc-evaluator/packages/contracttest/tests/test_spec.py new file mode 100644 index 00000000..2a8da571 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/contracttest/tests/test_spec.py @@ -0,0 +1,168 @@ +"""Tests for OpenAPI-based contract spec loading.""" + +from pathlib import Path +from contracttest.spec import load_spec + + +def test_load_openapi_spec(tmp_path): + spec_file = tmp_path / "openapi.yaml" + spec_file.write_text(""" +openapi: "3.1.0" +info: + title: Test API + version: "1.0.0" + +x-app: + module: "myapp.app:app" + framework: "fastapi" + startup_timeout: 10 + port: 8080 + +paths: + /health: + get: + operationId: health_check + x-test-cases: + - name: "health" + expected_status: 200 + expected_body: + status: "ok" + + /api/data: + post: + operationId: create_data + x-test-cases: + - name: "create item" + body: {"key": "value"} + expected_status: 201 + expected_body: + id: 1 + - name: "missing body - 422" + body: {} + expected_status: 422 +""") + spec = load_spec(spec_file) + assert spec.app.module == "myapp.app:app" + assert spec.app.framework == "fastapi" + assert spec.app.startup_timeout == 10 + assert spec.app.port == 8080 + assert spec.title == "Test API" + assert spec.version == "1.0.0" + assert len(spec.test_cases) == 3 + + c0 = spec.test_cases[0] + assert c0.name == "health" + assert c0.method == "GET" + assert c0.path == "/health" + assert c0.expected_status == 200 + assert c0.expected_body == {"status": "ok"} + assert c0.body is None + assert c0.operation_id == "health_check" + + c1 = spec.test_cases[1] + assert c1.method == "POST" + assert c1.body == {"key": "value"} + assert c1.operation_id == "create_data" + + c2 = spec.test_cases[2] + assert c2.expected_status == 422 + + +def test_load_spec_defaults(tmp_path): + """Minimal spec with no x-app — should use defaults.""" + spec_file = tmp_path / "openapi.yaml" + spec_file.write_text(""" +openapi: "3.1.0" +info: + title: Minimal + version: "0.0.1" + +x-app: + module: "app:app" + 
def test_load_real_openapi_spec():
    """Validate that the actual sci-calc OpenAPI spec loads correctly.

    The fixture lives outside this package (test_cases/sci-calc). When it is
    absent — e.g. in a partial checkout — skip visibly via pytest.skip instead
    of silently returning, so CI reports the missing fixture rather than
    counting a vacuous pass. (Previously a bare ``return`` masked the gap.)
    """
    import pytest  # local import keeps the module importable without pytest

    spec_path = Path(__file__).resolve().parents[3] / "test_cases" / "sci-calc" / "openapi.yaml"
    if not spec_path.exists():
        pytest.skip(f"fixture not found: {spec_path}")

    spec = load_spec(spec_path)
    assert spec.title == "Scientific Calculator API"
    assert spec.version == "0.1.0"
    assert spec.app.module == "sci_calc.app:app"
    assert len(spec.test_cases) >= 60

    # Spot-check that one operation from each endpoint group was collected.
    ops = {tc.operation_id for tc in spec.test_cases if tc.operation_id}
    for expected_op in (
        "health",
        "arithmetic_add",
        "powers_sqrt",
        "trig_sin",
        "log_ln",
        "stats_mean",
        "constants_pi",
        "convert_temperature",
    ):
        assert expected_op in ops
+ +### Workflow Stages + +**Inception Phase** — what to build and why: + +| Stage | Condition | +| --------------------- | ---------------------- | +| Workspace Detection | Always | +| Reverse Engineering | Brownfield only | +| Requirements Analysis | Always | +| User Stories | If complexity warrants | +| Workflow Planning | Always | +| Application Design | Conditional | +| Units Generation | Conditional | + +**Construction Phase** — how to build it (runs per unit of work): + +| Stage | Condition | +| --------------------- | ----------- | +| Functional Design | Conditional | +| NFR Requirements | Conditional | +| NFR Design | Conditional | +| Infrastructure Design | Conditional | +| Code Generation | Always | +| Build and Test | Always | + +## Prerequisites + +- Python 3.13+ +- [uv](https://github.com/astral-sh/uv) +- Git (for cloning AIDLC rules; not needed if using `--rules-path`) +- AWS CLI configured with a profile that has Amazon Bedrock access + +## Installation + +From the repository root: + +```bash +cd aidlc-runner +uv sync +``` + +## Usage + +```bash +uv run aidlc-runner --vision <path-to-vision-file> [--tech-env <path-to-tech-env-file>] [options] +``` + +The only required argument is `--vision`, which points to a markdown file describing what to build. Optionally, `--tech-env` provides a technical environment document that defines how to build it (languages, frameworks, security controls, testing standards). See the [input document guide](GUIDE_TO_WRITING_VISION_DOCS.md) for details on writing these documents. 
+ +### Examples + +Minimal — uses all defaults: + +```bash +uv run aidlc-runner --vision ./my-project-vision.md +``` + +With a technical environment document: + +```bash +uv run aidlc-runner --vision ./my-project-vision.md \ + --tech-env ./my-project-tech-env.md +``` + +Custom AWS profile and region: + +```bash +uv run aidlc-runner --vision ./my-project-vision.md \ + --aws-profile my-profile \ + --aws-region us-east-1 +``` + +Use a local copy of the AIDLC rules instead of cloning from GitHub: + +```bash +uv run aidlc-runner --vision ./my-project-vision.md \ + --rules-path /opt/aidlc-workflows +``` + +Custom output directory and config file: + +```bash +uv run aidlc-runner --vision ./my-project-vision.md \ + --config ./my-config.yaml \ + --output-dir ./my-runs +``` + +Override model IDs: + +```bash +uv run aidlc-runner --vision ./my-project-vision.md \ + --executor-model us.anthropic.claude-opus-4-20250514-v1:0 \ + --simulator-model us.anthropic.claude-opus-4-20250514-v1:0 +``` + +### CLI Reference + +| Flag | Required | Default | Description | +| ------------------------ | -------- | ----------------- | ----------------------------------------------- | +| `--vision PATH` | Yes | — | Path to the vision/constraints markdown file | +| `--tech-env PATH` | No | — | Path to the technical environment markdown file | +| `--config PATH` | No | Built-in default | Path to a YAML configuration file | +| `--aws-profile TEXT` | No | `default` | AWS profile name | +| `--aws-region TEXT` | No | `us-west-2` | AWS region for Bedrock | +| `--executor-model TEXT` | No | Claude Opus 4 | Model ID for the executor agent | +| `--simulator-model TEXT` | No | Claude Sonnet 4.5 | Model ID for the simulator agent | +| `--output-dir PATH` | No | `../runs` | Directory where run folders are created | +| `--rules-path PATH` | No | Cloned from Git | Path to a local AIDLC rules directory | +| `--no-exec` | No | Enabled | Disable in-workflow command execution | +| `--no-post-tests` | No | Enabled | 
Disable post-run test execution | + +## Configuration + +Settings are resolved in order of precedence: **CLI flags > YAML config > built-in defaults**. + +### YAML Config File + +Create a YAML file and pass it via `--config`. Any value not specified falls back to the built-in default. + +```yaml +aws: + profile: "my-profile" + region: "us-east-1" + +models: + executor: + provider: "bedrock" + model_id: "us.anthropic.claude-opus-4-20250514-v1:0" + simulator: + provider: "bedrock" + model_id: "us.anthropic.claude-opus-4-20250514-v1:0" + +aidlc: + rules_source: "git" # "git" or "local" + rules_repo: "https://github.com/awslabs/aidlc-workflows.git" + rules_local_path: null # set when rules_source is "local" + +swarm: + max_handoffs: 200 + max_iterations: 200 + execution_timeout: 14400 # 4 hours, in seconds + node_timeout: 3600 # 1 hour, in seconds + +runs: + output_dir: "../runs" +``` + +### Built-in Defaults + +The built-in defaults match the file above. The default config ships at `aidlc-runner/config/default.yaml`. + +## Run Output + +Each invocation creates a timestamped run folder under the output directory: + +```text +runs/ +└── 20260212T143022-a1b2c3d4e5f6.../ + ├── run-meta.yaml # Metadata: timestamps, config snapshot, status + ├── run-metrics.yaml # NFR metrics: tokens, timing, artifacts, errors + ├── test-results.yaml # Test pass/fail results (if post-run tests enabled) + ├── vision.md # Copy of the input vision file + ├── tech-env.md # Copy of the input tech-env file (if provided) + ├── aidlc-rules/ # AIDLC workflow rules (cloned or copied) + │ ├── aws-aidlc-rules/ + │ └── aws-aidlc-rule-details/ + ├── aidlc-docs/ # Documentation artifacts from the workflow + │ ├── inception/ # Requirements, user stories, designs, etc. 
+ │ ├── construction/ # Functional design, code plans, reviews + │ ├── aidlc-state.md # Current workflow state tracker + │ └── audit.md # Timestamped audit log of all stages + └── workspace/ # Generated application code + ├── src/ + ├── tests/ + ├── pyproject.toml + └── ... +``` + +`run-meta.yaml` records the full execution context — start/end times, status, total handoffs, node history, and the config snapshot used. `run-metrics.yaml` captures NFR metrics — token usage (total and per-agent), handoff timing and patterns, generated artifact counts and lines of code, and error/retry events. + +## Development + +### Running Tests + +```bash +cd aidlc-runner +uv run pytest +``` + +### Linting + +```bash +uv run ruff check . && uv run ruff format . +``` + +### Project Structure + +```text +aidlc-runner/ +├── config/ +│ └── default.yaml # Default configuration +├── src/aidlc_runner/ +│ ├── __init__.py # Package version (0.1.0) +│ ├── __main__.py # python -m aidlc_runner entry point +│ ├── cli.py # Argument parsing and main() +│ ├── config.py # Configuration dataclasses and loading +│ ├── runner.py # Run folder creation, rules setup, swarm orchestration +│ ├── metrics.py # Metrics collection, artifact scanning, YAML output +│ ├── progress.py # Callback handlers and swarm hooks for progress reporting +│ ├── post_run.py # Post-run test evaluation +│ ├── agents/ +│ │ ├── executor.py # Executor agent factory +│ │ └── simulator.py # Simulator agent factory +│ └── tools/ +│ ├── file_ops.py # Sandboxed read/write/list file tools +│ ├── rule_loader.py # AIDLC rule file loader with path resolution +│ └── run_command.py # Sandboxed shell command execution tool +├── tests/ +│ ├── test_config.py # Configuration unit tests +│ ├── test_metrics.py # Metrics collection and artifact scanning tests +│ ├── test_post_run.py # Post-run evaluation tests +│ ├── test_run_command.py # Command execution and sandboxing tests +│ └── test_two_inputs.py # Two-input-document (vision + tech-env) tests 
+└── pyproject.toml +``` + +### Key Modules + +- **cli.py** — Parses CLI arguments (including `--vision` and optional `--tech-env`), loads config, and calls `runner.run()`. +- **config.py** — Defines `RunnerConfig` and nested dataclasses (`AwsConfig`, `ModelConfig`, `SwarmConfig`, etc.). Merges defaults, YAML, and CLI overrides. +- **runner.py** — Creates the run folder, copies the vision and optional tech-env files, sets up rules, builds both agents, creates a `Swarm`, executes it, and writes metrics. +- **metrics.py** — `MetricsCollector` accumulates handoff timings and error events during execution, then assembles token usage, artifact counts, and handoff patterns post-run into `run-metrics.yaml`. +- **progress.py** — `AgentProgressHandler` prints tool invocations and detects error events per agent. `SwarmProgressHook` tracks node-level handoff timing via Strands hook events. +- **post_run.py** — Post-run test evaluation: detects the project type in `workspace/`, installs dependencies, runs tests, and writes `test-results.yaml`. +- **agents/executor.py** — Builds the executor `Agent` with file-ops tools, a rule-loader tool, and an optional `run_command` tool. The system prompt encodes the complete AIDLC stage sequence and handoff protocol. +- **agents/simulator.py** — Builds the simulator `Agent` with file-ops tools. The system prompt is dynamically generated to embed the vision document content and, when provided, the technical environment document. +- **tools/file_ops.py** — `make_file_tools(run_folder)` returns sandboxed `read_file`, `write_file`, and `list_files` functions scoped to the run folder with path-traversal prevention. +- **tools/rule_loader.py** — `make_rule_loader(rules_dir)` returns a `load_rule` function that resolves shorthand paths (e.g., `"inception/requirements-analysis"`) to full rule file paths. 
+- **tools/run_command.py** — `make_run_command(run_folder)` returns a sandboxed `run_command` function for executing shell commands within the run folder during Build and Test. diff --git a/scripts/aidlc-evaluator/packages/execution/pyproject.toml b/scripts/aidlc-evaluator/packages/execution/pyproject.toml new file mode 100644 index 00000000..b383dedb --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/pyproject.toml @@ -0,0 +1,20 @@ +[project] +name = "aidlc-runner" +version = "0.1.0" +description = "Two-agent AIDLC workflow runner using Strands Agents" +requires-python = ">=3.13" +dependencies = [ + "strands-agents>=0.1.0", + "strands-agents-tools>=0.1.0", + "pyyaml>=6.0", +] + +[project.scripts] +aidlc-runner = "aidlc_runner.cli:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/aidlc_runner"] diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/__init__.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/__init__.py new file mode 100644 index 00000000..543fb226 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/__init__.py @@ -0,0 +1,3 @@ +"""AIDLC Runner - Two-agent AIDLC workflow runner using Strands Agents.""" + +__version__ = "0.1.0" diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/__main__.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/__main__.py new file mode 100644 index 00000000..2afc5993 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/__main__.py @@ -0,0 +1,5 @@ +"""Entry point for `python -m aidlc_runner`.""" + +from aidlc_runner.cli import main + +main() diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/__init__.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py new file mode 100644 index 00000000..32c29f05 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/executor.py @@ -0,0 +1,232 @@ +"""AIDLC Executor agent — drives the AIDLC workflow.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Callable + +import boto3 +from botocore.config import Config as BotoConfig +from strands import Agent +from strands.models.bedrock import BedrockModel + +from aidlc_runner.config import ExecutionConfig, ModelConfig +from aidlc_runner.tools.file_ops import make_file_tools +from aidlc_runner.tools.rule_loader import make_rule_loader +from aidlc_runner.tools.run_command import make_run_command + +EXECUTOR_SYSTEM_PROMPT = """\ +You are the AIDLC Executor agent. Your job is to drive the COMPLETE AI-DLC (AI-Driven \ +Development Life Cycle) workflow for a software project from start to finish, including \ +generating all application code. + +## CRITICAL RULE: YOU MUST COMPLETE THE ENTIRE WORKFLOW + +You must execute ALL phases and ALL stages of the AIDLC workflow. You are NOT done until \ +the Construction phase is complete and working code has been generated in workspace/. \ +After every interaction with the simulator agent, you MUST continue to the next stage. \ +NEVER stop in the middle of the workflow. + +## Complete stage sequence + +Execute these stages in order. Load each rule file BEFORE executing its stage. + +### INCEPTION PHASE — "What to build and why" + +1. **Workspace Detection** (ALWAYS) — load_rule('inception/workspace-detection.md') + - Scan workspace/ directory, classify as greenfield or brownfield + - No human input needed — proceed immediately to next stage + +2. 
**Reverse Engineering** (CONDITIONAL: brownfield only) \ +— load_rule('inception/reverse-engineering.md') + - Skip for greenfield projects + +3. **Requirements Analysis** (ALWAYS) — load_rule('inception/requirements-analysis.md') + - Read the vision file, analyze requirements + - Create clarifying questions → handoff to simulator for answers + - After receiving answers, generate requirements.md + - Handoff to simulator for approval + - After approval, CONTINUE to next stage + +4. **User Stories** (CONDITIONAL) — load_rule('inception/user-stories.md') + - Generate user stories and personas if project complexity warrants it + - Handoff to simulator for approval + +5. **Workflow Planning** (ALWAYS) — load_rule('inception/workflow-planning.md') + - Create execution plan deciding which Construction stages to run + - Handoff to simulator for approval + +6. **Application Design** (CONDITIONAL) — load_rule('inception/application-design.md') + - Design components, services, and dependencies + - Handoff to simulator for approval + +7. **Units Generation** (CONDITIONAL) — load_rule('inception/units-generation.md') + - Break system into units of work + +### CONSTRUCTION PHASE — "How to build it" + +For each unit of work (or the whole project if no units were defined): + +8. **Functional Design** (CONDITIONAL) — load_rule('construction/functional-design.md') + - Design business logic, domain models, entity definitions + +9. **NFR Requirements** (CONDITIONAL) — load_rule('construction/nfr-requirements.md') + - Establish non-functional requirements and technology decisions + +10. **NFR Design** (CONDITIONAL) — load_rule('construction/nfr-design.md') + - Integrate NFR requirements into architecture + +11. **Infrastructure Design** (CONDITIONAL) — load_rule('construction/infrastructure-design.md') + - Map logical components to deployment infrastructure + +12. 
**Code Generation** (ALWAYS) — load_rule('construction/code-generation.md') + - Part 1: Create detailed code generation plan with exact file paths + - Handoff to simulator for plan approval + - Part 2: Generate ALL application code in workspace/ + - Write every source file, test file, and configuration file + - Handoff to simulator for code review + +13. **Build and Test** (ALWAYS) — load_rule('construction/build-and-test.md') + - Document build instructions and test procedures + - Use run_command to install dependencies, build the project, and run tests + - If tests fail, read the error output, fix the code, and re-run until tests pass + - Generate build-and-test summary including test results + +## File organization + +- Input documents (vision.md, tech-env.md if provided): run folder root +- All documentation and workflow artifacts: aidlc-docs/ +- All generated application code: workspace/ +- NEVER mix documentation and code locations. +- Code goes in workspace/ with proper package structure (src/, tests/, pyproject.toml, etc.) + +## Working with the Human Simulator + +When you need human input (clarifying questions, approvals, or reviews): + +1. Write the question or document file to the appropriate location in aidlc-docs/ +2. Handoff to the "simulator" agent with a message that includes: + - What type of input you need (answer questions / approve document / review) + - The path to the file they need to read and respond to + - What stage you are currently executing +3. AFTER receiving a response, ALWAYS continue to the next stage. NEVER stop. + +## Question format + +When creating question files, follow the AIDLC question format: +- Use multiple-choice format with options A through D +- Option E should always be "Other" +- The human responds with [Answer]: tags + +## Command execution + +You have a run_command tool for executing shell commands in the workspace. +Use it during Build and Test to: +1. Install dependencies (e.g. 
`uv pip install -e ".[dev]"`, `npm install`) +2. Run the test suite (e.g. `uv run pytest`, `npm test`) +3. Run linters or type checkers if configured +4. Fix any failures and re-run + +The command runs in workspace/ by default. Each command has a timeout — keep \ +individual commands focused. If a command fails, read the output and fix the issue. + +## Important rules + +- NEVER end your turn without either handing off to the simulator OR completing the \ +entire workflow through Build and Test. +- Load the relevant rule file BEFORE starting each stage. +- Load common rules as needed (e.g. load_rule('common/content-validation.md') before \ +writing files, load_rule('common/question-format-guide.md') before creating questions). +- Update aidlc-docs/aidlc-state.md after completing each stage. +- Append to aidlc-docs/audit.md with ISO 8601 timestamps for each action. +- Never assume answers — always ask via handoff to the simulator. +- For CONDITIONAL stages, evaluate based on project scope and skip with justification if \ +not needed, but always continue to the next stage. +- When generating code, write COMPLETE, WORKING files — not stubs or placeholders. +""" + +# Variant of the system prompt when run_command is disabled. +_EXECUTOR_PROMPT_NO_EXEC = EXECUTOR_SYSTEM_PROMPT.replace( + " - Use run_command to install dependencies, build the project, and run tests\n" + " - If tests fail, read the error output, fix the code, and re-run until tests pass\n" + " - Generate build-and-test summary including test results", + " - Generate build-and-test summary", +).replace( + """## Command execution + +You have a run_command tool for executing shell commands in the workspace. +Use it during Build and Test to: +1. Install dependencies (e.g. `uv pip install -e ".[dev]"`, `npm install`) +2. Run the test suite (e.g. `uv run pytest`, `npm test`) +3. Run linters or type checkers if configured +4. Fix any failures and re-run + +The command runs in workspace/ by default. 
Each command has a timeout — keep \ +individual commands focused. If a command fails, read the output and fix the issue. + +## Important rules""", + "## Important rules", +) + + +def create_executor( + run_folder: Path, + rules_dir: Path, + model_config: ModelConfig, + aws_profile: str | None = None, + aws_region: str | None = None, + callback_handler: Callable[..., Any] | None = None, + execution_config: ExecutionConfig | None = None, +) -> Agent: + """Create the AIDLC Executor agent. + + Args: + run_folder: Path to the run folder for this execution. + rules_dir: Path to the AIDLC rules directory. + model_config: Model configuration for this agent. + aws_profile: AWS profile name for Bedrock. + aws_region: AWS region for Bedrock. + callback_handler: Optional callback handler for progress reporting. + execution_config: Optional execution config controlling run_command availability. + + Returns: + Configured Strands Agent instance. + """ + if execution_config is None: + execution_config = ExecutionConfig() + + file_tools = make_file_tools(run_folder) + rule_loader = make_rule_loader(rules_dir) + + tools = [*file_tools, rule_loader] + if execution_config.enabled: + run_cmd = make_run_command(run_folder, timeout=execution_config.command_timeout) + tools.append(run_cmd) + system_prompt = EXECUTOR_SYSTEM_PROMPT + else: + system_prompt = _EXECUTOR_PROMPT_NO_EXEC + + session_kwargs: dict = {} + if aws_profile: + session_kwargs["profile_name"] = aws_profile + if aws_region: + session_kwargs["region_name"] = aws_region + boto_session = boto3.Session(**session_kwargs) + boto_client_config = BotoConfig( + read_timeout=900, + connect_timeout=30, + retries={"max_attempts": 10, "mode": "adaptive"}, + ) + model = BedrockModel( + model_id=model_config.model_id, + boto_session=boto_session, + boto_client_config=boto_client_config, + ) + + return Agent( + name="executor", + system_prompt=system_prompt, + model=model, + tools=tools, + callback_handler=callback_handler, + ) diff --git 
a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py new file mode 100644 index 00000000..8355400c --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/agents/simulator.py @@ -0,0 +1,149 @@ +"""Human Simulator agent — acts as a knowledgeable human stakeholder.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Callable + +import boto3 +from botocore.config import Config as BotoConfig +from strands import Agent +from strands.models.bedrock import BedrockModel + +from aidlc_runner.config import ModelConfig +from aidlc_runner.tools.file_ops import make_file_tools + +SIMULATOR_SYSTEM_PROMPT_TEMPLATE = """\ +You are the Human Simulator agent. You are simulating a knowledgeable human project \ +stakeholder who is working with an AI-DLC workflow executor. + +## CRITICAL RULE: ALWAYS HAND BACK TO THE EXECUTOR + +After completing your response (answering questions, approving documents, or providing \ +reviews), you MUST ALWAYS handoff back to the "executor" agent. NEVER end your turn \ +without handing off. The executor needs to continue driving the workflow through all \ +remaining stages. + +## Your role + +You represent the human decision-maker in the AIDLC workflow. You provide: +- The project vision and requirements +- Answers to clarifying questions +- Approvals or change requests for documents and designs +- Technical constraints and preferences + +## The project vision + +The following is the vision and constraints document that defines what you want built. \ +Use this as your primary source of truth when answering questions and making decisions: + +--- +{vision_content} +--- +{tech_env_section} +## How you work + +1. When you receive a handoff from the "executor" agent, read the file path mentioned \ +in the handoff message. + +2. 
Based on the file type: + - **Question files**: Read the questions, then answer each one based on the vision \ +document above and your best technical judgment. Write answers using the AIDLC format \ +with [Answer]: tags followed by the letter choice (A, B, C, D, or E). If choosing E (Other), \ +provide your custom answer. + - **Approval requests**: Review the document against the vision. If it aligns, write \ +an approval. Bias toward approving documents that are directionally correct — do not \ +block progress on minor issues. If there are critical misalignments with the vision, \ +describe what needs to change. + - **Review requests**: Read the document, provide brief feedback, and approve. Only \ +request revisions for significant issues that contradict the vision. + - **Code review**: Review generated code for correctness against the vision spec. \ +Approve if it implements the required functionality. Do not block on style issues. + +3. Write your response to the same file (appending) or to a response file as directed \ +by the question format. + +4. IMMEDIATELY handoff back to the "executor" agent with a summary of what you did \ +and tell it to continue to the next stage. + +## Decision-making principles + +- Stay consistent with the vision document above. +- When the vision doesn't specify a detail, use practical, mainstream technical choices. +- Prefer simplicity over complexity. +- Prefer well-established patterns over novel approaches. +- When genuinely uncertain, choose the option that keeps the most flexibility. +- Always provide a clear, decisive answer — do not punt back without a decision. +- Bias toward approval to keep the workflow moving. Only reject for critical issues. +- Keep your responses concise — the executor needs to continue working. 
+""" + + +def create_simulator( + run_folder: Path, + vision_content: str, + model_config: ModelConfig, + aws_profile: str | None = None, + aws_region: str | None = None, + callback_handler: Callable[..., Any] | None = None, + tech_env_content: str | None = None, +) -> Agent: + """Create the Human Simulator agent. + + Args: + run_folder: Path to the run folder for this execution. + vision_content: The full text content of the vision/constraints file. + model_config: Model configuration for this agent. + aws_profile: AWS profile name for Bedrock. + aws_region: AWS region for Bedrock. + callback_handler: Optional callback handler for progress reporting. + tech_env_content: Optional full text of the technical environment file. + + Returns: + Configured Strands Agent instance. + """ + file_tools = make_file_tools(run_folder) + + if tech_env_content: + tech_env_section = ( + "\n## The technical environment\n\n" + "The following is the technical environment document that defines HOW the project " + "must be built — languages, frameworks, cloud services, security controls, testing " + "standards, and prohibited technologies. 
Use this as a binding reference when " + "answering technical questions and reviewing designs and code:\n\n" + "---\n" + f"{tech_env_content}\n" + "---\n" + ) + else: + tech_env_section = "" + + system_prompt = SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + vision_content=vision_content, + tech_env_section=tech_env_section, + ) + + session_kwargs: dict = {} + if aws_profile: + session_kwargs["profile_name"] = aws_profile + if aws_region: + session_kwargs["region_name"] = aws_region + boto_session = boto3.Session(**session_kwargs) + boto_client_config = BotoConfig( + read_timeout=900, + connect_timeout=30, + retries={"max_attempts": 10, "mode": "adaptive"}, + ) + model = BedrockModel( + model_id=model_config.model_id, + boto_session=boto_session, + boto_client_config=boto_client_config, + ) + + return Agent( + name="simulator", + system_prompt=system_prompt, + model=model, + tools=file_tools, + callback_handler=callback_handler, + ) diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py new file mode 100644 index 00000000..de49dc75 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/cli.py @@ -0,0 +1,146 @@ +"""Command-line interface for AIDLC Runner.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from aidlc_runner.config import default_config_path, load_config +from aidlc_runner.runner import run + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="aidlc-runner", + description="Run the AIDLC workflow with two Strands agents (executor + human simulator).", + ) + parser.add_argument( + "--vision", + required=True, + type=Path, + help="Path to the vision/constraints markdown file.", + ) + parser.add_argument( + "--tech-env", + type=Path, + default=None, + help="Path to the technical environment markdown file (optional).", + ) + parser.add_argument( + 
"--config", + type=Path, + default=None, + help="Path to YAML config file. Defaults to bundled config/default.yaml.", + ) + parser.add_argument( + "--aws-profile", + default=None, + help="Override AWS profile name.", + ) + parser.add_argument( + "--aws-region", + default=None, + help="Override AWS region.", + ) + parser.add_argument( + "--executor-model", + default=None, + help="Override model ID for the AIDLC executor agent.", + ) + parser.add_argument( + "--simulator-model", + default=None, + help="Override model ID for the human simulator agent.", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=None, + help="Override run output directory.", + ) + parser.add_argument( + "--rules-path", + type=Path, + default=None, + help="Path to local AIDLC rules directory (overrides git clone).", + ) + parser.add_argument( + "--rules-ref", + default=None, + help="Git ref (branch/tag/commit) for AIDLC rules repo.", + ) + parser.add_argument( + "--no-exec", + action="store_true", + default=False, + help="Disable in-workflow command execution (run_command tool not available).", + ) + parser.add_argument( + "--no-post-tests", + action="store_true", + default=False, + help="Disable post-run test execution.", + ) + return parser + + +def _build_cli_overrides(args: argparse.Namespace) -> dict: + """Convert parsed CLI args into a nested dict for config merging.""" + overrides: dict = {} + + if args.aws_profile is not None: + overrides.setdefault("aws", {})["profile"] = args.aws_profile + if args.aws_region is not None: + overrides.setdefault("aws", {})["region"] = args.aws_region + + if args.executor_model is not None: + overrides.setdefault("models", {}).setdefault("executor", {})[ + "model_id" + ] = args.executor_model + if args.simulator_model is not None: + overrides.setdefault("models", {}).setdefault("simulator", {})[ + "model_id" + ] = args.simulator_model + + if args.output_dir is not None: + overrides.setdefault("runs", {})["output_dir"] = 
str(args.output_dir) + + if args.rules_path is not None: + overrides.setdefault("aidlc", {})["rules_source"] = "local" + overrides["aidlc"]["rules_local_path"] = str(args.rules_path) + + if args.rules_ref is not None: + overrides.setdefault("aidlc", {})["rules_ref"] = args.rules_ref + + if args.no_exec: + overrides.setdefault("execution", {})["enabled"] = False + if args.no_post_tests: + overrides.setdefault("execution", {})["post_run_tests"] = False + + return overrides + + +def main(argv: list[str] | None = None) -> None: + parser = build_parser() + args = parser.parse_args(argv) + + # Validate vision file exists + if not args.vision.exists(): + print(f"Error: Vision file not found: {args.vision}", file=sys.stderr) + sys.exit(1) + + # Validate tech-env file exists if provided + if args.tech_env is not None and not args.tech_env.exists(): + print(f"Error: Technical environment file not found: {args.tech_env}", file=sys.stderr) + sys.exit(1) + + # Resolve config path + config_path = args.config if args.config else default_config_path() + + # Load config with CLI overrides + cli_overrides = _build_cli_overrides(args) + config = load_config(config_path=config_path, cli_overrides=cli_overrides) + + # Run the workflow + run(config=config, vision_path=args.vision, tech_env_path=args.tech_env) diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/config.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/config.py new file mode 100644 index 00000000..dff27a4e --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/config.py @@ -0,0 +1,131 @@ +"""Configuration loading and management for AIDLC Runner.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from pathlib import Path + +import yaml + +logger = logging.getLogger(__name__) + + +@dataclass +class AwsConfig: + profile: str | None = None + region: str | None = None + + +@dataclass +class ModelConfig: + provider: 
str = "bedrock" + model_id: str = "global.anthropic.claude-opus-4-6-v1" + + +@dataclass +class ModelsConfig: + executor: ModelConfig = field(default_factory=ModelConfig) + simulator: ModelConfig = field(default_factory=ModelConfig) + + +@dataclass +class AidlcConfig: + rules_source: str = "git" + rules_repo: str = "https://github.com/awslabs/aidlc-workflows.git" + rules_local_path: str | None = None + rules_ref: str = "main" + + +@dataclass +class SwarmConfig: + max_handoffs: int = 200 + max_iterations: int = 200 + execution_timeout: float = 14400.0 + node_timeout: float = 3600.0 + + +@dataclass +class RunsConfig: + output_dir: str = "./runs" + + +@dataclass +class SandboxConfig: + enabled: bool = True + image: str = "aidlc-sandbox:latest" + memory: str = "2g" + cpus: int = 2 + + +@dataclass +class ExecutionConfig: + enabled: bool = True + command_timeout: int = 120 + post_run_tests: bool = True + post_run_timeout: int = 300 + sandbox: SandboxConfig = field(default_factory=SandboxConfig) + + +@dataclass +class RunnerConfig: + aws: AwsConfig = field(default_factory=AwsConfig) + models: ModelsConfig = field(default_factory=ModelsConfig) + aidlc: AidlcConfig = field(default_factory=AidlcConfig) + swarm: SwarmConfig = field(default_factory=SwarmConfig) + runs: RunsConfig = field(default_factory=RunsConfig) + execution: ExecutionConfig = field(default_factory=ExecutionConfig) + + +def _merge_dict_into_dataclass(dc: object, data: dict) -> None: + """Recursively merge a dict into a dataclass instance.""" + for key, value in data.items(): + if not hasattr(dc, key): + logger.warning("Unknown config key %r (ignored) — check for typos", key) + continue + current = getattr(dc, key) + if isinstance(value, dict) and hasattr(current, "__dataclass_fields__"): + _merge_dict_into_dataclass(current, value) + elif value is not None: + setattr(dc, key, value) + + +def load_config( + config_path: str | Path | None = None, + cli_overrides: dict | None = None, +) -> RunnerConfig: + 
"""Load configuration from YAML file and apply CLI overrides. + + Args: + config_path: Path to YAML config file. If None, uses built-in defaults. + cli_overrides: Dict of CLI argument overrides to apply on top. + + Returns: + Fully resolved RunnerConfig. + """ + config = RunnerConfig() + + if config_path is not None: + path = Path(config_path) + if path.exists(): + with open(path, encoding="utf-8") as f: + yaml_data = yaml.safe_load(f) or {} + _merge_dict_into_dataclass(config, yaml_data) + + if cli_overrides: + _merge_dict_into_dataclass(config, cli_overrides) + + return config + + +def default_config_path() -> Path: + """Return the path to the default.yaml config at the repo root. + + Checks for config/default.yaml relative to cwd first (the expected + layout when run from the repo root), then falls back to a path + relative to this source file for backwards compatibility. + """ + cwd_candidate = Path.cwd() / "config" / "default.yaml" + if cwd_candidate.is_file(): + return cwd_candidate + return Path(__file__).resolve().parent.parent.parent / "config" / "default.yaml" diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py new file mode 100644 index 00000000..aefd21be --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/metrics.py @@ -0,0 +1,347 @@ +"""Metrics collection and persistence for AIDLC Runner — Phase 1 instrumentation.""" + +from __future__ import annotations + +import statistics +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import yaml +from shared.io import atomic_yaml_dump +from strands.multiagent.base import MultiAgentResult + +from aidlc_runner.config import RunnerConfig + +# File extensions considered "source code" +_SOURCE_EXTENSIONS = frozenset({ + ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rs", + ".c", ".cpp", ".h", ".hpp", ".cs", ".rb", ".swift", ".kt", + 
".scala", ".sh", ".bash", ".sql", ".html", ".css", ".scss", +}) + +# File names / extensions considered "config" +_CONFIG_FILENAMES = frozenset({ + "pyproject.toml", "package.json", "package-lock.json", "tsconfig.json", + "cargo.toml", "makefile", "dockerfile", "docker-compose.yml", + "docker-compose.yaml", ".gitignore", ".eslintrc.json", ".prettierrc", + "ruff.toml", "setup.cfg", "setup.py", "requirements.txt", +}) + +_CONFIG_EXTENSIONS = frozenset({".toml", ".ini", ".cfg"}) + +# Generated lock/dependency files that inflate LOC counts +_LOCK_FILENAMES = frozenset({ + "package-lock.json", "yarn.lock", "pnpm-lock.yaml", + "poetry.lock", "uv.lock", "pipfile.lock", "cargo.lock", + "composer.lock", "gemfile.lock", "bun.lockb", +}) + + +def _is_test_file(path: Path) -> bool: + """Heuristic: file is a test if its name or parent directory suggests tests.""" + name = path.name.lower() + parts = [p.lower() for p in path.parts] + if any(p in ("tests", "test", "__tests__", "spec") for p in parts): + return True + if name.startswith("test_") or name.endswith("_test.py") or name.endswith(".test.js"): + return True + if name.endswith("_test.ts") or name.endswith(".test.ts") or name.endswith(".test.tsx"): + return True + if name.endswith("_spec.py") or name.endswith(".spec.js") or name.endswith(".spec.ts"): + return True + return False + + +def _is_config_file(path: Path) -> bool: + """Heuristic: file is a config/build file.""" + if path.name.lower() in _CONFIG_FILENAMES: + return True + if path.suffix.lower() in _CONFIG_EXTENSIONS: + return True + if path.suffix.lower() in (".yaml", ".yml"): + return True + return False + + +def _count_lines(path: Path) -> int: + """Count lines in a text file, returning 0 for binary/unreadable files.""" + try: + return len(path.read_text(encoding="utf-8", errors="strict").splitlines()) + except (UnicodeDecodeError, OSError): + return 0 + + +def _scan_artifacts(run_folder: Path) -> dict[str, Any]: + """Scan workspace/ and aidlc-docs/ to count 
generated artifacts. + + Returns a dict with workspace and aidlc_docs sections. + """ + workspace = run_folder / "workspace" + aidlc_docs = run_folder / "aidlc-docs" + + # --- workspace --- + source_files = 0 + test_files = 0 + config_files = 0 + other_files = 0 + total_loc = 0 + + if workspace.exists(): + for f in workspace.rglob("*"): + if not f.is_file(): + continue + rel = f.relative_to(workspace) + is_lock = f.name.lower() in _LOCK_FILENAMES + if _is_test_file(rel): + test_files += 1 + elif f.suffix.lower() in _SOURCE_EXTENSIONS: + source_files += 1 + elif _is_config_file(rel): + config_files += 1 + else: + other_files += 1 + if not is_lock: + total_loc += _count_lines(f) + + ws_total = source_files + test_files + config_files + other_files + + # --- aidlc-docs --- + inception_files = 0 + construction_files = 0 + other_doc_files = 0 + + if aidlc_docs.exists(): + for f in aidlc_docs.rglob("*"): + if not f.is_file(): + continue + rel = f.relative_to(aidlc_docs) + parts = rel.parts + if parts and parts[0] == "inception": + inception_files += 1 + elif parts and parts[0] == "construction": + construction_files += 1 + else: + other_doc_files += 1 + + doc_total = inception_files + construction_files + other_doc_files + + return { + "workspace": { + "source_files": source_files, + "test_files": test_files, + "config_files": config_files, + "other_files": other_files, + "total_files": ws_total, + "total_lines_of_code": total_loc, + }, + "aidlc_docs": { + "inception_files": inception_files, + "construction_files": construction_files, + "other_files": other_doc_files, + "total_files": doc_total, + }, + } + + +def _usage_to_dict(usage: dict[str, int]) -> dict[str, int]: + """Normalise a Usage TypedDict to a plain dict with snake_case keys.""" + return { + "input_tokens": usage.get("inputTokens", 0), + "output_tokens": usage.get("outputTokens", 0), + "total_tokens": usage.get("totalTokens", 0), + "cache_read_tokens": usage.get("cacheReadInputTokens", 0), + 
"cache_write_tokens": usage.get("cacheWriteInputTokens", 0), + } + + +class MetricsCollector: + """Accumulates metrics during a run and serializes them to run-metrics.yaml. + + Live data (handoff timings, error events) is recorded via callbacks during + execution. Token counts and artifact data are extracted post-run from the + Strands result and the filesystem. + """ + + def __init__(self, config: RunnerConfig) -> None: + self._config = config + self._handoffs: list[dict[str, Any]] = [] + self._errors: list[dict[str, str]] = [] + self._context_samples: list[dict[str, Any]] = [] + + # -- Live recording (called during execution) -- + + def record_handoff(self, handoff_num: int, node_id: str, duration_ms: int) -> None: + """Record a completed handoff with its duration.""" + self._handoffs.append({ + "handoff": handoff_num, + "node_id": node_id, + "duration_ms": duration_ms, + }) + + def record_error(self, error_type: str, message: str) -> None: + """Record an error/retry event observed during streaming.""" + self._errors.append({ + "timestamp": datetime.now(timezone.utc).isoformat(), + "type": error_type, + "message": message, + }) + + def record_context_sample(self, agent_name: str, input_tokens: int) -> None: + """Record the input token count from a single model invocation. + + Each sample represents the context window size at that point in the + conversation — input_tokens grows as the message history accumulates. 
+ """ + self._context_samples.append({ + "agent": agent_name, + "input_tokens": input_tokens, + }) + + # -- Post-run assembly -- + + @staticmethod + def _compute_context_stats(samples: list[int]) -> dict[str, int | float]: + """Compute min/max/avg/median over a list of input-token counts.""" + if not samples: + return { + "min_tokens": 0, + "max_tokens": 0, + "avg_tokens": 0, + "median_tokens": 0, + "sample_count": 0, + } + return { + "min_tokens": min(samples), + "max_tokens": max(samples), + "avg_tokens": int(statistics.mean(samples)), + "median_tokens": int(statistics.median(samples)), + "sample_count": len(samples), + } + + def build_metrics(self, result: MultiAgentResult, run_folder: Path) -> dict[str, Any]: + """Assemble the full metrics dict from the swarm result and run folder. + + Should be called after the swarm completes. + """ + metrics: dict[str, Any] = {} + + # --- Tokens --- + # Extract per-agent token counts (unique tokens per agent) + per_agent: dict[str, dict[str, int]] = {} + for node_id, node_result in result.results.items(): + per_agent[node_id] = _usage_to_dict(node_result.accumulated_usage) + + # Calculate sum of per-agent tokens (unique tokens across all agents) + unique_total = { + "input_tokens": sum(agent["input_tokens"] for agent in per_agent.values()), + "output_tokens": sum(agent["output_tokens"] for agent in per_agent.values()), + "total_tokens": sum(agent["total_tokens"] for agent in per_agent.values()), + "cache_read_tokens": sum(agent["cache_read_tokens"] for agent in per_agent.values()), + "cache_write_tokens": sum(agent["cache_write_tokens"] for agent in per_agent.values()), + } + + # Get raw accumulated usage from all API calls (includes repeated context) + api_total = _usage_to_dict(result.accumulated_usage) + + # Calculate repeated context (tokens re-sent across multiple turns) + repeated_context = { + "input_tokens": api_total["input_tokens"] - unique_total["input_tokens"], + "output_tokens": api_total["output_tokens"] - 
unique_total["output_tokens"], + "total_tokens": api_total["total_tokens"] - unique_total["total_tokens"], + "cache_read_tokens": api_total["cache_read_tokens"] - unique_total["cache_read_tokens"], + "cache_write_tokens": api_total["cache_write_tokens"] - unique_total["cache_write_tokens"], + } + + metrics["tokens"] = { + "total": unique_total, # Sum of per-agent unique tokens + "per_agent": per_agent, + "repeated_context": repeated_context, # Context re-sent on subsequent turns + "api_total": api_total, # Raw total from all API calls + } + + # --- Timing --- + metrics["timing"] = { + "total_wall_clock_ms": result.execution_time, + "handoffs": list(self._handoffs), + } + + # --- Handoff patterns --- + sequence = [h["node_id"] for h in self._handoffs] + agent_stats: dict[str, dict[str, Any]] = {} + for h in self._handoffs: + nid = h["node_id"] + if nid not in agent_stats: + agent_stats[nid] = {"turn_count": 0, "total_duration_ms": 0} + agent_stats[nid]["turn_count"] += 1 + agent_stats[nid]["total_duration_ms"] += h["duration_ms"] + + for stats in agent_stats.values(): + if stats["turn_count"] > 0: + stats["avg_turn_duration_ms"] = stats["total_duration_ms"] // stats["turn_count"] + else: + stats["avg_turn_duration_ms"] = 0 + + metrics["handoff_patterns"] = { + "total_handoffs": len(self._handoffs), + "sequence": sequence, + "per_agent": agent_stats, + } + + # --- Artifacts --- + metrics["artifacts"] = _scan_artifacts(run_folder) + + # --- Errors --- + error_counts: dict[str, int] = {} + for e in self._errors: + error_counts[e["type"]] = error_counts.get(e["type"], 0) + 1 + + metrics["errors"] = { + "throttle_events": error_counts.get("throttle", 0), + "timeout_events": error_counts.get("timeout", 0), + "failed_tool_calls": error_counts.get("failed_tool", 0), + "model_error_events": error_counts.get("model_error", 0), + "service_unavailable_events": error_counts.get("service_unavailable", 0), + "validation_error_events": error_counts.get("validation_error", 0), + 
"details": list(self._errors), + } + + # --- Context size --- + all_tokens = [s["input_tokens"] for s in self._context_samples] + per_agent_tokens: dict[str, list[int]] = {} + for s in self._context_samples: + per_agent_tokens.setdefault(s["agent"], []).append(s["input_tokens"]) + + metrics["context_size"] = { + "total": self._compute_context_stats(all_tokens), + "per_agent": { + agent: self._compute_context_stats(tokens) + for agent, tokens in per_agent_tokens.items() + }, + "samples": list(self._context_samples), + } + + # --- Model params --- + metrics["model_params"] = { + "executor": { + "model_id": self._config.models.executor.model_id, + "provider": self._config.models.executor.provider, + }, + "simulator": { + "model_id": self._config.models.simulator.model_id, + "provider": self._config.models.simulator.provider, + }, + "aws_region": self._config.aws.region, + } + + return metrics + + def write(self, result: MultiAgentResult, run_folder: Path) -> Path: + """Build metrics and write run-metrics.yaml to the run folder. + + Returns the path to the written file. 
+ """ + metrics = self.build_metrics(result, run_folder) + out_path = run_folder / "run-metrics.yaml" + atomic_yaml_dump(metrics, out_path) + return out_path diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py new file mode 100644 index 00000000..ae82ed2d --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/post_run.py @@ -0,0 +1,393 @@ +"""Post-run test evaluation — detect project type, install deps, run tests.""" + +from __future__ import annotations + +import os +import re +import shlex +import shutil +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml +from shared.io import atomic_yaml_dump + +from aidlc_runner.config import RunnerConfig +from shared.sandbox import is_docker_available, sandbox_run + +_MAX_OUTPUT_CHARS = 10_000 +_MAX_SEARCH_DEPTH = 3 + +# Project markers in priority order. 
+_PROJECT_MARKERS: list[tuple[str, str, str, str]] = [ + # (marker_file, project_type, install_cmd, test_cmd) + ("pyproject.toml", "python", 'uv pip install -qq -e ".[dev]"', "uv run pytest --tb=short -q --no-header -o console_output_style=classic"), + ("package.json", "node", "npm install", "npm test"), + ("Cargo.toml", "rust", "cargo build", "cargo test"), + ("go.mod", "go", "go build ./...", "go test ./..."), + ("setup.py", "python-legacy", 'pip install -e ".[dev]"', "python -m pytest --tb=short -q --no-header -o console_output_style=classic"), +] + +_SKIP_DIRS = frozenset({ + ".venv", "venv", ".env", "env", + "node_modules", + "__pycache__", ".pytest_cache", ".ruff_cache", ".mypy_cache", + ".git", ".hg", ".svn", + "target", # Rust/Maven build output + "dist", "build", ".tox", ".nox", + ".cache", +}) + + +@dataclass +class ProjectInfo: + project_type: str + install_cmd: str + test_cmd: str + project_root: Path + + +def _check_markers(directory: Path) -> ProjectInfo | None: + """Check a single directory for project marker files.""" + for marker_file, project_type, install_cmd, test_cmd in _PROJECT_MARKERS: + if (directory / marker_file).exists(): + return ProjectInfo( + project_type=project_type, + install_cmd=install_cmd, + test_cmd=test_cmd, + project_root=directory, + ) + return None + + +def detect_project(workspace: Path) -> ProjectInfo | None: + """Detect the project type from marker files in workspace/. + + Performs a breadth-first search starting at workspace/ and descending up + to ``_MAX_SEARCH_DEPTH`` levels. Hidden directories (dot-prefixed) and + common vendor/cache directories are skipped to avoid false positives and + slow traversal through large dependency trees. + + Returns ProjectInfo or None if no recognisable project found. + """ + if not workspace.is_dir(): + return None + + result = _check_markers(workspace) + if result is not None: + return result + + # BFS through subdirectories up to _MAX_SEARCH_DEPTH levels deep. 
+ queue: list[tuple[Path, int]] = [(workspace, 0)] + while queue: + current, depth = queue.pop(0) + if depth >= _MAX_SEARCH_DEPTH: + continue + try: + children = sorted( + p for p in current.iterdir() + if p.is_dir() and not p.name.startswith(".") and p.name not in _SKIP_DIRS + ) + except OSError: + continue + for child in children: + result = _check_markers(child) + if result is not None: + return result + queue.append((child, depth + 1)) + + return None + + +def _truncate(text: str, limit: int = _MAX_OUTPUT_CHARS) -> str: + if len(text) <= limit: + return text + return text[:limit] + "\n... (output truncated)" + + +def _run_step( + command: str, + cwd: Path, + timeout: int, + use_sandbox: bool = False, + sandbox_image: str = "aidlc-sandbox:latest", + sandbox_memory: str = "2g", + sandbox_cpus: int = 2, +) -> dict[str, Any]: + """Run a subprocess step and return structured result. + + When *use_sandbox* is ``True`` and Docker is available the command + runs inside a container via :func:`sandbox_run`. Otherwise it falls + back to direct host execution (with a warning when the caller asked + for sandboxing but Docker is absent). 
+ """ + if use_sandbox and is_docker_available(): + result = sandbox_run( + command, + workspace=cwd, + image=sandbox_image, + timeout=timeout, + network=True, + memory=sandbox_memory, + cpus=sandbox_cpus, + ) + output = result.stdout + result.stderr + data: dict[str, Any] = { + "command": command, + "exit_code": result.exit_code, + "success": result.exit_code == 0, + "output": _truncate(output), + "sandboxed": True, + } + if result.timed_out: + data["timed_out"] = True + return data + + if use_sandbox: + print( + "[WARN] Docker not available — running on host without sandbox", + file=sys.stderr, + ) + + # Host execution — use shlex.split to avoid shell=True + env = { + k: v for k, v in os.environ.items() + if k not in ("VIRTUAL_ENV", "CONDA_PREFIX") + } + env["HOME"] = str(cwd) + + try: + # nosec B603 - Using shlex.split with shell=False, executing generated project tests + # nosemgrep: dangerous-subprocess-use-audit + result_proc = subprocess.run( + shlex.split(command), + shell=False, + cwd=str(cwd), + capture_output=True, + text=True, + timeout=timeout, + env=env, + ) + output = result_proc.stdout + result_proc.stderr + return { + "command": command, + "exit_code": result_proc.returncode, + "success": result_proc.returncode == 0, + "output": _truncate(output), + "sandboxed": False, + } + except subprocess.TimeoutExpired as e: + partial = "" + if e.stdout: + partial += e.stdout if isinstance(e.stdout, str) else e.stdout.decode("utf-8", errors="replace") + if e.stderr: + partial += e.stderr if isinstance(e.stderr, str) else e.stderr.decode("utf-8", errors="replace") + return { + "command": command, + "exit_code": None, + "success": False, + "output": _truncate(partial), + "timed_out": True, + "sandboxed": False, + } + except OSError as e: + return { + "command": command, + "exit_code": None, + "success": False, + "output": str(e), + "sandboxed": False, + } + + +# --------------------------------------------------------------------------- +# Test output parsers +# 
--------------------------------------------------------------------------- + +def _parse_pytest(output: str) -> dict[str, int | None]: + """Parse pytest summary line like '5 passed, 2 failed, 1 error in 3.2s'.""" + results: dict[str, int | None] = {"passed": None, "failed": None, "errors": None, "skipped": None} + # Match the final summary line + m = re.search(r"=+\s*([\d\w\s,]+)\s+in\s+[\d.]+", output) + if not m: + # Try shorter form: "5 passed" + m = re.search(r"(\d+\s+passed(?:,\s*\d+\s+\w+)*)", output) + if m: + summary = m.group(1) if m else "" + for key in ("passed", "failed", "error", "skipped", "warning", "deselected"): + count_match = re.search(rf"(\d+)\s+{key}", summary) + if count_match: + mapped_key = "errors" if key == "error" else key + if mapped_key in results: + results[mapped_key] = int(count_match.group(1)) + return results + + +def _parse_jest(output: str) -> dict[str, int | None]: + """Parse Jest/Vitest summary.""" + results: dict[str, int | None] = {"passed": None, "failed": None, "errors": None, "skipped": None} + # Jest: "Tests: 2 failed, 5 passed, 7 total" + m = re.search(r"Tests:\s+(.+total)", output) + if m: + summary = m.group(1) + for key, mapped in [("passed", "passed"), ("failed", "failed"), ("skipped", "skipped")]: + count_match = re.search(rf"(\d+)\s+{key}", summary) + if count_match: + results[mapped] = int(count_match.group(1)) + return results + # Vitest: "Tests 5 passed | 2 failed (7)" + m = re.search(r"Tests\s+(.+\))", output) + if m: + summary = m.group(1) + for key, mapped in [("passed", "passed"), ("failed", "failed")]: + count_match = re.search(rf"(\d+)\s+{key}", summary) + if count_match: + results[mapped] = int(count_match.group(1)) + return results + + +def _parse_cargo(output: str) -> dict[str, int | None]: + """Parse cargo test summary like 'test result: ok. 
5 passed; 0 failed; 0 ignored'.""" + results: dict[str, int | None] = {"passed": None, "failed": None, "errors": None, "skipped": None} + m = re.search(r"test result:.*?(\d+)\s+passed;\s*(\d+)\s+failed;\s*(\d+)\s+ignored", output) + if m: + results["passed"] = int(m.group(1)) + results["failed"] = int(m.group(2)) + results["skipped"] = int(m.group(3)) + return results + + +def _parse_go(output: str) -> dict[str, int | None]: + """Parse go test output by counting --- PASS and --- FAIL lines.""" + results: dict[str, int | None] = {"passed": None, "failed": None, "errors": None, "skipped": None} + passed = len(re.findall(r"--- PASS:", output)) + failed = len(re.findall(r"--- FAIL:", output)) + skipped = len(re.findall(r"--- SKIP:", output)) + if passed or failed or skipped: + results["passed"] = passed + results["failed"] = failed + results["skipped"] = skipped + return results + + +_PARSERS = { + "python": _parse_pytest, + "python-legacy": _parse_pytest, + "node": _parse_jest, + "rust": _parse_cargo, + "go": _parse_go, +} + + +def parse_test_output(project_type: str, output: str) -> dict[str, int | None]: + """Parse test output for the given project type. + + Returns a dict with keys: passed, failed, errors, skipped. + Values are None if parsing fails for that field. 
+ """ + parser = _PARSERS.get(project_type) + if parser is None: + return {"passed": None, "failed": None, "errors": None, "skipped": None, "total": None} + results = parser(output) + # Compute total if we have any parsed values + counts = [v for v in results.values() if v is not None] + results["total"] = sum(counts) if counts else None + return results + + +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- + +def run_post_evaluation( + run_folder: Path, + config: RunnerConfig, + use_sandbox: bool | None = None, +) -> Path | None: + """Run post-run test evaluation on the generated workspace. + + Detects project type, installs dependencies, runs tests, parses results, + and writes test-results.yaml. + + When *use_sandbox* is ``None`` the setting is read from + ``config.execution.sandbox.enabled``. + + Returns the path to test-results.yaml, or None if no project was detected. + """ + workspace = run_folder / "workspace" + out_path = run_folder / "test-results.yaml" + timeout = config.execution.post_run_timeout + + sandbox_cfg = config.execution.sandbox + if use_sandbox is None: + use_sandbox = sandbox_cfg.enabled + + if not workspace.exists(): + _write_results(out_path, {"status": "skipped", "reason": "no workspace directory"}) + return out_path + + project = detect_project(workspace) + if project is None: + _write_results(out_path, {"status": "skipped", "reason": "no recognised project markers"}) + return out_path + + project_root = project.project_root + + # Remove any host-created .venv before sandbox steps. + # A host venv has symlinks to the host Python interpreter which are + # broken inside the container. 
+ if use_sandbox: + stale_venv = project_root / ".venv" + if stale_venv.is_dir(): + shutil.rmtree(stale_venv) + + data: dict[str, Any] = { + "status": "completed", + "project_type": project.project_type, + "project_root": str(project_root.relative_to(run_folder)), + } + + # Install dependencies + # In sandbox mode for Python projects, use `uv sync` which + # auto-creates a fresh .venv and installs from the lockfile. + install_cmd = project.install_cmd + if use_sandbox and project.project_type in ("python", "python-legacy"): + install_cmd = "uv sync --all-extras" + install_result = _run_step( + install_cmd, project_root, timeout, + use_sandbox=use_sandbox, + sandbox_image=sandbox_cfg.image, + sandbox_memory=sandbox_cfg.memory, + sandbox_cpus=sandbox_cfg.cpus, + ) + data["install"] = install_result + if install_result.get("timed_out"): + data["status"] = "install_timeout" + elif not install_result["success"]: + data["status"] = "install_failed" + + # Run tests (even if install failed — may still produce useful output) + test_result = _run_step( + project.test_cmd, project_root, timeout, + use_sandbox=use_sandbox, + sandbox_image=sandbox_cfg.image, + sandbox_memory=sandbox_cfg.memory, + sandbox_cpus=sandbox_cfg.cpus, + ) + data["test"] = test_result + if test_result.get("timed_out"): + data["status"] = "test_timeout" + + # Parse test output + parsed = parse_test_output(project.project_type, test_result.get("output", "")) + data["test"]["parsed_results"] = parsed + + _write_results(out_path, data) + return out_path + + +def _write_results(path: Path, data: dict[str, Any]) -> None: + atomic_yaml_dump(data, path) diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/progress.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/progress.py new file mode 100644 index 00000000..6fa8feba --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/progress.py @@ -0,0 +1,126 @@ +"""Progress reporting for AIDLC Runner — 
callback handlers and swarm hooks.""" + +from __future__ import annotations + +import sys +import time +from datetime import datetime, timezone +from typing import TYPE_CHECKING, Any + +from strands.hooks.events import AfterNodeCallEvent, BeforeNodeCallEvent +from strands.hooks.registry import HookRegistry + +if TYPE_CHECKING: + from aidlc_runner.metrics import MetricsCollector + +# Error event keys in the Bedrock streaming response that we track. +_ERROR_EVENT_KEYS = { + "throttlingException": "throttle", + "modelStreamErrorException": "model_error", + "internalServerException": "model_error", + "serviceUnavailableException": "service_unavailable", + "validationException": "validation_error", +} + + +class AgentProgressHandler: + """Callback handler that prints concise tool-use progress to stderr. + + Shows which tools each agent is invoking (e.g. load_rule, write_file) + without streaming the full LLM text output. + + Optionally records error/retry events to a MetricsCollector. + """ + + def __init__( + self, + agent_name: str, + collector: MetricsCollector | None = None, + ) -> None: + self.agent_name = agent_name + self.tool_count = 0 + self._collector = collector + + def __call__(self, **kwargs: Any) -> None: + event = kwargs.get("event") + if not event: + return + + # Tool start events + if "contentBlockStart" in event: + start = event["contentBlockStart"].get("start", {}) + if "toolUse" in start: + tool_name = start["toolUse"].get("name", "unknown") + self.tool_count += 1 + _print_status(f" [{self.agent_name}] tool #{self.tool_count}: {tool_name}") + + # Tool input — show key details for important tools + if "contentBlockDelta" in event: + delta = event["contentBlockDelta"].get("delta", {}) + if "toolUse" in delta: + # We could parse partial JSON here for file paths, but + # the tool result event is more reliable. Keep it simple. 
+ pass + + # Metadata event — capture per-invocation context size (input tokens) + if "metadata" in event and self._collector is not None: + usage = event["metadata"].get("usage", {}) + input_tokens = usage.get("inputTokens", 0) + if input_tokens > 0: + self._collector.record_context_sample(self.agent_name, input_tokens) + + # Error/retry event detection + if self._collector is not None: + for event_key, error_type in _ERROR_EVENT_KEYS.items(): + if event_key in event: + detail = event[event_key] + message = detail.get("message", "") if isinstance(detail, dict) else str(detail) + self._collector.record_error( + error_type, + f"[{self.agent_name}] {event_key}: {message}", + ) + + +class SwarmProgressHook: + """Hook provider that prints node start/stop and timing to stderr. + + Optionally records per-handoff durations to a MetricsCollector. + """ + + def __init__(self, collector: MetricsCollector | None = None) -> None: + self._node_start_times: dict[str, float] = {} + self._handoff_count = 0 + self._collector = collector + + def register_hooks(self, registry: HookRegistry, **kwargs: Any) -> None: + registry.add_callback(BeforeNodeCallEvent, self._on_before_node) + registry.add_callback(AfterNodeCallEvent, self._on_after_node) + + def _on_before_node(self, event: BeforeNodeCallEvent) -> None: + self._handoff_count += 1 + self._node_start_times[event.node_id] = time.monotonic() + ts = datetime.now(timezone.utc).strftime("%H:%M:%S") + _print_status( + f"\n[{ts}] === Handoff #{self._handoff_count}: {event.node_id} starting ===" + ) + + def _on_after_node(self, event: AfterNodeCallEvent) -> None: + duration_ms = 0 + start = self._node_start_times.pop(event.node_id, None) + if start is not None: + duration_ms = int((time.monotonic() - start) * 1000) + mins, secs_rem = divmod(duration_ms // 1000, 60) + elapsed = f" ({mins}m {secs_rem}s)" + else: + elapsed = "" + + ts = datetime.now(timezone.utc).strftime("%H:%M:%S") + _print_status(f"[{ts}] === {event.node_id} 
finished{elapsed} ===") + + if self._collector is not None: + self._collector.record_handoff(self._handoff_count, event.node_id, duration_ms) + + +def _print_status(msg: str) -> None: + """Print a status message to stderr so it doesn't mix with agent output.""" + print(msg, file=sys.stderr, flush=True) diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py new file mode 100644 index 00000000..6b5f8829 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/runner.py @@ -0,0 +1,300 @@ +"""Main runner — orchestrates run folder creation, rules setup, and swarm execution.""" + +from __future__ import annotations + +import os +import re +import shutil +import stat +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path +from urllib.parse import urlparse + +import yaml +from shared.io import atomic_yaml_dump +from strands.multiagent import Swarm + +from aidlc_runner.agents.executor import create_executor +from aidlc_runner.agents.simulator import create_simulator +from aidlc_runner.config import AidlcConfig, RunnerConfig +from aidlc_runner.metrics import MetricsCollector +from aidlc_runner.post_run import run_post_evaluation +from aidlc_runner.progress import AgentProgressHandler, SwarmProgressHook + +_SLUG_MAX_LEN = 80 + + +def _rules_slug(aidlc: AidlcConfig) -> str: + """Derive a filesystem-safe slug from the AIDLC rules configuration.""" + if aidlc.rules_source == "local" and aidlc.rules_local_path: + raw = f"local_{Path(aidlc.rules_local_path).name}" + else: + repo_url = aidlc.rules_repo or "" + path = urlparse(repo_url).path.rstrip("/") + repo_name = Path(path).stem # strips .git suffix + raw = f"{repo_name}_{aidlc.rules_ref}" + + slug = raw.replace(" ", "-") + slug = re.sub(r"[^a-zA-Z0-9._-]", "", slug) + return slug[:_SLUG_MAX_LEN] + + +_SENTINEL_NAME = ".last_run_folder" + + +def 
def create_run_folder(output_dir: str | Path, config: RunnerConfig) -> Path:
    """Create a timestamped run folder named after the rules source.

    Format: {ISO8601_compact}-{rules_slug}
    Example: 20260224T214917-aidlc-workflows_v0.1.0

    Also writes a sentinel file (``{output_dir}/.last_run_folder``) containing
    the absolute path of the new run folder so that parent orchestrators can
    discover the folder without racy before/after directory listing.

    Returns:
        Path to the created run folder.

    Raises:
        FileExistsError: if a folder with the same timestamp+slug already exists.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    slug = _rules_slug(config.aidlc)
    folder_name = f"{timestamp}-{slug}"
    run_folder = output_dir / folder_name

    # No exist_ok: two runs colliding on the same second + slug should fail
    # loudly rather than silently share a folder.
    run_folder.mkdir()
    (run_folder / "aidlc-docs" / "inception").mkdir(parents=True)
    (run_folder / "aidlc-docs" / "construction").mkdir(parents=True)
    (run_folder / "workspace").mkdir()

    # Write the sentinel atomically: write a temp file in the same directory,
    # then os.replace() it into place. A plain write_text() on the final path
    # could be observed half-written by a polling orchestrator.
    sentinel = output_dir / _SENTINEL_NAME
    tmp_sentinel = output_dir / (_SENTINEL_NAME + ".tmp")
    tmp_sentinel.write_text(str(run_folder.resolve()), encoding="utf-8")
    os.replace(tmp_sentinel, sentinel)

    return run_folder


def setup_rules(run_folder: Path, config: RunnerConfig) -> Path:
    """Set up AIDLC rules in the run folder.

    Either copies from a configured local path or shallow-clones the git
    repo, then hoists the aidlc-rules content into {run_folder}/aidlc-rules.

    Raises:
        FileNotFoundError: if the configured local rules path does not exist.
        RuntimeError: if the git clone fails or times out.

    Returns:
        Path to the aidlc-rules directory within the run folder.
    """
    rules_dest = run_folder / "aidlc-rules"

    if config.aidlc.rules_source == "local" and config.aidlc.rules_local_path:
        local_path = Path(config.aidlc.rules_local_path)
        if not local_path.exists():
            raise FileNotFoundError(f"Local rules path not found: {local_path}")
        shutil.copytree(local_path / "aidlc-rules", rules_dest)
    else:
        # Git clone (shallow, single branch/tag).
        try:
            # nosec B603, B607 - Git clone of trusted AIDLC rules repository with validated config
            result = subprocess.run(
                ["git", "clone", "--branch", config.aidlc.rules_ref, "--depth", "1",
                 config.aidlc.rules_repo, str(rules_dest / "_repo")],
                capture_output=True,
                text=True,
                check=False,
                timeout=120,
            )
        except subprocess.TimeoutExpired as e:
            # Chain the original exception (PEP 3134 / flake8 B904) so the
            # timeout traceback is not lost.
            raise RuntimeError(
                "Timed out cloning AIDLC rules repo after 120s. "
                "Check network connectivity and repo URL."
            ) from e
        if result.returncode != 0:
            raise RuntimeError(f"Failed to clone AIDLC rules repo:\n{result.stderr}")

        # Hoist the aidlc-rules content one level up out of the clone.
        repo_rules = rules_dest / "_repo" / "aidlc-rules"
        if repo_rules.exists():
            for item in repo_rules.iterdir():
                shutil.move(str(item), str(rules_dest / item.name))

        # Remove the full clone. Git pack files are read-only on Windows, so
        # clear the read-only bit and retry the failed delete.
        def _force_remove_readonly(func, path, _exc_info):
            os.chmod(path, stat.S_IWRITE)
            func(path)

        # onexc was added in Python 3.12; fall back to onerror on older versions
        if sys.version_info >= (3, 12):
            shutil.rmtree(rules_dest / "_repo", onexc=_force_remove_readonly)
        else:
            shutil.rmtree(rules_dest / "_repo", onerror=_force_remove_readonly)

    return rules_dest
tech_env_rel = str(tech_env_path.resolve().relative_to(Path.cwd())) if tech_env_path else None + except ValueError: + tech_env_rel = str(tech_env_path) if tech_env_path else None + + meta = { + "run_folder": str(run_folder), + "started_at": datetime.now(timezone.utc).isoformat(), + "vision_file": vision_rel, + "tech_env_file": tech_env_rel, + "config": { + "aws_profile": config.aws.profile, + "aws_region": config.aws.region, + "executor_model": config.models.executor.model_id, + "simulator_model": config.models.simulator.model_id, + "rules_source": config.aidlc.rules_source, + "rules_ref": config.aidlc.rules_ref, + "rules_repo": config.aidlc.rules_repo, + "execution_enabled": config.execution.enabled, + "command_timeout": config.execution.command_timeout, + "post_run_tests": config.execution.post_run_tests, + "post_run_timeout": config.execution.post_run_timeout, + "swarm_max_handoffs": config.swarm.max_handoffs, + "swarm_max_iterations": config.swarm.max_iterations, + "swarm_execution_timeout": config.swarm.execution_timeout, + "swarm_node_timeout": config.swarm.node_timeout, + }, + } + atomic_yaml_dump(meta, run_folder / "run-meta.yaml") + + +def run(config: RunnerConfig, vision_path: Path, tech_env_path: Path | None = None) -> None: + """Execute a full AIDLC workflow run. + + Args: + config: Fully resolved runner configuration. + vision_path: Path to the vision/constraints markdown file. + tech_env_path: Optional path to the technical environment markdown file. + """ + # 1. Create run folder + run_folder = create_run_folder(config.runs.output_dir, config) + print(f"Run folder: {run_folder}") + + # 2. Copy vision file + vision_content = vision_path.read_text(encoding="utf-8") + (run_folder / "vision.md").write_text(vision_content, encoding="utf-8") + + # 2b. 
Copy tech-env file if provided + tech_env_content: str | None = None + if tech_env_path is not None: + tech_env_content = tech_env_path.read_text(encoding="utf-8") + (run_folder / "tech-env.md").write_text(tech_env_content, encoding="utf-8") + + # 3. Set up AIDLC rules + print("Setting up AIDLC rules...") + rules_dir = setup_rules(run_folder, config) + print(f"Rules ready: {rules_dir}") + + # 4. Write run metadata + write_run_meta(run_folder, config, vision_path, tech_env_path=tech_env_path) + + # 5. Create metrics collector and agents with progress handlers + print("Creating agents...") + collector = MetricsCollector(config) + executor_handler = AgentProgressHandler("executor", collector=collector) + simulator_handler = AgentProgressHandler("simulator", collector=collector) + + executor = create_executor( + run_folder=run_folder, + rules_dir=rules_dir, + model_config=config.models.executor, + aws_profile=config.aws.profile, + aws_region=config.aws.region, + callback_handler=executor_handler, + execution_config=config.execution, + ) + simulator = create_simulator( + run_folder=run_folder, + vision_content=vision_content, + model_config=config.models.simulator, + aws_profile=config.aws.profile, + aws_region=config.aws.region, + callback_handler=simulator_handler, + tech_env_content=tech_env_content, + ) + + # 6. Create and run the Swarm + print("Starting AIDLC workflow swarm...") + initial_prompt = ( + "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. " + "The project vision is available at vision.md in the run folder. " + ) + if tech_env_content is not None: + initial_prompt += ( + "The technical environment document is available at tech-env.md in the run folder. " + "It defines the required languages, frameworks, cloud services, security controls, " + "testing standards, and prohibited technologies. Follow it as a binding reference " + "during all Construction stages. 
" + ) + initial_prompt += ( + "Start by loading the core workflow rules and the process overview, then " + "execute every stage of the Inception phase followed by every stage of the " + "Construction phase. The workspace directory is 'workspace/' (currently empty — " + "this is a greenfield project). You MUST generate all application code in " + "workspace/ before the workflow is complete. Do NOT stop after requirements — " + "continue through application design, code generation, and build-and-test." + ) + + swarm = Swarm( + [executor, simulator], + entry_point=executor, + max_handoffs=config.swarm.max_handoffs, + max_iterations=config.swarm.max_iterations, + execution_timeout=config.swarm.execution_timeout, + node_timeout=config.swarm.node_timeout, + repetitive_handoff_detection_window=5, + repetitive_handoff_min_unique_agents=2, + ) + + # Register progress hook for node-level events + progress_hook = SwarmProgressHook(collector=collector) + swarm.hooks.add_hook(progress_hook) + + result = swarm(initial_prompt) + + # 7. Log results + print(f"\nSwarm completed with status: {result.status}") + print(f"Execution time: {result.execution_time}ms") + print(f"Total handoffs: {len(result.node_history)}") + + # 8. Write run metrics + metrics_path = collector.write(result, run_folder) + print(f"Metrics written to: {metrics_path}") + + # 9. 
Post-run test evaluation + if config.execution.post_run_tests: + print("Running post-run test evaluation...") + test_results_path = run_post_evaluation(run_folder, config) + if test_results_path: + print(f"Test results written to: {test_results_path}") + else: + print("No testable project detected in workspace/ — skipped.") + else: + print("Post-run test evaluation disabled.") + + # Update run-meta with completion info + meta_path = run_folder / "run-meta.yaml" + with open(meta_path, encoding="utf-8") as f: + meta = yaml.safe_load(f) + meta["completed_at"] = datetime.now(timezone.utc).isoformat() + meta["status"] = str(result.status) + meta["execution_time_ms"] = result.execution_time + meta["total_handoffs"] = len(result.node_history) + meta["node_history"] = [node.node_id for node in result.node_history] + atomic_yaml_dump(meta, meta_path) + + print(f"\nRun complete. Artifacts saved to: {run_folder}") diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/__init__.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/file_ops.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/file_ops.py new file mode 100644 index 00000000..4e6160c4 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/file_ops.py @@ -0,0 +1,92 @@ +"""File operation tools scoped to the run folder. + +These tools are created via factory functions that bind a specific run_folder, +ensuring all file access stays within the run boundary. 
+""" + +from __future__ import annotations + +from pathlib import Path + +from strands import tool + + +def _resolve_safe(run_folder: Path, relative_path: str) -> Path: + """Resolve a relative path within the run folder, preventing traversal.""" + resolved = (run_folder / relative_path).resolve() + run_resolved = run_folder.resolve() + if not str(resolved).startswith(str(run_resolved)): + raise ValueError(f"Path traversal denied: {relative_path}") + return resolved + + +def make_file_tools(run_folder: Path) -> list: + """Create file operation tools bound to a specific run folder. + + Args: + run_folder: Absolute path to the run folder. + + Returns: + List of tool-decorated functions: [read_file, write_file, list_files]. + """ + run_folder = run_folder.resolve() + + @tool + def read_file(path: str) -> str: + """Read the contents of a file in the run folder. + + Args: + path: File path relative to the run folder (e.g. 'aidlc-docs/aidlc-state.md'). + """ + try: + target = _resolve_safe(run_folder, path) + if not target.exists(): + return f"Error: File not found: {path}" + if not target.is_file(): + return f"Error: Not a file: {path}" + return target.read_text(encoding="utf-8") + except ValueError as e: + return f"Error: {e}" + + @tool + def write_file(path: str, content: str) -> str: + """Write content to a file in the run folder. Creates parent directories if needed. + + Args: + path: Relative to run folder (e.g. 'aidlc-docs/inception/requirements.md'). + content: The text content to write to the file. + """ + try: + target = _resolve_safe(run_folder, path) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + return f"Written: {path} ({len(content)} chars)" + except ValueError as e: + return f"Error: {e}" + + @tool + def list_files(directory: str = ".") -> str: + """List files and directories within a path in the run folder. + + Args: + directory: Directory path relative to the run folder. 
Defaults to the run folder root. + """ + try: + target = _resolve_safe(run_folder, directory) + if not target.exists(): + return f"Error: Directory not found: {directory}" + if not target.is_dir(): + return f"Error: Not a directory: {directory}" + entries = sorted(target.iterdir()) + lines = [] + for entry in entries: + rel = entry.relative_to(run_folder) + suffix = "/" if entry.is_dir() else "" + lines.append(f" {rel}{suffix}") + if not lines: + return f"(empty directory: {directory})" + return "\n".join(lines) + except ValueError as e: + return f"Error: {e}" + + return [read_file, write_file, list_files] diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/rule_loader.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/rule_loader.py new file mode 100644 index 00000000..e3b9eed2 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/rule_loader.py @@ -0,0 +1,77 @@ +"""AIDLC rule loading tool. + +Provides a tool for agents to dynamically read AIDLC rule files on demand, +keeping context window usage low by only loading rules as the workflow needs them. +""" + +from __future__ import annotations + +from pathlib import Path + +from strands import tool + + +def make_rule_loader(rules_dir: Path) -> object: + """Create a rule loader tool bound to a specific rules directory. + + Args: + rules_dir: Path to the cloned/copied aidlc-rules directory + (the folder containing aws-aidlc-rules/ and aws-aidlc-rule-details/). + + Returns: + A tool-decorated function: load_rule. + """ + rules_dir = rules_dir.resolve() + + @tool + def load_rule(rule_path: str) -> str: + """Load an AIDLC rule file by path. + + Use this to read AIDLC workflow rules as you progress through stages. + + Args: + rule_path: Path relative to the rules directory. 
Examples: + - 'core-workflow' (shorthand for aws-aidlc-rules/core-workflow.md) + - 'common/process-overview.md' (loads from aws-aidlc-rule-details/) + - 'inception/requirements-analysis.md' (loads from aws-aidlc-rule-details/) + - 'construction/code-generation.md' (loads from aws-aidlc-rule-details/) + """ + # Handle the core-workflow shorthand + if rule_path in ("core-workflow", "core-workflow.md"): + target = rules_dir / "aws-aidlc-rules" / "core-workflow.md" + else: + # Default: look in aws-aidlc-rule-details/ + target = rules_dir / "aws-aidlc-rule-details" / rule_path + if not target.suffix: + target = target.with_suffix(".md") + + resolved = target.resolve() + # Safety: stay within rules_dir + if not str(resolved).startswith(str(rules_dir)): + return f"Error: Path traversal denied: {rule_path}" + + if not resolved.exists(): + # List available rules to help the agent + available = _list_available_rules(rules_dir) + return f"Error: Rule file not found: {rule_path}\n\nAvailable rules:\n{available}" + + return resolved.read_text(encoding="utf-8") + + return load_rule + + +def _list_available_rules(rules_dir: Path) -> str: + """List all available rule files for error messages.""" + lines = [] + + core = rules_dir / "aws-aidlc-rules" / "core-workflow.md" + if core.exists(): + lines.append(" core-workflow (shorthand)") + + details_dir = rules_dir / "aws-aidlc-rule-details" + if details_dir.exists(): + for md_file in sorted(details_dir.rglob("*.md")): + rel = md_file.relative_to(details_dir) + lines.append(f" {rel}") + + return "\n".join(lines) if lines else " (no rules found)" diff --git a/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/run_command.py b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/run_command.py new file mode 100644 index 00000000..51f6c379 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/src/aidlc_runner/tools/run_command.py @@ -0,0 +1,117 @@ +"""Shell command execution tool scoped to the run 
folder. + +Created via a factory function that binds a specific run_folder and timeout, +ensuring all command execution stays within the run boundary. + +Security: All command output is scrubbed for credentials before being returned +to prevent accidental exposure of AWS keys, tokens, or other secrets. +""" + +from __future__ import annotations + +import os +import shlex +import subprocess +from pathlib import Path + +from strands import tool + +from shared.credential_scrubber import scrub_credentials + +_MAX_OUTPUT_CHARS = 50_000 + + +def _resolve_safe(run_folder: Path, relative_path: str) -> Path: + """Resolve a relative path within the run folder, preventing traversal.""" + resolved = (run_folder / relative_path).resolve() + run_resolved = run_folder.resolve() + if not str(resolved).startswith(str(run_resolved)): + raise ValueError(f"Path traversal denied: {relative_path}") + return resolved + + +def make_run_command(run_folder: Path, timeout: int = 120) -> object: + """Create a run_command tool bound to a specific run folder. + + Args: + run_folder: Absolute path to the run folder. + timeout: Default per-command timeout in seconds. + + Returns: + A tool-decorated function for executing shell commands. + """ + run_folder = run_folder.resolve() + + @tool + def run_command(command: str, working_directory: str = "workspace") -> str: + """Execute a shell command in the run folder. + + Use this during Build and Test to install dependencies, run tests, and + fix issues. The command runs in a shell with the working directory set + to the specified path (default: workspace/). + + Args: + command: The shell command to execute. + working_directory: Directory relative to the run folder to run in (default: workspace/). 
+ """ + if not command or not command.strip(): + return "[error: empty command]" + + try: + cwd = _resolve_safe(run_folder, working_directory) + except ValueError as e: + return f"[error: {e}]" + + if not cwd.exists(): + return f"[error: working directory not found: {working_directory}]" + if not cwd.is_dir(): + return f"[error: not a directory: {working_directory}]" + + # Build a restricted environment: preserve PATH for tool access, + # set HOME to run_folder to avoid reading host user config. + env = { + "PATH": os.environ.get("PATH", "/usr/bin:/bin"), + "HOME": str(run_folder), + "LANG": os.environ.get("LANG", "C.UTF-8"), + "TERM": "dumb", + } + # Propagate common tool env vars if present (needed for uv, npm, etc.) + for var in ("UV_CACHE_DIR", "UV_PYTHON", "NODE_PATH", "NPM_CONFIG_CACHE", + "VIRTUAL_ENV", "PYTHONPATH"): + val = os.environ.get(var) + if val is not None: + env[var] = val + + try: + # nosec B603 - Using shlex.split with shell=False and path validated via _resolve_safe + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( + shlex.split(command), + shell=False, + cwd=str(cwd), + capture_output=True, + text=True, + timeout=timeout, + env=env, + ) + output = result.stdout + result.stderr + # Scrub credentials before truncation to ensure redaction markers are visible + output = scrub_credentials(output) + if len(output) > _MAX_OUTPUT_CHARS: + output = output[:_MAX_OUTPUT_CHARS] + "\n... 
(output truncated)" + return f"[exit code: {result.returncode}]\n{output}" + + except subprocess.TimeoutExpired as e: + partial = "" + if e.stdout: + partial += e.stdout if isinstance(e.stdout, str) else e.stdout.decode("utf-8", errors="replace") + if e.stderr: + partial += e.stderr if isinstance(e.stderr, str) else e.stderr.decode("utf-8", errors="replace") + # Scrub credentials from partial output + partial = scrub_credentials(partial) + return f"[error: command timed out after {timeout}s]\n{partial}" + + except OSError as e: + return f"[error: {e}]" + + return run_command diff --git a/scripts/aidlc-evaluator/packages/execution/tests/test_config.py b/scripts/aidlc-evaluator/packages/execution/tests/test_config.py new file mode 100644 index 00000000..8159b844 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/tests/test_config.py @@ -0,0 +1,101 @@ +"""Tests for configuration loading.""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +from aidlc_runner.config import RunnerConfig, load_config + + +class TestRunnerConfigDefaults: + def test_default_aws_profile(self): + config = RunnerConfig() + assert config.aws.profile is None + + def test_default_aws_region(self): + config = RunnerConfig() + assert config.aws.region is None + + def test_default_executor_model(self): + config = RunnerConfig() + assert config.models.executor.provider == "bedrock" + assert "opus" in config.models.executor.model_id + + def test_default_simulator_model(self): + config = RunnerConfig() + assert config.models.simulator.provider == "bedrock" + assert "opus" in config.models.simulator.model_id + + def test_default_swarm_settings(self): + config = RunnerConfig() + assert config.swarm.max_handoffs == 200 + assert config.swarm.max_iterations == 200 + assert config.swarm.execution_timeout == 14400.0 + assert config.swarm.node_timeout == 3600.0 + + +class TestLoadConfig: + def test_load_without_file_returns_defaults(self): + config = 
load_config() + assert config.aws.profile is None + assert config.runs.output_dir == "./runs" + + def test_load_from_yaml(self, tmp_path: Path): + yaml_content = textwrap.dedent("""\ + aws: + profile: "custom-profile" + region: "eu-west-1" + runs: + output_dir: "/tmp/custom-runs" + """) + config_file = tmp_path / "test-config.yaml" + config_file.write_text(yaml_content) + + config = load_config(config_path=config_file) + assert config.aws.profile == "custom-profile" + assert config.aws.region == "eu-west-1" + assert config.runs.output_dir == "/tmp/custom-runs" # nosec B108 - Test assertion, not creating temp files + # Unchanged defaults + assert config.swarm.max_handoffs == 200 + + def test_cli_overrides_applied(self): + overrides = { + "aws": {"profile": "override-profile"}, + "models": {"executor": {"model_id": "some-other-model"}}, + } + config = load_config(cli_overrides=overrides) + assert config.aws.profile == "override-profile" + assert config.models.executor.model_id == "some-other-model" + # Unaffected fields + assert config.aws.region is None + assert config.models.simulator.model_id != "some-other-model" + + def test_cli_overrides_on_top_of_yaml(self, tmp_path: Path): + yaml_content = textwrap.dedent("""\ + aws: + profile: "yaml-profile" + region: "ap-southeast-1" + """) + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml_content) + + overrides = {"aws": {"profile": "cli-wins"}} + config = load_config(config_path=config_file, cli_overrides=overrides) + assert config.aws.profile == "cli-wins" + assert config.aws.region == "ap-southeast-1" + + def test_nonexistent_config_file_returns_defaults(self): + config = load_config(config_path="/nonexistent/path.yaml") + assert config.aws.profile is None + + def test_rules_source_override(self): + overrides = { + "aidlc": { + "rules_source": "local", + "rules_local_path": "/some/path", + }, + } + config = load_config(cli_overrides=overrides) + assert config.aidlc.rules_source == "local" + assert 
config.aidlc.rules_local_path == "/some/path" diff --git a/scripts/aidlc-evaluator/packages/execution/tests/test_metrics.py b/scripts/aidlc-evaluator/packages/execution/tests/test_metrics.py new file mode 100644 index 00000000..de19f08a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/tests/test_metrics.py @@ -0,0 +1,582 @@ +"""Tests for metrics collection, artifact scanning, and YAML output.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +import yaml + +from aidlc_runner.config import RunnerConfig +from aidlc_runner.metrics import MetricsCollector, _scan_artifacts +from aidlc_runner.progress import AgentProgressHandler, SwarmProgressHook + + +# --------------------------------------------------------------------------- +# Helpers — lightweight fakes for Strands result types +# --------------------------------------------------------------------------- + + +@dataclass +class FakeNodeResult: + accumulated_usage: dict[str, int] = field(default_factory=lambda: { + "inputTokens": 0, "outputTokens": 0, "totalTokens": 0, + }) + + +@dataclass +class FakeMultiAgentResult: + accumulated_usage: dict[str, int] = field(default_factory=lambda: { + "inputTokens": 0, "outputTokens": 0, "totalTokens": 0, + }) + results: dict[str, FakeNodeResult] = field(default_factory=dict) + execution_time: int = 0 + + +# --------------------------------------------------------------------------- +# Artifact scanning +# --------------------------------------------------------------------------- + + +class TestScanArtifacts: + def test_empty_workspace(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + (tmp_path / "aidlc-docs" / "inception").mkdir(parents=True) + (tmp_path / "aidlc-docs" / "construction").mkdir(parents=True) + + result = _scan_artifacts(tmp_path) + ws = result["workspace"] + assert ws["source_files"] == 0 + assert ws["test_files"] == 0 + assert ws["config_files"] == 0 + 
# ---------------------------------------------------------------------------
# Artifact scanning
# ---------------------------------------------------------------------------


class TestScanArtifacts:
    """_scan_artifacts(): file categorisation and line-of-code counting."""

    def test_empty_workspace(self, tmp_path: Path):
        (tmp_path / "workspace").mkdir()
        (tmp_path / "aidlc-docs" / "inception").mkdir(parents=True)
        (tmp_path / "aidlc-docs" / "construction").mkdir(parents=True)

        scan = _scan_artifacts(tmp_path)
        ws_stats = scan["workspace"]
        # Every workspace counter starts at zero for an empty tree.
        for key in ("source_files", "test_files", "config_files",
                    "other_files", "total_files", "total_lines_of_code"):
            assert ws_stats[key] == 0

        assert scan["aidlc_docs"]["total_files"] == 0

    def test_source_files_counted(self, tmp_path: Path):
        src_dir = tmp_path / "workspace" / "src"
        src_dir.mkdir(parents=True)
        (src_dir / "main.py").write_text("print('hello')\nprint('world')\n")
        (src_dir / "utils.js").write_text("// util\n")
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        assert scan["workspace"]["source_files"] == 2
        assert scan["workspace"]["total_lines_of_code"] == 3

    def test_test_files_by_name(self, tmp_path: Path):
        work = tmp_path / "workspace"
        work.mkdir()
        (work / "test_main.py").write_text("def test_it(): pass\n")
        (work / "app.spec.js").write_text("describe('app', () => {})\n")
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        # Name-based detection: test_*.py and *.spec.js count as tests, not sources.
        assert scan["workspace"]["test_files"] == 2
        assert scan["workspace"]["source_files"] == 0

    def test_test_files_by_directory(self, tmp_path: Path):
        tests_dir = tmp_path / "workspace" / "tests"
        tests_dir.mkdir(parents=True)
        (tests_dir / "conftest.py").write_text("")
        (tests_dir / "helpers.py").write_text("x = 1\n")
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        # Anything under a tests/ directory is a test file regardless of name.
        assert scan["workspace"]["test_files"] == 2

    def test_config_files(self, tmp_path: Path):
        work = tmp_path / "workspace"
        work.mkdir()
        (work / "pyproject.toml").write_text("[project]\nname = 'x'\n")
        (work / "Dockerfile").write_text("FROM python:3.13\n")
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        assert scan["workspace"]["config_files"] == 2

    def test_aidlc_docs_categorised(self, tmp_path: Path):
        (tmp_path / "workspace").mkdir()
        inception = tmp_path / "aidlc-docs" / "inception"
        construction = tmp_path / "aidlc-docs" / "construction"
        inception.mkdir(parents=True)
        construction.mkdir(parents=True)

        (inception / "requirements.md").write_text("# Req\n")
        (inception / "user-stories.md").write_text("# Stories\n")
        (construction / "functional-design.md").write_text("# Design\n")
        (tmp_path / "aidlc-docs" / "audit.md").write_text("# Audit\n")

        docs_stats = _scan_artifacts(tmp_path)["aidlc_docs"]
        assert docs_stats["inception_files"] == 2
        assert docs_stats["construction_files"] == 1
        assert docs_stats["other_files"] == 1
        assert docs_stats["total_files"] == 4

    def test_lock_files_excluded_from_loc(self, tmp_path: Path):
        work = tmp_path / "workspace"
        work.mkdir()
        (work / "main.py").write_text("print('hello')\n")
        (work / "package-lock.json").write_text("{\n" * 50000)
        (work / "yarn.lock").write_text("dep:\n" * 10000)
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        # Lock files are still counted in file totals but NOT in LOC.
        assert scan["workspace"]["total_lines_of_code"] == 1

    def test_nested_yaml_counted_as_config(self, tmp_path: Path):
        work = tmp_path / "workspace"
        (work / ".github" / "workflows").mkdir(parents=True)
        (work / ".github" / "workflows" / "ci.yml").write_text("name: CI\n")
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        assert scan["workspace"]["config_files"] == 1

    def test_binary_files_zero_loc(self, tmp_path: Path):
        work = tmp_path / "workspace"
        work.mkdir()
        # Bytes that are invalid UTF-8 — must not contribute to the LOC count.
        (work / "image.png").write_bytes(b"\x89PNG\r\n\x1a\n" + b"\xff" * 50)
        (tmp_path / "aidlc-docs").mkdir()

        scan = _scan_artifacts(tmp_path)
        assert scan["workspace"]["other_files"] == 1
        assert scan["workspace"]["total_lines_of_code"] == 0

    def test_missing_directories_handled(self, tmp_path: Path):
        # Neither workspace/ nor aidlc-docs/ exists — scanning must not raise.
        scan = _scan_artifacts(tmp_path)
        assert scan["workspace"]["total_files"] == 0
        assert scan["aidlc_docs"]["total_files"] == 0


# ---------------------------------------------------------------------------
# MetricsCollector
# ---------------------------------------------------------------------------
def _make_run_dirs(root: Path) -> None:
    """Create the bare workspace/ and aidlc-docs/ layout that build_metrics scans."""
    (root / "workspace").mkdir()
    (root / "aidlc-docs").mkdir()


class TestMetricsCollectorRecording:
    """Raw recording APIs: handoffs, errors, and context samples."""

    def test_record_handoff(self):
        mc = MetricsCollector(RunnerConfig())
        mc.record_handoff(1, "executor", 5000)
        mc.record_handoff(2, "simulator", 2000)

        assert len(mc._handoffs) == 2
        assert mc._handoffs[0] == {"handoff": 1, "node_id": "executor", "duration_ms": 5000}
        assert mc._handoffs[1] == {"handoff": 2, "node_id": "simulator", "duration_ms": 2000}

    def test_record_error(self):
        mc = MetricsCollector(RunnerConfig())
        mc.record_error("throttle", "Rate exceeded")

        assert len(mc._errors) == 1
        first = mc._errors[0]
        assert first["type"] == "throttle"
        assert first["message"] == "Rate exceeded"
        assert "timestamp" in first

    def test_record_context_sample(self):
        mc = MetricsCollector(RunnerConfig())
        mc.record_context_sample("executor", 50000)
        mc.record_context_sample("simulator", 12000)
        mc.record_context_sample("executor", 80000)

        assert len(mc._context_samples) == 3
        assert mc._context_samples[0] == {"agent": "executor", "input_tokens": 50000}
        assert mc._context_samples[1] == {"agent": "simulator", "input_tokens": 12000}
        assert mc._context_samples[2] == {"agent": "executor", "input_tokens": 80000}


class TestMetricsCollectorBuild:
    """build_metrics(): aggregation of tokens, timing, handoffs, and errors."""

    def _make_result(self) -> FakeMultiAgentResult:
        # Two-agent run: accumulated usage includes cache tokens that the
        # per-agent results do not carry.
        return FakeMultiAgentResult(
            accumulated_usage={
                "inputTokens": 100000,
                "outputTokens": 40000,
                "totalTokens": 140000,
                "cacheReadInputTokens": 5000,
                "cacheWriteInputTokens": 2000,
            },
            results={
                "executor": FakeNodeResult(accumulated_usage={
                    "inputTokens": 70000,
                    "outputTokens": 30000,
                    "totalTokens": 100000,
                }),
                "simulator": FakeNodeResult(accumulated_usage={
                    "inputTokens": 30000,
                    "outputTokens": 10000,
                    "totalTokens": 40000,
                }),
            },
            execution_time=60000,
        )

    def test_tokens_section(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        mc = MetricsCollector(RunnerConfig())
        report = mc.build_metrics(self._make_result(), tmp_path)

        # Per-agent tokens mirror each node's accumulated usage.
        executor = report["tokens"]["per_agent"]["executor"]
        assert executor["input_tokens"] == 70000
        assert executor["output_tokens"] == 30000
        assert executor["total_tokens"] == 100000

        simulator = report["tokens"]["per_agent"]["simulator"]
        assert simulator["input_tokens"] == 30000
        assert simulator["output_tokens"] == 10000
        assert simulator["total_tokens"] == 40000

        # Total is the sum of per-agent (unique) tokens.
        total = report["tokens"]["total"]
        assert total["input_tokens"] == 100000  # 70k + 30k
        assert total["output_tokens"] == 40000  # 30k + 10k
        assert total["total_tokens"] == 140000  # 100k + 40k
        assert total["cache_read_tokens"] == 0
        assert total["cache_write_tokens"] == 0

        # No repeated context in this fixture; cache counts live in api_total.
        repeated = report["tokens"]["repeated_context"]
        assert repeated["input_tokens"] == 0
        assert repeated["output_tokens"] == 0
        assert repeated["total_tokens"] == 0
        assert repeated["cache_read_tokens"] == 5000  # Cache only in api_total
        assert repeated["cache_write_tokens"] == 2000

        # api_total carries the cache token counts as well.
        api_total = report["tokens"]["api_total"]
        assert api_total["input_tokens"] == 100000
        assert api_total["output_tokens"] == 40000
        assert api_total["total_tokens"] == 140000
        assert api_total["cache_read_tokens"] == 5000
        assert api_total["cache_write_tokens"] == 2000

    def test_timing_section(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        mc = MetricsCollector(RunnerConfig())
        mc.record_handoff(1, "executor", 5000)
        mc.record_handoff(2, "simulator", 3000)
        report = mc.build_metrics(self._make_result(), tmp_path)

        assert report["timing"]["total_wall_clock_ms"] == 60000
        assert len(report["timing"]["handoffs"]) == 2
        assert report["timing"]["handoffs"][0]["duration_ms"] == 5000

    def test_handoff_patterns(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        mc = MetricsCollector(RunnerConfig())
        mc.record_handoff(1, "executor", 4000)
        mc.record_handoff(2, "simulator", 2000)
        mc.record_handoff(3, "executor", 6000)
        mc.record_handoff(4, "simulator", 3000)

        patterns = mc.build_metrics(self._make_result(), tmp_path)["handoff_patterns"]
        assert patterns["total_handoffs"] == 4
        assert patterns["sequence"] == ["executor", "simulator", "executor", "simulator"]
        assert patterns["per_agent"]["executor"]["turn_count"] == 2
        assert patterns["per_agent"]["executor"]["total_duration_ms"] == 10000
        assert patterns["per_agent"]["executor"]["avg_turn_duration_ms"] == 5000
        assert patterns["per_agent"]["simulator"]["turn_count"] == 2
        assert patterns["per_agent"]["simulator"]["avg_turn_duration_ms"] == 2500

    def test_errors_section(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        mc = MetricsCollector(RunnerConfig())
        mc.record_error("throttle", "Rate exceeded")
        mc.record_error("throttle", "Rate exceeded again")
        mc.record_error("model_error", "Model stream failed")

        errors = mc.build_metrics(self._make_result(), tmp_path)["errors"]
        assert errors["throttle_events"] == 2
        assert errors["model_error_events"] == 1
        assert errors["timeout_events"] == 0
        assert len(errors["details"]) == 3

    def test_model_params_from_config(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        cfg = RunnerConfig()
        mc = MetricsCollector(cfg)
        params = mc.build_metrics(self._make_result(), tmp_path)["model_params"]

        assert params["executor"]["model_id"] == cfg.models.executor.model_id
        assert params["simulator"]["model_id"] == cfg.models.simulator.model_id
        assert params["aws_region"] == cfg.aws.region


class TestContextSizeStats:
    """_compute_context_stats() and the context_size section of the report."""

    def test_compute_context_stats_basic(self):
        stats = MetricsCollector._compute_context_stats([10000, 50000, 30000, 90000])
        assert stats["min_tokens"] == 10000
        assert stats["max_tokens"] == 90000
        assert stats["avg_tokens"] == 45000
        assert stats["median_tokens"] == 40000  # median of [10000, 30000, 50000, 90000]
        assert stats["sample_count"] == 4

    def test_compute_context_stats_single(self):
        stats = MetricsCollector._compute_context_stats([42000])
        assert stats["min_tokens"] == 42000
        assert stats["max_tokens"] == 42000
        assert stats["avg_tokens"] == 42000
        assert stats["median_tokens"] == 42000
        assert stats["sample_count"] == 1

    def test_compute_context_stats_empty(self):
        stats = MetricsCollector._compute_context_stats([])
        # Empty input yields zeros rather than raising.
        for key in ("min_tokens", "max_tokens", "avg_tokens", "median_tokens", "sample_count"):
            assert stats[key] == 0

    def test_context_size_in_build_metrics(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        mc = MetricsCollector(RunnerConfig())
        mc.record_context_sample("executor", 25000)
        mc.record_context_sample("executor", 75000)
        mc.record_context_sample("simulator", 12000)

        fake = FakeMultiAgentResult(
            accumulated_usage={"inputTokens": 100, "outputTokens": 50, "totalTokens": 150},
            results={"executor": FakeNodeResult(), "simulator": FakeNodeResult()},
            execution_time=1000,
        )
        ctx = mc.build_metrics(fake, tmp_path)["context_size"]

        assert ctx["total"]["sample_count"] == 3
        assert ctx["total"]["min_tokens"] == 12000
        assert ctx["total"]["max_tokens"] == 75000

        assert ctx["per_agent"]["executor"]["sample_count"] == 2
        assert ctx["per_agent"]["executor"]["min_tokens"] == 25000
        assert ctx["per_agent"]["executor"]["max_tokens"] == 75000

        assert ctx["per_agent"]["simulator"]["sample_count"] == 1
        assert ctx["per_agent"]["simulator"]["min_tokens"] == 12000

        assert len(ctx["samples"]) == 3

    def test_context_size_empty_when_no_samples(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        mc = MetricsCollector(RunnerConfig())
        fake = FakeMultiAgentResult(
            accumulated_usage={"inputTokens": 10, "outputTokens": 5, "totalTokens": 15},
            results={"executor": FakeNodeResult()},
            execution_time=1000,
        )
        ctx = mc.build_metrics(fake, tmp_path)["context_size"]

        assert ctx["total"]["sample_count"] == 0
        assert ctx["total"]["min_tokens"] == 0
        assert ctx["per_agent"] == {}
        assert ctx["samples"] == []


class TestMetricsCollectorWrite:
    """write(): serialises the report to run-metrics.yaml."""

    def test_write_produces_valid_yaml(self, tmp_path: Path):
        _make_run_dirs(tmp_path)

        fake = FakeMultiAgentResult(
            accumulated_usage={"inputTokens": 10, "outputTokens": 5, "totalTokens": 15},
            results={"executor": FakeNodeResult(accumulated_usage={
                "inputTokens": 10, "outputTokens": 5, "totalTokens": 15,
            })},
            execution_time=1000,
        )

        mc = MetricsCollector(RunnerConfig())
        out_path = mc.write(fake, tmp_path)

        assert out_path == tmp_path / "run-metrics.yaml"
        assert out_path.exists()

        with open(out_path) as f:
            data = yaml.safe_load(f)

        # All top-level sections must round-trip through YAML.
        for section in ("tokens", "timing", "handoff_patterns", "artifacts",
                        "errors", "model_params", "context_size"):
            assert section in data
# ---------------------------------------------------------------------------
# Progress hooks integration with MetricsCollector
# ---------------------------------------------------------------------------


class TestAgentProgressHandlerContextSamples:
    """Metadata events feed context samples into the collector."""

    def test_metadata_event_records_context_sample(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"metadata": {"usage": {"inputTokens": 54321, "outputTokens": 1234, "totalTokens": 55555}}})

        assert len(mc._context_samples) == 1
        assert mc._context_samples[0] == {"agent": "executor", "input_tokens": 54321}

    def test_metadata_event_zero_tokens_skipped(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"metadata": {"usage": {"inputTokens": 0, "outputTokens": 0, "totalTokens": 0}}})

        # Zero-token usage reports are noise and must not be sampled.
        assert len(mc._context_samples) == 0

    def test_metadata_event_no_collector_does_not_crash(self):
        handler = AgentProgressHandler("executor", collector=None)
        handler(event={"metadata": {"usage": {"inputTokens": 10000}}})

    def test_metadata_event_missing_usage_key(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"metadata": {"metrics": {"latencyMs": 100}}})

        assert len(mc._context_samples) == 0

    def test_multiple_metadata_events_accumulated(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        for tokens in (10000, 30000, 80000):
            handler(event={"metadata": {"usage": {"inputTokens": tokens}}})

        assert len(mc._context_samples) == 3
        assert [s["input_tokens"] for s in mc._context_samples] == [10000, 30000, 80000]


class TestAgentProgressHandlerErrors:
    """Bedrock error events are classified and recorded on the collector."""

    def test_throttle_event_recorded(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"throttlingException": {"message": "Rate exceeded"}})

        assert len(mc._errors) == 1
        recorded = mc._errors[0]
        assert recorded["type"] == "throttle"
        # The recorded message carries both the agent name and the cause.
        assert "executor" in recorded["message"]
        assert "Rate exceeded" in recorded["message"]

    def test_model_stream_error_recorded(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("simulator", collector=mc)

        handler(event={"modelStreamErrorException": {"message": "Stream broken"}})

        assert len(mc._errors) == 1
        assert mc._errors[0]["type"] == "model_error"

    def test_service_unavailable_recorded(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"serviceUnavailableException": {"message": "Service down"}})

        assert len(mc._errors) == 1
        assert mc._errors[0]["type"] == "service_unavailable"

    def test_no_collector_does_not_crash(self):
        handler = AgentProgressHandler("executor", collector=None)
        # Must not raise even when an error event arrives with no sink.
        handler(event={"throttlingException": {"message": "Rate exceeded"}})

    def test_non_error_event_not_recorded(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"contentBlockStart": {"start": {"toolUse": {"name": "write_file"}}}})

        assert len(mc._errors) == 0

    def test_tool_count_still_works_with_collector(self):
        mc = MetricsCollector(RunnerConfig())
        handler = AgentProgressHandler("executor", collector=mc)

        handler(event={"contentBlockStart": {"start": {"toolUse": {"name": "write_file"}}}})
        handler(event={"contentBlockStart": {"start": {"toolUse": {"name": "read_file"}}}})

        assert handler.tool_count == 2


class TestSwarmProgressHookRecording:
    """SwarmProgressHook turns before/after node events into handoff records."""

    def test_handoff_recorded_to_collector(self):
        mc = MetricsCollector(RunnerConfig())
        hook = SwarmProgressHook(collector=mc)

        # Drive the hook directly with fake before/after node events.
        hook._on_before_node(_FakeBeforeNodeCallEvent("executor"))
        hook._on_after_node(_FakeAfterNodeCallEvent("executor"))

        assert len(mc._handoffs) == 1
        assert mc._handoffs[0]["handoff"] == 1
        assert mc._handoffs[0]["node_id"] == "executor"
        assert mc._handoffs[0]["duration_ms"] >= 0

    def test_multiple_handoffs_numbered(self):
        mc = MetricsCollector(RunnerConfig())
        hook = SwarmProgressHook(collector=mc)

        for node_id in ["executor", "simulator", "executor"]:
            hook._on_before_node(_FakeBeforeNodeCallEvent(node_id))
            hook._on_after_node(_FakeAfterNodeCallEvent(node_id))

        assert len(mc._handoffs) == 3
        assert [h["handoff"] for h in mc._handoffs] == [1, 2, 3]

    def test_no_collector_does_not_crash(self):
        hook = SwarmProgressHook(collector=None)
        hook._on_before_node(_FakeBeforeNodeCallEvent("executor"))
        hook._on_after_node(_FakeAfterNodeCallEvent("executor"))
        # No exception raised


# ---------------------------------------------------------------------------
# Fake hook events (minimal stubs for testing)
# ---------------------------------------------------------------------------


class _FakeNodeEvent:
    """Base stub carrying only the node_id attribute the hook reads."""

    def __init__(self, node_id: str):
        self.node_id = node_id


class _FakeBeforeNodeCallEvent(_FakeNodeEvent):
    pass


class _FakeAfterNodeCallEvent(_FakeNodeEvent):
    pass
detection, output parsing, result files.""" + +from __future__ import annotations + +from pathlib import Path + +import yaml + +from aidlc_runner.config import RunnerConfig +from aidlc_runner.post_run import ( + _parse_cargo, + _parse_go, + _parse_jest, + _parse_pytest, + _truncate, + detect_project, + parse_test_output, + run_post_evaluation, +) + + +# --------------------------------------------------------------------------- +# Project detection +# --------------------------------------------------------------------------- + + +class TestDetectProject: + def test_pyproject_toml(self, tmp_path: Path): + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "python" + assert "uv" in info.install_cmd + assert "pytest" in info.test_cmd + assert info.project_root == tmp_path + + def test_package_json(self, tmp_path: Path): + (tmp_path / "package.json").write_text('{"name": "x"}') + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "node" + assert "npm install" in info.install_cmd + + def test_cargo_toml(self, tmp_path: Path): + (tmp_path / "Cargo.toml").write_text("[package]\nname='x'\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "rust" + + def test_go_mod(self, tmp_path: Path): + (tmp_path / "go.mod").write_text("module example.com/x\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "go" + + def test_setup_py(self, tmp_path: Path): + (tmp_path / "setup.py").write_text("from setuptools import setup\nsetup()") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "python-legacy" + + def test_no_markers(self, tmp_path: Path): + (tmp_path / "README.md").write_text("# Hello") + info = detect_project(tmp_path) + assert info is None + + def test_priority_pyproject_over_package_json(self, tmp_path: Path): + 
(tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") + (tmp_path / "package.json").write_text('{"name": "x"}') + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "python" + + def test_empty_directory(self, tmp_path: Path): + info = detect_project(tmp_path) + assert info is None + + def test_subdirectory_detection(self, tmp_path: Path): + """Detect project in workspace/my-app/ when workspace/ has no markers.""" + subdir = tmp_path / "my-app" + subdir.mkdir() + (subdir / "pyproject.toml").write_text("[project]\nname='x'\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "python" + assert info.project_root == subdir + + def test_subdirectory_not_checked_when_root_has_marker(self, tmp_path: Path): + """Root marker takes priority over subdirectory marker.""" + (tmp_path / "package.json").write_text('{"name": "root"}') + subdir = tmp_path / "sub" + subdir.mkdir() + (subdir / "pyproject.toml").write_text("[project]\nname='sub'\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "node" + assert info.project_root == tmp_path + + def test_hidden_subdirectories_skipped(self, tmp_path: Path): + """Dot-prefixed directories like .cache should not be searched.""" + hidden = tmp_path / ".cache" + hidden.mkdir() + (hidden / "pyproject.toml").write_text("[project]\nname='x'\n") + info = detect_project(tmp_path) + assert info is None + + def test_vendor_directories_skipped(self, tmp_path: Path): + """Vendor dirs like .venv and node_modules should not be searched.""" + for vendor in (".venv", "node_modules", "__pycache__"): + d = tmp_path / vendor + d.mkdir(exist_ok=True) + (d / "pyproject.toml").write_text("[project]\nname='x'\n") + info = detect_project(tmp_path) + assert info is None + + def test_deeply_nested_project(self, tmp_path: Path): + """Detect project inside workspace/sci-calc/app/ (2 levels deep).""" + nested = tmp_path / "sci-calc" / 
"app" + nested.mkdir(parents=True) + (nested / "pyproject.toml").write_text("[project]\nname='x'\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_type == "python" + assert info.project_root == nested + + def test_max_depth_exceeded(self, tmp_path: Path): + """Projects beyond _MAX_SEARCH_DEPTH levels are not detected.""" + deep = tmp_path / "a" / "b" / "c" / "d" + deep.mkdir(parents=True) + (deep / "pyproject.toml").write_text("[project]\nname='x'\n") + info = detect_project(tmp_path) + assert info is None + + def test_nonexistent_workspace(self, tmp_path: Path): + info = detect_project(tmp_path / "does-not-exist") + assert info is None + + def test_shallowest_project_preferred(self, tmp_path: Path): + """BFS should find the shallowest project first.""" + shallow = tmp_path / "app" + shallow.mkdir() + (shallow / "package.json").write_text('{"name":"shallow"}') + deep = tmp_path / "deep" / "nested" + deep.mkdir(parents=True) + (deep / "pyproject.toml").write_text("[project]\nname='deep'\n") + info = detect_project(tmp_path) + assert info is not None + assert info.project_root == shallow + + +# --------------------------------------------------------------------------- +# Test output parsers +# --------------------------------------------------------------------------- + + +class TestParsePytest: + def test_all_passed(self): + output = "========================= 5 passed in 1.23s =========================" + result = _parse_pytest(output) + assert result["passed"] == 5 + assert result["failed"] is None + + def test_mixed_results(self): + output = "============ 3 passed, 2 failed, 1 error in 4.56s ============" + result = _parse_pytest(output) + assert result["passed"] == 3 + assert result["failed"] == 2 + assert result["errors"] == 1 + + def test_with_skipped(self): + output = "========= 10 passed, 1 skipped, 1 warning in 2.00s =========" + result = _parse_pytest(output) + assert result["passed"] == 10 + assert result["skipped"] == 
1 + + def test_no_summary(self): + output = "some random output\nno test summary here" + result = _parse_pytest(output) + assert result["passed"] is None + + def test_short_form(self): + output = "5 passed" + result = _parse_pytest(output) + assert result["passed"] == 5 + + +class TestParseJest: + def test_jest_summary(self): + output = "Tests: 2 failed, 5 passed, 7 total" + result = _parse_jest(output) + assert result["passed"] == 5 + assert result["failed"] == 2 + + def test_jest_all_passed(self): + output = "Tests: 10 passed, 10 total" + result = _parse_jest(output) + assert result["passed"] == 10 + assert result["failed"] is None + + def test_vitest_format(self): + output = "Tests 5 passed | 2 failed (7)" + result = _parse_jest(output) + assert result["passed"] == 5 + assert result["failed"] == 2 + + def test_no_summary(self): + output = "running tests..." + result = _parse_jest(output) + assert result["passed"] is None + + +class TestParseCargo: + def test_ok_result(self): + output = "test result: ok. 10 passed; 0 failed; 2 ignored; 0 measured" + result = _parse_cargo(output) + assert result["passed"] == 10 + assert result["failed"] == 0 + assert result["skipped"] == 2 + + def test_failed_result(self): + output = "test result: FAILED. 8 passed; 2 failed; 0 ignored; 0 measured" + result = _parse_cargo(output) + assert result["passed"] == 8 + assert result["failed"] == 2 + + def test_no_summary(self): + output = "compiling..." 
+ result = _parse_cargo(output) + assert result["passed"] is None + + +class TestParseGo: + def test_mixed_results(self): + output = ( + "--- PASS: TestAdd (0.00s)\n" + "--- PASS: TestSub (0.00s)\n" + "--- FAIL: TestDiv (0.01s)\n" + ) + result = _parse_go(output) + assert result["passed"] == 2 + assert result["failed"] == 1 + + def test_all_pass(self): + output = "--- PASS: TestOne (0.00s)\n--- PASS: TestTwo (0.00s)\n" + result = _parse_go(output) + assert result["passed"] == 2 + assert result["failed"] == 0 + + def test_no_results(self): + output = "building..." + result = _parse_go(output) + assert result["passed"] is None + + +class TestParseTestOutput: + def test_total_computed(self): + result = parse_test_output("python", "===== 3 passed, 1 failed in 1.0s =====") + assert result["total"] == 4 + + def test_unknown_project_type(self): + result = parse_test_output("unknown", "some output") + assert result["passed"] is None + assert result["total"] is None + + +# --------------------------------------------------------------------------- +# Output truncation +# --------------------------------------------------------------------------- + + +class TestTruncate: + def test_short_text_unchanged(self): + assert _truncate("hello", 100) == "hello" + + def test_long_text_truncated(self): + text = "x" * 20000 + result = _truncate(text, 10000) + assert len(result) < 11000 + assert "truncated" in result + + def test_exact_limit(self): + text = "x" * 10000 + assert _truncate(text, 10000) == text + + +# --------------------------------------------------------------------------- +# Full run_post_evaluation integration +# --------------------------------------------------------------------------- + + +class TestRunPostEvaluation: + def test_no_workspace(self, tmp_path: Path): + # No workspace/ directory at all + config = RunnerConfig() + result_path = run_post_evaluation(tmp_path, config) + assert result_path is not None + with open(result_path) as f: + data = yaml.safe_load(f) 
+ assert data["status"] == "skipped" + + def test_empty_workspace(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + config = RunnerConfig() + result_path = run_post_evaluation(tmp_path, config) + assert result_path is not None + with open(result_path) as f: + data = yaml.safe_load(f) + assert data["status"] == "skipped" + assert "no recognised" in data["reason"] + + def test_python_project_detected(self, tmp_path: Path): + ws = tmp_path / "workspace" + ws.mkdir() + # Create a minimal Python project that will fail install but still produce output + (ws / "pyproject.toml").write_text( + '[project]\nname = "test-proj"\nversion = "0.1.0"\n' + ) + + config = RunnerConfig() + result_path = run_post_evaluation(tmp_path, config) + assert result_path == tmp_path / "test-results.yaml" + assert result_path.exists() + + with open(result_path) as f: + data = yaml.safe_load(f) + assert data["project_type"] == "python" + assert "install" in data + assert "test" in data + assert "command" in data["install"] + assert "command" in data["test"] + + def test_result_yaml_schema(self, tmp_path: Path): + ws = tmp_path / "workspace" + ws.mkdir() + (ws / "pyproject.toml").write_text( + '[project]\nname = "test-proj"\nversion = "0.1.0"\n' + ) + + config = RunnerConfig() + result_path = run_post_evaluation(tmp_path, config) + + with open(result_path) as f: + data = yaml.safe_load(f) + + # Verify required top-level keys + assert "status" in data + assert "project_type" in data + assert "project_root" in data + assert "install" in data + assert "test" in data + + # Verify install structure + assert "command" in data["install"] + assert "exit_code" in data["install"] or data["install"].get("timed_out") + assert "output" in data["install"] + + # Verify test structure + assert "command" in data["test"] + assert "parsed_results" in data["test"] diff --git a/scripts/aidlc-evaluator/packages/execution/tests/test_run_command.py 
b/scripts/aidlc-evaluator/packages/execution/tests/test_run_command.py new file mode 100644 index 00000000..6b721108 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/tests/test_run_command.py @@ -0,0 +1,165 @@ +"""Tests for the run_command tool — sandboxing, timeout, and output.""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +from aidlc_runner.config import ExecutionConfig, RunnerConfig, load_config +from aidlc_runner.tools.run_command import make_run_command + + +def _call(run_cmd, command: str, working_directory: str = "workspace") -> str: + """Call the run_command tool and return the string result.""" + return run_cmd(command=command, working_directory=working_directory) + + +class TestRunCommandSandbox: + def test_runs_in_workspace(self, tmp_path: Path): + ws = tmp_path / "workspace" + ws.mkdir() + (ws / "hello.txt").write_text("hello world") + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "cat hello.txt", "workspace") + assert "hello world" in result + assert "[exit code: 0]" in result + + def test_default_working_directory_is_workspace(self, tmp_path: Path): + ws = tmp_path / "workspace" + ws.mkdir() + (ws / "marker.txt").write_text("found") + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "cat marker.txt") + assert "found" in result + + def test_path_traversal_denied(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "ls", "../../") + assert "Path traversal denied" in result + + def test_nonexistent_working_directory(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "ls", "nonexistent") + assert "not found" in result + + def test_can_write_files_in_workspace(self, tmp_path: Path): + ws = tmp_path / "workspace" + ws.mkdir() + + run_cmd = make_run_command(tmp_path) + _call(run_cmd, "python3 -c \"from pathlib import Path; 
Path('output.txt').write_text('test content')\"") + assert (ws / "output.txt").exists() + assert "test content" in (ws / "output.txt").read_text() + + +class TestRunCommandTimeout: + def test_timeout_returns_error(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path, timeout=1) + result = _call(run_cmd, "sleep 30") + assert "timed out after 1s" in result + + +class TestRunCommandOutput: + def test_exit_code_included(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "python3 -c \"import sys; sys.exit(42)\"") + assert "[exit code: 42]" in result + + def test_stderr_captured(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "python3 -c \"import sys; sys.stderr.write('err msg\\n')\"") + assert "err msg" in result + + def test_long_output_truncated(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "python3 -c \"print('x' * 60000)\"") + assert "truncated" in result + # Should be capped at around 50k + header + assert len(result) < 55000 + + def test_command_not_found(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "nonexistent_command_xyz") + # shell=False raises OSError ([error:]), shell=True returns [exit code: 127] + is_error = "[error:" in result + is_nonzero_exit = "[exit code:" in result and "[exit code: 0]" not in result + assert is_error or is_nonzero_exit + + +class TestRunCommandEdgeCases: + def test_empty_command(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = _call(run_cmd, "") + assert "empty command" in result + + def test_whitespace_only_command(self, tmp_path: Path): + (tmp_path / "workspace").mkdir() + + run_cmd = make_run_command(tmp_path) + result = 
_call(run_cmd, " ") + assert "empty command" in result + + +class TestExecutionConfig: + def test_defaults(self): + config = ExecutionConfig() + assert config.enabled is True + assert config.command_timeout == 120 + assert config.post_run_tests is True + assert config.post_run_timeout == 300 + + def test_runner_config_includes_execution(self): + config = RunnerConfig() + assert config.execution.enabled is True + assert config.execution.command_timeout == 120 + + def test_yaml_loading(self, tmp_path: Path): + yaml_content = textwrap.dedent("""\ + execution: + enabled: false + command_timeout: 60 + post_run_tests: false + post_run_timeout: 120 + """) + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml_content) + + config = load_config(config_path=config_file) + assert config.execution.enabled is False + assert config.execution.command_timeout == 60 + assert config.execution.post_run_tests is False + assert config.execution.post_run_timeout == 120 + + def test_no_exec_cli_flag(self): + from aidlc_runner.cli import build_parser, _build_cli_overrides + + parser = build_parser() + args = parser.parse_args(["--vision", "v.md", "--no-exec"]) + overrides = _build_cli_overrides(args) + assert overrides["execution"]["enabled"] is False + + def test_no_post_tests_cli_flag(self): + from aidlc_runner.cli import build_parser, _build_cli_overrides + + parser = build_parser() + args = parser.parse_args(["--vision", "v.md", "--no-post-tests"]) + overrides = _build_cli_overrides(args) + assert overrides["execution"]["post_run_tests"] is False diff --git a/scripts/aidlc-evaluator/packages/execution/tests/test_two_inputs.py b/scripts/aidlc-evaluator/packages/execution/tests/test_two_inputs.py new file mode 100644 index 00000000..36b99deb --- /dev/null +++ b/scripts/aidlc-evaluator/packages/execution/tests/test_two_inputs.py @@ -0,0 +1,178 @@ +"""Tests for two-input-document support (vision + tech-env).""" + +from __future__ import annotations + +from pathlib import 
Path + +import yaml + +from aidlc_runner.agents.simulator import SIMULATOR_SYSTEM_PROMPT_TEMPLATE, create_simulator +from aidlc_runner.cli import build_parser +from aidlc_runner.runner import write_run_meta +from aidlc_runner.config import RunnerConfig + + +# --------------------------------------------------------------------------- +# CLI argument parsing +# --------------------------------------------------------------------------- + + +class TestCliTechEnvArgument: + def test_tech_env_accepted(self, tmp_path: Path): + vision = tmp_path / "vision.md" + vision.write_text("# Vision") + tech_env = tmp_path / "tech-env.md" + tech_env.write_text("# Tech Env") + + parser = build_parser() + args = parser.parse_args(["--vision", str(vision), "--tech-env", str(tech_env)]) + + assert args.tech_env == tech_env + assert args.vision == vision + + def test_tech_env_defaults_to_none(self, tmp_path: Path): + vision = tmp_path / "vision.md" + vision.write_text("# Vision") + + parser = build_parser() + args = parser.parse_args(["--vision", str(vision)]) + + assert args.tech_env is None + + def test_tech_env_validation_in_main(self, tmp_path: Path): + """main() should exit with error when --tech-env file doesn't exist.""" + import sys + + vision = tmp_path / "vision.md" + vision.write_text("# Vision") + + from aidlc_runner.cli import main + + try: + main(["--vision", str(vision), "--tech-env", str(tmp_path / "missing.md")]) + assert False, "Should have called sys.exit" + except SystemExit as e: + assert e.code == 1 + + +# --------------------------------------------------------------------------- +# Runner: write_run_meta records tech-env path +# --------------------------------------------------------------------------- + + +class TestRunMetaTechEnv: + def test_meta_includes_tech_env_path(self, tmp_path: Path): + vision = tmp_path / "vision.md" + vision.write_text("# Vision") + tech_env = tmp_path / "tech-env.md" + tech_env.write_text("# Tech Env") + + run_folder = tmp_path / 
"run" + run_folder.mkdir() + + write_run_meta(run_folder, RunnerConfig(), vision, tech_env_path=tech_env) + + meta_path = run_folder / "run-meta.yaml" + with open(meta_path) as f: + meta = yaml.safe_load(f) + + assert meta["tech_env_file"] == str(tech_env.resolve()) + + def test_meta_tech_env_null_when_omitted(self, tmp_path: Path): + vision = tmp_path / "vision.md" + vision.write_text("# Vision") + + run_folder = tmp_path / "run" + run_folder.mkdir() + + write_run_meta(run_folder, RunnerConfig(), vision) + + meta_path = run_folder / "run-meta.yaml" + with open(meta_path) as f: + meta = yaml.safe_load(f) + + assert meta["tech_env_file"] is None + + +# --------------------------------------------------------------------------- +# Simulator: tech-env injection into system prompt +# --------------------------------------------------------------------------- + + +class TestSimulatorTechEnvPrompt: + def test_template_has_tech_env_placeholder(self): + assert "{tech_env_section}" in SIMULATOR_SYSTEM_PROMPT_TEMPLATE + + def test_prompt_includes_tech_env_when_provided(self): + result = SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + vision_content="Build a calculator", + tech_env_section="\n## The technical environment\n\nUse Python 3.12\n", + ) + assert "The technical environment" in result + assert "Use Python 3.12" in result + assert "Build a calculator" in result + + def test_prompt_excludes_tech_env_when_empty(self): + result = SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + vision_content="Build a calculator", + tech_env_section="", + ) + assert "technical environment" not in result.lower().split("how you work")[0] + assert "Build a calculator" in result + + def test_prompt_backward_compatible_with_no_tech_env(self): + """When tech_env_section is empty, prompt should be identical to old behavior.""" + result = SIMULATOR_SYSTEM_PROMPT_TEMPLATE.format( + vision_content="My vision doc", + tech_env_section="", + ) + # The vision content is still present + assert "My vision doc" in 
result + # "How you work" section immediately follows the vision closing --- + # No extra blank sections between vision and how-you-work + assert "---\n\n## How you work" in result + + +# --------------------------------------------------------------------------- +# Initial prompt: tech-env mention is conditional +# --------------------------------------------------------------------------- + + +class TestInitialPromptTechEnv: + @staticmethod + def _build_initial_prompt(tech_env_content: str | None) -> str: + """Reproduce the initial prompt logic from runner.py.""" + initial_prompt = ( + "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. " + "The project vision is available at vision.md in the run folder. " + ) + if tech_env_content is not None: + initial_prompt += ( + "The technical environment document is available at tech-env.md in the run folder. " + "It defines the required languages, frameworks, cloud services, security controls, " + "testing standards, and prohibited technologies. Follow it as a binding reference " + "during all Construction stages. " + ) + initial_prompt += ( + "Start by loading the core workflow rules and the process overview, then " + "execute every stage of the Inception phase followed by every stage of the " + "Construction phase. The workspace directory is 'workspace/' (currently empty — " + "this is a greenfield project). You MUST generate all application code in " + "workspace/ before the workflow is complete. Do NOT stop after requirements — " + "continue through application design, code generation, and build-and-test." 
+ ) + return initial_prompt + + def test_prompt_mentions_tech_env_when_present(self): + prompt = self._build_initial_prompt("# Tech Env Content") + assert "tech-env.md" in prompt + assert "binding reference" in prompt + + def test_prompt_omits_tech_env_when_absent(self): + prompt = self._build_initial_prompt(None) + assert "tech-env.md" not in prompt + assert "technical environment" not in prompt.lower() + + def test_prompt_always_mentions_vision(self): + assert "vision.md" in self._build_initial_prompt(None) + assert "vision.md" in self._build_initial_prompt("some content") diff --git a/scripts/aidlc-evaluator/packages/ide-harness/pyproject.toml b/scripts/aidlc-evaluator/packages/ide-harness/pyproject.toml new file mode 100644 index 00000000..f1695efc --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/pyproject.toml @@ -0,0 +1,20 @@ +[project] +name = "aidlc-ide-harness" +version = "0.1.0" +description = "Automated IDE instrumentation for AIDLC evaluation" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8.0.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/ide_harness"] diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/__init__.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/__init__.py new file mode 100644 index 00000000..0b410b9d --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/__init__.py @@ -0,0 +1,17 @@ +"""Automated IDE instrumentation for AIDLC evaluation. + +Provides a common adapter interface for driving AIDLC workflows through +IDE-based AI coding assistants (Kiro, Cursor, Cline, CoPilot, Windsurf, +Antigravity) and capturing evaluation-compatible output. 
+""" + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.registry import get_adapter, list_adapters + +__all__ = [ + "AdapterConfig", + "AdapterResult", + "IDEAdapter", + "get_adapter", + "list_adapters", +] diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapter.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapter.py new file mode 100644 index 00000000..d8b6394d --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapter.py @@ -0,0 +1,67 @@ +"""Abstract adapter interface for IDE automation.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class AdapterConfig: + """Configuration for an IDE adapter run.""" + + vision_path: Path + output_dir: Path + rules_path: Path + tech_env_path: Path | None = None + prompt_template: str | None = None + timeout_seconds: int = 7200 # 2 hours max + + +@dataclass +class AdapterResult: + """Result from an IDE adapter run.""" + + success: bool + output_dir: Path + aidlc_docs_dir: Path | None = None + workspace_dir: Path | None = None + error: str | None = None + elapsed_seconds: float = 0.0 + token_estimate: int | None = None + extra: dict = field(default_factory=dict) + + +class IDEAdapter(ABC): + """Abstract base for IDE-specific automation adapters.""" + + @property + @abstractmethod + def name(self) -> str: + """Human-readable IDE name (e.g., 'Cursor', 'Cline').""" + ... + + @abstractmethod + def check_prerequisites(self) -> tuple[bool, str]: + """Verify IDE is installed, configured, and accessible. + + Returns: + (ok, message) — True with a success message, or False with + a description of what's missing. + """ + ... + + @abstractmethod + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the AIDLC process through the IDE and capture outputs. + + The implementation should: + 1. 
Set up a clean workspace with vision.md, tech-env.md, and rules + 2. Launch the IDE or connect to a running instance + 3. Send the AIDLC prompt to the IDE's AI chat + 4. Monitor for completion (all AIDLC phases done) + 5. Extract aidlc-docs/ and workspace/ from the IDE output + 6. Generate run-meta.yaml with timing and adapter info + """ + ... diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/__init__.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/__init__.py new file mode 100644 index 00000000..8ff7cdc1 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/__init__.py @@ -0,0 +1 @@ +"""IDE-specific adapter implementations.""" diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/antigravity.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/antigravity.py new file mode 100644 index 00000000..6885f85a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/antigravity.py @@ -0,0 +1,29 @@ +"""Antigravity adapter — AI coding assistant.""" + +from __future__ import annotations + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter + + +class AntigravityAdapter(IDEAdapter): + """Adapter for Antigravity AI coding assistant. + + TODO: Research Antigravity's automation capabilities: + - CLI or API availability + - Extension or standalone application + - Scripted interaction support + """ + + @property + def name(self) -> str: + return "Antigravity" + + def check_prerequisites(self) -> tuple[bool, str]: + return False, "Antigravity adapter requires manual configuration. See docs/ide-automation-research.md." 
+ + def run(self, config: AdapterConfig) -> AdapterResult: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error="Antigravity adapter not yet implemented", + ) diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/cline.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/cline.py new file mode 100644 index 00000000..79269a3d --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/cline.py @@ -0,0 +1,616 @@ +"""Cline adapter — VS Code extension for AI-assisted coding. + +Cline (extension ID: ``saoudrizwan.claude-dev``) runs as a VS Code extension +and exposes a ``ClineAPI`` interface for programmatic control: + + - ``startNewTask(task?, images?)``: Promise<void> + - ``sendMessage(message?, images?)``: Promise<void> + - ``pressPrimaryButton()``: Promise<void> + - ``pressSecondaryButton()``: Promise<void> + +Full headless automation requires a custom VS Code extension that imports +ClineAPI via the VS Code extension API and drives the workflow. Since that +bridge extension is not yet available, this adapter implements a +**semi-automated** approach: + +1. Prepare a workspace with vision.md, tech-env.md, and ``.clinerules/`` + containing the AIDLC rules. +2. Write an ``INSTRUCTIONS.md`` file with the rendered AIDLC prompt. +3. Create a ``.vscode/tasks.json`` stub for future task-based triggering. +4. Launch VS Code (``code --wait``) pointed at the workspace. +5. Poll the workspace for ``aidlc-docs/`` output (file-watcher loop). +6. Normalize output via the shared normalizer once VS Code exits or output + is detected. + +AIDLC rules are injected through the ``.clinerules/`` directory, which Cline +reads automatically when present in the workspace root. + +Prerequisites: + - ``code`` CLI on PATH (VS Code). + - Cline extension installed in VS Code (``saoudrizwan.claude-dev``). 
+""" + +from __future__ import annotations + +import json +import logging +import shutil +import subprocess +import tempfile +import time +from pathlib import Path + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.normalizer import normalize_output +from ide_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +_VSCODE_CLI = "code" +_CLINE_EXTENSION_ID = "saoudrizwan.claude-dev" +_RULES_SUBDIR = ".clinerules" +_AIDLC_DOCS_DIR = "aidlc-docs" +_POLL_INTERVAL_SECONDS = 10 +# Minimum number of files in aidlc-docs/ to consider the run "complete". +# The AIDLC process produces many documents; we use a conservative threshold. +_MIN_AIDLC_FILES_FOR_COMPLETION = 5 +# How long to wait (seconds) after detecting output before finalizing, to +# allow any trailing writes to flush. +_QUIESCE_SECONDS = 30 + + +class ClineAdapter(IDEAdapter): + """Adapter for Cline (VS Code extension). + + Cline runs as a VS Code extension. Automation approaches: + + - **Semi-automated (current):** Workspace preparation + VS Code launch + + file-watcher loop that detects ``aidlc-docs/`` output and normalizes it. + - **Full automation (future):** A custom VS Code test extension that + exercises ClineAPI via ``@vscode/test-electron``. + + The semi-automated mode is suitable for regression runs where a human + operator monitors the VS Code window while the harness handles workspace + setup, output detection, and normalization. 
+ """ + + # ------------------------------------------------------------------ # + # IDEAdapter interface + # ------------------------------------------------------------------ # + + @property + def name(self) -> str: + return "Cline" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify that VS Code CLI (``code``) is available on PATH. + + This does *not* verify that the Cline extension is installed because + there is no reliable CLI-only way to query extension presence without + launching VS Code. A warning is logged instead. + """ + if not shutil.which(_VSCODE_CLI): + return ( + False, + f"VS Code CLI ('{_VSCODE_CLI}') not found in PATH. " + "Install VS Code and ensure the 'code' command is available " + "(Shell Command: Install 'code' command in PATH).", + ) + + # Best-effort check: try `code --list-extensions` for Cline. + # This can fail in CI or if VS Code has never been launched, so + # we treat absence as a warning rather than a hard failure. + try: + # nosec B603 - Static VSCode extension list command for prerequisite check + proc = subprocess.run( + [_VSCODE_CLI, "--list-extensions"], + capture_output=True, + text=True, + timeout=30, + ) + extensions = proc.stdout.strip().splitlines() + cline_installed = any( + ext.strip().lower() == _CLINE_EXTENSION_ID + for ext in extensions + ) + if not cline_installed: + logger.warning( + "Cline extension (%s) not found in VS Code extensions. " + "The adapter will still launch VS Code, but Cline must " + "be installed for the AIDLC workflow to run.", + _CLINE_EXTENSION_ID, + ) + return ( + True, + f"VS Code CLI found, but Cline extension ({_CLINE_EXTENSION_ID}) " + "was not detected. Please ensure it is installed.", + ) + except (subprocess.TimeoutExpired, OSError) as exc: + logger.debug( + "Could not query VS Code extensions: %s. 
" + "Proceeding with prerequisite check passed.", + exc, + ) + + return True, f"VS Code CLI ('{_VSCODE_CLI}') found with Cline extension" + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the AIDLC process through Cline in VS Code. + + Semi-automated workflow: + 1. Verify prerequisites. + 2. Create a temporary workspace directory. + 3. Copy vision.md and tech-env.md into the workspace. + 4. Inject AIDLC rules into ``.clinerules/`` directory. + 5. Copy AIDLC rules into ``aidlc-rules/`` for prompt references. + 6. Write ``INSTRUCTIONS.md`` with the rendered AIDLC prompt. + 7. Create ``.vscode/tasks.json`` stub for future automation. + 8. Launch VS Code via ``code --wait <workspace>``. + 9. Poll the workspace for ``aidlc-docs/`` output. + 10. Normalize the workspace output. + 11. Return an :class:`AdapterResult`. + + .. note:: + + Because VS Code is launched with ``--wait``, the subprocess blocks + until the user closes the VS Code window. The polling loop runs + in a background thread if full automation is ever added, but in + semi-automated mode the poll happens *after* VS Code exits. + + TODO: Full ClineAPI automation path + - Build a VS Code test extension using ``@vscode/test-electron`` + that acquires the ClineAPI handle from the Cline extension: + ``const clineApi = vscode.extensions.getExtension( + 'saoudrizwan.claude-dev' + )?.exports;`` + - Call ``clineApi.startNewTask(prompt)`` to kick off the AIDLC + workflow without human interaction. + - Use ``clineApi.pressPrimaryButton()`` to auto-approve tool + invocations (file writes, terminal commands). + - Stream progress via ``clineApi.sendMessage()`` if multi-turn + interaction is needed. + - Wrap the test extension in an npm package that this adapter + launches via ``npx``. + """ + # -- 1. 
Prerequisite check ----------------------------------------- + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=msg, + ) + + workspace_dir: Path | None = None + start_time = time.monotonic() + + try: + # -- 2. Create temp workspace ----------------------------------- + workspace_dir = Path(tempfile.mkdtemp(prefix="aidlc-cline-")) + logger.info("Cline workspace created at %s", workspace_dir) + + # -- 3. Copy input documents ------------------------------------ + if not config.vision_path.is_file(): + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=f"vision.md not found at {config.vision_path}", + ) + shutil.copy2(config.vision_path, workspace_dir / "vision.md") + + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace_dir / "tech-env.md") + + # -- 4. Inject AIDLC rules into .clinerules/ -------------------- + self._inject_clinerules(config.rules_path, workspace_dir) + + # -- 5. Copy rules into aidlc-rules/ for prompt references ------ + self._inject_aidlc_rules(config.rules_path, workspace_dir) + + # -- 6. Write INSTRUCTIONS.md ----------------------------------- + prompt = config.prompt_template or render_prompt() + instructions_path = workspace_dir / "INSTRUCTIONS.md" + instructions_content = _build_instructions_md(prompt) + instructions_path.write_text(instructions_content, encoding="utf-8") + logger.info( + "INSTRUCTIONS.md written (%d bytes)", len(instructions_content) + ) + + # -- 7. Create .vscode/tasks.json stub -------------------------- + self._create_vscode_tasks(workspace_dir) + + # -- 8. 
Launch VS Code ------------------------------------------ + logger.info( + "Launching VS Code with workspace: %s (timeout=%ds)", + workspace_dir, + config.timeout_seconds, + ) + + # TODO: For full ClineAPI automation, replace this subprocess + # call with @vscode/test-electron launch that loads the bridge + # extension. The bridge extension would: + # 1. Activate and acquire ClineAPI from Cline extension. + # 2. Call startNewTask() with the AIDLC prompt. + # 3. Auto-approve via pressPrimaryButton() on each tool call. + # 4. Signal completion by writing a sentinel file. + # nosec B603 - Executing user's VSCode with Cline extension and validated workspace + # nosemgrep: dangerous-subprocess-use-audit + vscode_proc = subprocess.Popen( + [_VSCODE_CLI, "--wait", str(workspace_dir)], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # -- 9. Poll for aidlc-docs/ output ---------------------------- + completed = self._poll_for_output( + workspace_dir=workspace_dir, + process=vscode_proc, + timeout_seconds=config.timeout_seconds, + start_time=start_time, + ) + + elapsed = time.monotonic() - start_time + + # If VS Code is still running after we detect output, give it + # a moment then terminate gracefully. + if vscode_proc.poll() is None: + logger.info( + "VS Code still running after output detection; " + "waiting for user to close or timeout." + ) + try: + vscode_proc.wait(timeout=60) + except subprocess.TimeoutExpired: + logger.warning("Terminating VS Code process after grace period.") + vscode_proc.terminate() + try: + vscode_proc.wait(timeout=15) + except subprocess.TimeoutExpired: + vscode_proc.kill() + + # Capture any stdout/stderr from VS Code + raw_stdout = "" + raw_stderr = "" + try: + out, err = vscode_proc.communicate(timeout=5) + raw_stdout = (out or b"").decode("utf-8", errors="replace") + raw_stderr = (err or b"").decode("utf-8", errors="replace") + except (subprocess.TimeoutExpired, OSError): + pass + + # -- 10. 
Normalize output --------------------------------------- + self._normalize(workspace_dir, config.output_dir, elapsed) + aidlc_docs = _aidlc_docs_if_exists(config.output_dir) + + if completed and aidlc_docs: + logger.info( + "Cline run completed successfully in %.1fs", elapsed + ) + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs, + workspace_dir=workspace_dir, + elapsed_seconds=elapsed, + extra=_build_extra( + stdout=raw_stdout, + stderr=raw_stderr, + mode="semi-automated", + ), + ) + + # Partial or no output — still normalize whatever is available + error_msg = ( + "Cline run did not produce complete aidlc-docs/ output. " + "This may indicate the AIDLC workflow was not fully executed. " + "Check the INSTRUCTIONS.md in the workspace and run Cline " + "manually if needed." + ) + if elapsed >= config.timeout_seconds: + error_msg = ( + f"Cline run timed out after {config.timeout_seconds}s " + "without producing complete output." + ) + + logger.warning(error_msg) + return AdapterResult( + success=False, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + extra=_build_extra( + stdout=raw_stdout, + stderr=raw_stderr, + mode="semi-automated", + ), + ) + + except FileNotFoundError as exc: + elapsed = time.monotonic() - start_time + error_msg = f"Required file not found: {exc}" + logger.error(error_msg) + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + except Exception as exc: # noqa: BLE001 + elapsed = time.monotonic() - start_time + error_msg = f"Unexpected error during Cline run: {exc}" + logger.exception(error_msg) + + # Attempt to salvage any partial output. 
+ if workspace_dir and workspace_dir.is_dir(): + try: + self._normalize(workspace_dir, config.output_dir, elapsed) + except Exception: # noqa: BLE001 + logger.debug("Failed to normalize partial output", exc_info=True) + + return AdapterResult( + success=False, + output_dir=config.output_dir, + aidlc_docs_dir=_aidlc_docs_if_exists(config.output_dir), + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + + @staticmethod + def _inject_clinerules(rules_path: Path, workspace: Path) -> None: + """Inject AIDLC rules into the ``.clinerules/`` directory. + + Cline automatically reads ``.clinerules/`` files from the workspace + root and applies them as system-level instructions for every task. + """ + clinerules_dir = workspace / _RULES_SUBDIR + clinerules_dir.mkdir(parents=True, exist_ok=True) + + if rules_path.is_file(): + shutil.copy2(rules_path, clinerules_dir / rules_path.name) + logger.info( + "AIDLC rules (single file) written to %s", + clinerules_dir / rules_path.name, + ) + elif rules_path.is_dir(): + # Copy every file from the rules directory into .clinerules/ + for item in sorted(rules_path.rglob("*")): + if not item.is_file(): + continue + rel = item.relative_to(rules_path) + dest = clinerules_dir / rel + dest.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest) + logger.info( + "AIDLC rules (directory) written to %s", + clinerules_dir, + ) + else: + logger.warning( + "Rules path %s does not exist; .clinerules/ will be empty", + rules_path, + ) + + @staticmethod + def _inject_aidlc_rules(rules_path: Path, workspace: Path) -> None: + """Copy AIDLC rules into ``aidlc-rules/`` so the prompt template + can reference them (the standard AIDLC prompt tells the AI to read + rules from ``aidlc-rules/``). 
+ """ + aidlc_rules_dir = workspace / "aidlc-rules" + + if rules_path.is_file(): + aidlc_rules_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(rules_path, aidlc_rules_dir / rules_path.name) + elif rules_path.is_dir(): + if aidlc_rules_dir.exists(): + shutil.rmtree(aidlc_rules_dir) + shutil.copytree(rules_path, aidlc_rules_dir) + else: + logger.warning( + "Rules path %s does not exist; aidlc-rules/ will not be created", + rules_path, + ) + + @staticmethod + def _create_vscode_tasks(workspace: Path) -> None: + """Create a ``.vscode/tasks.json`` with a placeholder task. + + TODO: This task definition is a stub. Once the ClineAPI bridge + extension is built, this could contain a task that triggers the + extension automatically on workspace open via ``runOn``: + + "runOptions": { "runOn": "folderOpen" } + + For now it serves as documentation for the intended automation path. + """ + vscode_dir = workspace / ".vscode" + vscode_dir.mkdir(parents=True, exist_ok=True) + + tasks = { + "version": "2.0.0", + "tasks": [ + { + "label": "aidlc-cline-trigger", + "type": "shell", + "command": "echo", + "args": [ + "Open INSTRUCTIONS.md and paste its contents into " + "Cline chat to start the AIDLC workflow." + ], + "problemMatcher": [], + "group": "none", + "presentation": { + "reveal": "always", + "panel": "new", + }, + # TODO: Uncomment when bridge extension is ready: + # "runOptions": {"runOn": "folderOpen"}, + }, + ], + } + + tasks_path = vscode_dir / "tasks.json" + tasks_path.write_text( + json.dumps(tasks, indent=2) + "\n", encoding="utf-8" + ) + logger.debug("VS Code tasks.json written to %s", tasks_path) + + @staticmethod + def _poll_for_output( + workspace_dir: Path, + process: subprocess.Popen, + timeout_seconds: int, + start_time: float, + ) -> bool: + """Poll the workspace for ``aidlc-docs/`` output or process exit. 
+ + Returns True if a sufficient number of AIDLC output files were + detected, False if the process exited or the timeout was reached + before output was found. + + The polling loop checks two conditions each cycle: + 1. Whether the VS Code process has exited (user closed window). + 2. Whether ``aidlc-docs/`` exists and has enough files. + """ + aidlc_docs = workspace_dir / _AIDLC_DOCS_DIR + last_file_count = 0 + quiesce_start: float | None = None + + while True: + elapsed = time.monotonic() - start_time + + # Timeout guard + if elapsed >= timeout_seconds: + logger.warning( + "Polling timed out after %.1fs", elapsed + ) + # Terminate VS Code if still running + if process.poll() is None: + logger.info("Terminating VS Code due to timeout.") + process.terminate() + try: + process.wait(timeout=15) + except subprocess.TimeoutExpired: + process.kill() + # Still return True if we found output + return aidlc_docs.is_dir() and _count_files(aidlc_docs) >= _MIN_AIDLC_FILES_FOR_COMPLETION + + # Check if VS Code has exited + if process.poll() is not None: + logger.info( + "VS Code process exited (code=%d) after %.1fs", + process.returncode, + elapsed, + ) + # Check if output was produced before exit + if aidlc_docs.is_dir() and _count_files(aidlc_docs) >= _MIN_AIDLC_FILES_FOR_COMPLETION: + return True + # Even without complete output, return — VS Code is gone + return False + + # Check for aidlc-docs/ growth + if aidlc_docs.is_dir(): + current_count = _count_files(aidlc_docs) + + if current_count >= _MIN_AIDLC_FILES_FOR_COMPLETION: + # Files are present — check if output has stabilized + # (no new files for _QUIESCE_SECONDS). 
+ if current_count != last_file_count: + last_file_count = current_count + quiesce_start = time.monotonic() + logger.info( + "aidlc-docs/ has %d files; waiting for output to stabilize...", + current_count, + ) + elif quiesce_start and (time.monotonic() - quiesce_start) >= _QUIESCE_SECONDS: + logger.info( + "aidlc-docs/ output stabilized at %d files after %.0fs quiesce period.", + current_count, + _QUIESCE_SECONDS, + ) + return True + else: + logger.debug( + "aidlc-docs/ has %d files (need >= %d)", + current_count, + _MIN_AIDLC_FILES_FOR_COMPLETION, + ) + + # nosemgrep: arbitrary-sleep - Polling IDE for completion state + time.sleep(_POLL_INTERVAL_SECONDS) + + @staticmethod + def _normalize( + workspace_dir: Path, + output_dir: Path, + elapsed: float, + ) -> Path: + """Delegate to the shared normalizer.""" + return normalize_output( + source_dir=workspace_dir, + output_dir=output_dir, + adapter_name="cline", + model_hint="ide:cline", + elapsed_seconds=elapsed, + ) + + +# ---------------------------------------------------------------------- # +# Module-level helpers +# ---------------------------------------------------------------------- # + + +def _build_instructions_md(prompt: str) -> str: + """Build the contents of INSTRUCTIONS.md for the workspace. + + This file is placed in the workspace root so the operator (or future + automation) can easily copy the prompt into Cline's chat input. 
+ """ + return ( + "# AIDLC Instructions for Cline\n" + "\n" + "Copy the prompt below into the Cline chat panel to start the\n" + "AIDLC (AI Development Life Cycle) workflow.\n" + "\n" + "---\n" + "\n" + f"{prompt}\n" + ) + + +def _aidlc_docs_if_exists(output_dir: Path) -> Path | None: + """Return the aidlc-docs path if it was produced, else ``None``.""" + docs = output_dir / "aidlc-docs" + return docs if docs.is_dir() else None + + +def _count_files(directory: Path) -> int: + """Count all files (recursively) under *directory*.""" + return sum(1 for f in directory.rglob("*") if f.is_file()) + + +def _build_extra( + stdout: str, + stderr: str, + mode: str, +) -> dict: + """Build the ``extra`` dict for :class:`AdapterResult`.""" + extra: dict = {"automation_mode": mode} + if stdout: + extra["stdout_length"] = len(stdout) + if stderr: + extra["stderr_length"] = len(stderr) + extra["stderr_preview"] = stderr[:500] + return extra diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/copilot.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/copilot.py new file mode 100644 index 00000000..04f80b44 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/copilot.py @@ -0,0 +1,278 @@ +"""GitHub Copilot adapter — CLI-based headless automation.""" + +from __future__ import annotations + +import logging +import shutil +import subprocess +import tempfile +import time +from pathlib import Path + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.normalizer import normalize_output +from ide_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + + +class CopilotAdapter(IDEAdapter): + """Adapter for GitHub Copilot CLI. + + Supports two CLI entry points: + - ``copilot`` (standalone Copilot CLI) + - ``gh copilot`` (GitHub CLI extension, used as fallback) + + Headless mode is engaged via the ``-p`` flag. 
The ``--allow-all-tools`` + flag auto-approves file writes and shell commands so the AIDLC workflow + can run without manual intervention. + + AIDLC rules are injected into the workspace via + ``.github/copilot-instructions.md``, which Copilot reads automatically. + """ + + def __init__(self) -> None: + self._cli_cmd: list[str] | None = None + + # ------------------------------------------------------------------ + # IDEAdapter interface + # ------------------------------------------------------------------ + + @property + def name(self) -> str: + return "Copilot" + + def check_prerequisites(self) -> tuple[bool, str]: + """Check for ``copilot`` in PATH, falling back to ``gh copilot``. + + Returns: + (ok, message) -- True with the resolved command, or False with + a description of what is missing. + """ + # Prefer the standalone copilot CLI + if shutil.which("copilot"): + self._cli_cmd = ["copilot"] + return True, "Copilot CLI found (`copilot`)" + + # Fall back to GitHub CLI with the copilot extension + if shutil.which("gh"): + try: + # nosec B603, B607 - Static gh copilot version check + result = subprocess.run( + ["gh", "copilot", "--version"], + capture_output=True, + text=True, + timeout=15, + ) + if result.returncode == 0: + self._cli_cmd = ["gh", "copilot"] + return True, "GitHub CLI with Copilot extension found (`gh copilot`)" + except (subprocess.TimeoutExpired, OSError): + pass + + return False, ( + "`gh` is installed but the Copilot extension is missing. " + "Install it with: gh extension install github/gh-copilot" + ) + + return False, ( + "Neither `copilot` nor `gh` found in PATH. " + "Install the Copilot CLI (https://docs.github.com/en/copilot/using-github-copilot/using-github-copilot-in-the-command-line) " + "or install the GitHub CLI with the Copilot extension." + ) + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC workflow through the Copilot CLI. + + Steps: + 1. Ensure prerequisites are met (resolve CLI command). 
+ 2. Create a temporary workspace directory. + 3. Copy ``vision.md`` and ``tech-env.md`` into the workspace. + 4. Copy AIDLC rules into ``.github/copilot-instructions.md``. + 5. Build the prompt via :func:`render_prompt`. + 6. Execute ``copilot -p "<prompt>" --allow-all-tools`` as a + subprocess inside the workspace. + 7. Normalize the workspace output to the evaluation folder layout. + 8. Return an :class:`AdapterResult`. + """ + # 0. Make sure we know which CLI to use + if self._cli_cmd is None: + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=f"Prerequisites not met: {msg}", + ) + + assert self._cli_cmd is not None # guaranteed after check_prerequisites + + # 1. Create a temporary workspace + workspace = Path(tempfile.mkdtemp(prefix="copilot-aidlc-")) + logger.info("Copilot workspace: %s", workspace) + + try: + # 2. Copy vision.md into the workspace + if not config.vision_path.is_file(): + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=f"vision.md not found at {config.vision_path}", + ) + shutil.copy2(config.vision_path, workspace / "vision.md") + + # 3. Copy tech-env.md into the workspace (optional) + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace / "tech-env.md") + + # 4. Inject AIDLC rules via .github/copilot-instructions.md + self._inject_rules(config.rules_path, workspace) + + # 5. Build the prompt + prompt = config.prompt_template or render_prompt() + + # 6. 
Execute the Copilot CLI + start = time.monotonic() + stdout, stderr, returncode = self._execute_cli( + prompt=prompt, + cwd=workspace, + timeout=config.timeout_seconds, + ) + elapsed = time.monotonic() - start + + logger.info( + "Copilot CLI exited with code %d after %.1fs", + returncode, + elapsed, + ) + + if returncode != 0: + error_detail = stderr.strip() or stdout.strip() or f"exit code {returncode}" + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"Copilot CLI failed: {error_detail}", + elapsed_seconds=elapsed, + extra={"stdout": stdout, "stderr": stderr, "returncode": returncode}, + ) + + # 7. Normalize output to the evaluation folder layout + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name.lower(), + model_hint="ide:copilot", + elapsed_seconds=elapsed, + ) + + aidlc_docs_dir = config.output_dir / "aidlc-docs" + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs_dir if aidlc_docs_dir.is_dir() else None, + workspace_dir=workspace, + elapsed_seconds=elapsed, + extra={"stdout": stdout, "stderr": stderr, "returncode": returncode}, + ) + + except subprocess.TimeoutExpired: + elapsed = time.monotonic() - start # type: ignore[possibly-undefined] + logger.error("Copilot CLI timed out after %ds", config.timeout_seconds) + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"Copilot CLI timed out after {config.timeout_seconds}s", + elapsed_seconds=elapsed, + ) + + except Exception as exc: + logger.exception("Unexpected error running Copilot adapter") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"Unexpected error: {exc}", + ) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + def 
_inject_rules(self, rules_path: Path, workspace: Path) -> None: + """Copy AIDLC rules into the workspace. + + Copilot reads custom instructions from ``.github/copilot-instructions.md``. + The rules are also placed under ``aidlc-rules/`` so the standard AIDLC + prompt can reference them. + """ + # -- .github/copilot-instructions.md (Copilot picks this up automatically) + instructions_dir = workspace / ".github" + instructions_dir.mkdir(parents=True, exist_ok=True) + instructions_file = instructions_dir / "copilot-instructions.md" + + if rules_path.is_file(): + shutil.copy2(rules_path, instructions_file) + elif rules_path.is_dir(): + # Concatenate all markdown files in the rules directory into a + # single instructions file, preserving order. + parts: list[str] = [] + for md_file in sorted(rules_path.rglob("*.md")): + parts.append(md_file.read_text(errors="replace")) + instructions_file.write_text("\n\n---\n\n".join(parts)) + else: + logger.warning("Rules path %s not found; skipping instructions injection", rules_path) + + # -- aidlc-rules/ directory (referenced by the prompt template) + aidlc_rules_dir = workspace / "aidlc-rules" + if rules_path.is_file(): + aidlc_rules_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(rules_path, aidlc_rules_dir / rules_path.name) + elif rules_path.is_dir(): + if aidlc_rules_dir.exists(): + shutil.rmtree(aidlc_rules_dir) + shutil.copytree(rules_path, aidlc_rules_dir) + + def _execute_cli( + self, + prompt: str, + cwd: Path, + timeout: int, + ) -> tuple[str, str, int]: + """Run the Copilot CLI in headless (``-p``) mode. + + Args: + prompt: The full AIDLC prompt text. + cwd: Working directory (the prepared workspace). + timeout: Maximum wall-clock seconds before the process is killed. + + Returns: + (stdout, stderr, returncode) + + Raises: + subprocess.TimeoutExpired: If the process exceeds *timeout*. 
+ """ + assert self._cli_cmd is not None + + cmd = [ + *self._cli_cmd, + "-p", + prompt, + "--allow-all-tools", + ] + + logger.info("Executing: %s (cwd=%s, timeout=%ds)", cmd[0], cwd, timeout) + logger.debug("Full command: %s", cmd) + + # nosec B603 - Executing user's GitHub Copilot CLI with validated configuration + # nosemgrep: dangerous-subprocess-use-audit + proc = subprocess.run( + cmd, + cwd=cwd, + capture_output=True, + text=True, + timeout=timeout, + ) + + return proc.stdout, proc.stderr, proc.returncode diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/cursor.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/cursor.py new file mode 100644 index 00000000..af7e6ceb --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/cursor.py @@ -0,0 +1,306 @@ +"""Cursor IDE adapter — AI-first code editor with headless CLI support. + +Cursor's standalone CLI tool ``agent`` supports fully headless operation via +the ``-p`` (print) flag with structured JSON output. AIDLC rules are injected +through ``.cursor/rules/`` markdown files. +""" + +from __future__ import annotations + +import json +import logging +import shutil +import subprocess +import tempfile +import time +from pathlib import Path + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.normalizer import normalize_output +from ide_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +_AGENT_CLI = "agent" +_RULES_SUBDIR = Path(".cursor") / "rules" +_RULES_FILENAME = "aidlc-rules.mdc" + + +class CursorAdapter(IDEAdapter): + """Adapter for Cursor IDE. + + Cursor is a VS Code fork with built-in AI chat. 
Its standalone ``agent`` + CLI supports headless/non-interactive mode that is purpose-built for + scripted automation: + + - ``agent -p "prompt" --force --output-format json`` + - ``--force`` allows file modifications without interactive approval + - ``--output-format json`` provides structured output for parsing + - AIDLC rules are injected via ``.cursor/rules/`` directory + """ + + # ------------------------------------------------------------------ # + # IDEAdapter interface + # ------------------------------------------------------------------ # + + @property + def name(self) -> str: + return "Cursor" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify the ``agent`` CLI is available on PATH.""" + if shutil.which(_AGENT_CLI): + return True, f"Cursor CLI ('{_AGENT_CLI}') found in PATH" + return ( + False, + f"Cursor CLI ('{_AGENT_CLI}') not found in PATH. " + "Install the Cursor agent CLI first.", + ) + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC process through Cursor's headless CLI. + + Steps: + 1. Verify prerequisites. + 2. Create a temporary workspace directory. + 3. Copy vision.md and (optionally) tech-env.md into the workspace. + 4. Create ``.cursor/rules/aidlc-rules.mdc`` with AIDLC rules content. + 5. Build the AIDLC prompt via ``render_prompt()``. + 6. Run ``agent -p "<prompt>" --force --output-format json``. + 7. Parse the JSON output (if available). + 8. Normalize output into the evaluation-compatible run folder layout. + 9. Return an :class:`AdapterResult`. + """ + # -- 1. Prerequisite check ----------------------------------------- + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=msg, + ) + + workspace_dir: Path | None = None + start_time = time.monotonic() + + try: + # -- 2. 
Create temp workspace ----------------------------------- + workspace_dir = Path( + tempfile.mkdtemp(prefix="aidlc-cursor-") + ) + logger.info("Cursor workspace created at %s", workspace_dir) + + # -- 3. Copy input documents ------------------------------------ + shutil.copy2(config.vision_path, workspace_dir / "vision.md") + + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace_dir / "tech-env.md") + + # -- 4. Inject AIDLC rules -------------------------------------- + rules_dir = workspace_dir / _RULES_SUBDIR + rules_dir.mkdir(parents=True, exist_ok=True) + rules_dest = rules_dir / _RULES_FILENAME + + rules_content = config.rules_path.read_text(encoding="utf-8") + rules_dest.write_text(rules_content, encoding="utf-8") + logger.info( + "AIDLC rules written to %s (%d bytes)", + rules_dest, + len(rules_content), + ) + + # -- 5. Build the prompt ---------------------------------------- + prompt = render_prompt( + vision_path="vision.md", + tech_env_path="tech-env.md", + ) + + # If the caller supplied a custom template, prefer that. + if config.prompt_template: + prompt = config.prompt_template + + # -- 6. Execute the agent CLI ----------------------------------- + cmd = [ + _AGENT_CLI, + "-p", + prompt, + "--force", + "--output-format", + "json", + ] + logger.info("Running: %s (timeout=%ds)", cmd[0], config.timeout_seconds) + + # nosec B603 - Executing user's Cursor IDE with validated configuration + # nosemgrep: dangerous-subprocess-use-audit + proc = subprocess.run( + cmd, + cwd=str(workspace_dir), + capture_output=True, + text=True, + timeout=config.timeout_seconds, + ) + + elapsed = time.monotonic() - start_time + + # -- 7. Parse output -------------------------------------------- + raw_stdout = proc.stdout or "" + raw_stderr = proc.stderr or "" + parsed_json = _try_parse_json(raw_stdout) + + if proc.returncode != 0: + error_detail = ( + f"agent CLI exited with code {proc.returncode}. 
" + f"stderr: {raw_stderr[:2000]}" + ) + logger.error(error_detail) + + # Even on failure, attempt to normalize whatever was produced. + self._normalize(workspace_dir, config.output_dir, elapsed) + + return AdapterResult( + success=False, + output_dir=config.output_dir, + aidlc_docs_dir=_aidlc_docs_if_exists(config.output_dir), + workspace_dir=workspace_dir, + error=error_detail, + elapsed_seconds=elapsed, + extra=_build_extra(raw_stdout, raw_stderr, parsed_json), + ) + + # -- 8. Normalize output ---------------------------------------- + self._normalize(workspace_dir, config.output_dir, elapsed) + + logger.info( + "Cursor run completed successfully in %.1fs", elapsed + ) + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=_aidlc_docs_if_exists(config.output_dir), + workspace_dir=workspace_dir, + elapsed_seconds=elapsed, + extra=_build_extra(raw_stdout, raw_stderr, parsed_json), + ) + + except subprocess.TimeoutExpired: + elapsed = time.monotonic() - start_time + error_msg = ( + f"Cursor agent CLI timed out after {config.timeout_seconds}s" + ) + logger.error(error_msg) + + # Attempt to salvage any partial output that was written to disk. 
+ if workspace_dir and workspace_dir.is_dir(): + self._normalize(workspace_dir, config.output_dir, elapsed) + + return AdapterResult( + success=False, + output_dir=config.output_dir, + aidlc_docs_dir=_aidlc_docs_if_exists(config.output_dir), + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + except FileNotFoundError as exc: + elapsed = time.monotonic() - start_time + error_msg = f"Required file not found: {exc}" + logger.error(error_msg) + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + except Exception as exc: # noqa: BLE001 + elapsed = time.monotonic() - start_time + error_msg = f"Unexpected error during Cursor run: {exc}" + logger.exception(error_msg) + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + + @staticmethod + def _normalize( + workspace_dir: Path, + output_dir: Path, + elapsed: float, + ) -> Path: + """Delegate to the shared normalizer.""" + return normalize_output( + source_dir=workspace_dir, + output_dir=output_dir, + adapter_name="cursor", + model_hint="ide:cursor", + elapsed_seconds=elapsed, + ) + + +# ---------------------------------------------------------------------- # +# Module-level helpers +# ---------------------------------------------------------------------- # + + +def _try_parse_json(raw: str) -> dict | None: + """Attempt to parse the agent CLI's JSON output. + + The CLI may emit mixed content (text + JSON) so we try progressively + less strict strategies: + 1. Parse the entire stdout as JSON. + 2. Find the first ``{`` / last ``}`` and parse that substring. 
+ """ + if not raw.strip(): + return None + + # Strategy 1: full stdout is valid JSON + try: + return json.loads(raw) + except (json.JSONDecodeError, ValueError): + pass + + # Strategy 2: extract the outermost JSON object + first_brace = raw.find("{") + last_brace = raw.rfind("}") + if first_brace != -1 and last_brace > first_brace: + try: + return json.loads(raw[first_brace : last_brace + 1]) + except (json.JSONDecodeError, ValueError): + pass + + return None + + +def _aidlc_docs_if_exists(output_dir: Path) -> Path | None: + """Return the aidlc-docs path if it was produced, else ``None``.""" + docs = output_dir / "aidlc-docs" + return docs if docs.is_dir() else None + + +def _build_extra( + stdout: str, + stderr: str, + parsed: dict | None, +) -> dict: + """Build the ``extra`` dict for :class:`AdapterResult`.""" + extra: dict = {} + if stdout: + extra["stdout_length"] = len(stdout) + if stderr: + extra["stderr_length"] = len(stderr) + extra["stderr_preview"] = stderr[:500] + if parsed is not None: + extra["parsed_json"] = parsed + return extra diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/kiro.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/kiro.py new file mode 100644 index 00000000..b98efd4b --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/kiro.py @@ -0,0 +1,283 @@ +"""Kiro IDE adapter — AWS AI-powered IDE with CLI automation via pexpect. + +Kiro does not have a headless ``-p`` mode like Cursor. Automation is +achieved by spawning an interactive ``kiro-cli`` session inside a PTY and +driving it with the ``pexpect`` library. + +AIDLC rules are injected through Kiro's steering-file mechanism by writing +them to ``.kiro/steering/aidlc-rules.md`` inside the workspace. 
+""" + +from __future__ import annotations + +import importlib +import logging +import shutil +import tempfile +import time +from pathlib import Path + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.normalizer import normalize_output +from ide_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- + +_KIRO_CLI = "kiro-cli" + +# The prompt marker Kiro CLI emits when it is ready for input. +# Adjust if the actual CLI uses a different prompt indicator. +_PROMPT_PATTERN = r"[>$#] " + +# How often (in seconds) to poll the workspace for the aidlc-docs/ directory. +_POLL_INTERVAL = 5 + +# Minimum number of expected files inside aidlc-docs/ before we consider the +# AIDLC run "complete". A real AIDLC run produces many files; we use a low +# threshold so the adapter returns as soon as at least *some* output appears. +_MIN_AIDLC_FILES = 1 + +# Grace period (seconds) after the last new file is created before we decide +# that the agent has stopped producing output. +_QUIESCENCE_SECONDS = 60 + + +class KiroAdapter(IDEAdapter): + """Adapter for Kiro (AWS AI IDE). + + Uses ``pexpect`` to drive an interactive ``kiro-cli`` terminal session. + + Automation flow + --------------- + 1. Create a temporary workspace directory. + 2. Copy ``vision.md`` and ``tech-env.md`` into the workspace. + 3. Write AIDLC rules into ``.kiro/steering/aidlc-rules.md``. + 4. Build the evaluation prompt via :func:`render_prompt`. + 5. Spawn ``kiro-cli`` inside the workspace using ``pexpect``. + 6. Send the prompt and wait for output. + 7. Monitor the workspace for the ``aidlc-docs/`` directory. + 8. Normalize output via :func:`normalize_output`. + 9. Return an :class:`AdapterResult`. 
+ """ + + @property + def name(self) -> str: + return "Kiro" + + # ------------------------------------------------------------------ + # Prerequisites + # ------------------------------------------------------------------ + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify that ``kiro-cli`` is on *PATH* and ``pexpect`` is installed.""" + issues: list[str] = [] + + if not shutil.which(_KIRO_CLI): + issues.append( + f"'{_KIRO_CLI}' not found in PATH. " + "Install the Kiro CLI first (https://kiro.dev)." + ) + + if importlib.util.find_spec("pexpect") is None: + issues.append( + "'pexpect' Python package is not installed. " + "Install it with: pip install pexpect" + ) + + if issues: + return False, " | ".join(issues) + + return True, f"Kiro CLI ('{_KIRO_CLI}') found and pexpect is available" + + # ------------------------------------------------------------------ + # Run + # ------------------------------------------------------------------ + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the full AIDLC workflow through the Kiro CLI.""" + + # -- Pre-flight checks ---------------------------------------- + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=f"Prerequisites not met: {msg}", + ) + + # Late-import so the module can be loaded even when pexpect is + # absent (check_prerequisites will flag the problem instead). + import pexpect # noqa: E402 (late import intentional) + + start_time = time.monotonic() + + # -- 1. Create temporary workspace ---------------------------- + workspace = Path(tempfile.mkdtemp(prefix="kiro-aidlc-")) + logger.info("Kiro workspace: %s", workspace) + + try: + # -- 2. Copy input documents ------------------------------ + shutil.copy2(config.vision_path, workspace / "vision.md") + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace / "tech-env.md") + + # -- 3. 
Inject AIDLC rules via steering files ------------- + steering_dir = workspace / ".kiro" / "steering" + steering_dir.mkdir(parents=True, exist_ok=True) + + rules_content = config.rules_path.read_text(encoding="utf-8") + (steering_dir / "aidlc-rules.md").write_text( + rules_content, encoding="utf-8" + ) + logger.info("AIDLC rules written to %s", steering_dir / "aidlc-rules.md") + + # -- 4. Build the prompt ---------------------------------- + prompt = render_prompt( + vision_path="vision.md", + tech_env_path="tech-env.md", + ) + if config.prompt_template: + prompt = config.prompt_template + + # -- 5. Spawn kiro-cli in the workspace ------------------- + logger.info("Spawning %s ...", _KIRO_CLI) + child = pexpect.spawn( + _KIRO_CLI, + cwd=str(workspace), + encoding="utf-8", + timeout=config.timeout_seconds, + ) + + # Log all CLI output for debugging / audit purposes. + log_path = workspace / ".kiro-session.log" + child.logfile_read = log_path.open("w", encoding="utf-8") + + try: + # Wait for the initial prompt. + child.expect(_PROMPT_PATTERN, timeout=60) + logger.info("Kiro CLI ready — sending AIDLC prompt") + + # -- 6. Send the prompt and monitor ------------------- + child.sendline(prompt) + + # Monitor the workspace for aidlc-docs/ completion. + aidlc_docs_dir = workspace / "aidlc-docs" + last_change_time = time.monotonic() + last_file_count = 0 + completed = False + + while True: + elapsed = time.monotonic() - start_time + if elapsed >= config.timeout_seconds: + logger.warning( + "Timeout reached (%ds). Stopping Kiro session.", + config.timeout_seconds, + ) + break + + # Non-blocking read: consume any available output so the + # PTY buffer doesn't fill up and block the child process. + try: + child.read_nonblocking(size=4096, timeout=_POLL_INTERVAL) + except pexpect.TIMEOUT: + pass # Nothing new — expected during long-running tasks. 
+ except pexpect.EOF: + logger.info("Kiro CLI session ended (EOF).") + completed = True + break + + # Check whether aidlc-docs/ has appeared / grown. + if aidlc_docs_dir.is_dir(): + current_count = sum( + 1 for _ in aidlc_docs_dir.rglob("*") if _.is_file() + ) + if current_count != last_file_count: + last_file_count = current_count + last_change_time = time.monotonic() + logger.info( + "aidlc-docs/ now has %d file(s)", current_count + ) + + # Quiescence check: if enough files exist and no new + # files have appeared for _QUIESCENCE_SECONDS, treat + # the run as complete. + idle = time.monotonic() - last_change_time + if ( + current_count >= _MIN_AIDLC_FILES + and idle >= _QUIESCENCE_SECONDS + ): + logger.info( + "aidlc-docs/ quiescent for %ds with %d file(s) " + "— treating run as complete.", + int(idle), + current_count, + ) + completed = True + break + + finally: + # Ensure the child process is terminated cleanly. + if child.isalive(): + child.sendline("exit") + try: + child.expect(pexpect.EOF, timeout=15) + except (pexpect.TIMEOUT, pexpect.EOF): + pass + if child.isalive(): + child.terminate(force=True) + + if child.logfile_read and not child.logfile_read.closed: + child.logfile_read.close() + + elapsed_seconds = time.monotonic() - start_time + + # -- 7. Normalize output ---------------------------------- + config.output_dir.mkdir(parents=True, exist_ok=True) + normalize_output( + source_dir=workspace, + output_dir=config.output_dir, + adapter_name=self.name.lower(), + elapsed_seconds=elapsed_seconds, + ) + + aidlc_docs_out = config.output_dir / "aidlc-docs" + has_docs = aidlc_docs_out.is_dir() and any(aidlc_docs_out.iterdir()) + + if completed and has_docs: + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs_out, + workspace_dir=workspace, + elapsed_seconds=elapsed_seconds, + ) + + # Partial or no output — report what we got. + error_detail = ( + "Kiro session ended but no aidlc-docs/ output was produced." 
+ if not has_docs + else "Kiro session ended before the AIDLC workflow completed (timeout or early exit)." + ) + return AdapterResult( + success=False, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs_out if has_docs else None, + workspace_dir=workspace, + error=error_detail, + elapsed_seconds=elapsed_seconds, + ) + + except Exception as exc: + elapsed_seconds = time.monotonic() - start_time + logger.exception("Kiro adapter run failed") + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace, + error=f"Kiro adapter error: {exc}", + elapsed_seconds=elapsed_seconds, + ) diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/windsurf.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/windsurf.py new file mode 100644 index 00000000..e38101ee --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/adapters/windsurf.py @@ -0,0 +1,615 @@ +"""Windsurf adapter — Codeium's AI IDE (VS Code fork with Cascade). + +Windsurf is a VS Code fork with Cascade, Codeium's agentic AI assistant. +Unlike Cursor's headless ``agent`` CLI, Windsurf's ``windsurf`` CLI can only +*launch* the IDE — there is no headless/scripted chat mode. + +Automation strategy (semi-automated): + 1. Prepare a temporary workspace with all AIDLC inputs. + 2. Inject AIDLC rules via ``AGENTS.md`` (Cascade reads this automatically). + 3. Create a Cascade Workflow (``.windsurf/workflows/aidlc-eval.md``) that + the user (or future GUI automation) triggers via ``/aidlc-eval`` in chat. + 4. Launch Windsurf pointed at the workspace directory. + 5. Write ``INSTRUCTIONS.md`` telling the operator how to trigger the workflow. + 6. Monitor the workspace for ``aidlc-docs/`` output via a polling file watcher. + 7. Normalize output and return an :class:`AdapterResult`. 
+ +Full end-to-end automation would require GUI-level tools such as +``vscode-extension-tester`` (ExTester) by Red Hat or Electron/Playwright +automation to drive the Cascade chat panel programmatically. +""" + +from __future__ import annotations + +import logging +import shutil +import subprocess +import tempfile +import time +from pathlib import Path + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.normalizer import normalize_output +from ide_harness.prompt_template import render_prompt + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +_WINDSURF_CLI = "windsurf" + +# Cascade reads AGENTS.md from the workspace root for directory-scoped rules. +_AGENTS_MD = "AGENTS.md" + +# Workflow location inside the workspace — Cascade discovers these automatically. +_WORKFLOW_DIR = Path(".windsurf") / "workflows" +_WORKFLOW_FILENAME = "aidlc-eval.md" + +# Polling configuration for the file-watcher loop. +_POLL_INTERVAL_SECONDS = 10 +_MIN_STABLE_SECONDS = 60 # Require no new writes for this long before declaring done. + +# Expected sentinel files/directories that signal AIDLC phases completed. +_AIDLC_DOCS_DIR = "aidlc-docs" +_INCEPTION_SENTINELS = [ + "aidlc-docs/inception/requirements/requirements.md", + "aidlc-docs/inception/plans/execution-plan.md", + "aidlc-docs/inception/application-design/components.md", +] +_CONSTRUCTION_SENTINELS = [ + "aidlc-docs/construction/plans", + "aidlc-docs/construction/build-and-test/build-and-test-summary.md", +] +_TRACKING_SENTINELS = [ + "aidlc-docs/aidlc-state.md", + "aidlc-docs/audit.md", +] + + +class WindsurfAdapter(IDEAdapter): + """Adapter for Windsurf (Codeium AI IDE). + + Windsurf is a VS Code fork with Cascade AI. 
Because Windsurf lacks a + headless scripted-chat mode, this adapter takes a **semi-automated** + approach: + + - It fully prepares the workspace (input files, AGENTS.md rules, + Cascade workflow definition). + - It launches Windsurf pointed at the workspace. + - It monitors the filesystem for ``aidlc-docs/`` output. + - A human operator (or future GUI automation) triggers the + ``/aidlc-eval`` workflow inside Cascade's chat panel. + + Semi-automated mode is the default. The ``automation_mode`` constructor + parameter is reserved for future ``"extester"`` / ``"playwright"`` modes + that would drive the GUI programmatically. + """ + + def __init__(self, automation_mode: str = "semi-auto") -> None: + self._automation_mode = automation_mode + + # ------------------------------------------------------------------ # + # IDEAdapter interface + # ------------------------------------------------------------------ # + + @property + def name(self) -> str: + return "Windsurf" + + def check_prerequisites(self) -> tuple[bool, str]: + """Verify the ``windsurf`` CLI is available on PATH.""" + if shutil.which(_WINDSURF_CLI): + return True, f"Windsurf CLI ('{_WINDSURF_CLI}') found in PATH" + return ( + False, + f"Windsurf CLI ('{_WINDSURF_CLI}') not found in PATH. " + "Install Windsurf IDE from https://windsurf.com first.", + ) + + def run(self, config: AdapterConfig) -> AdapterResult: + """Execute the AIDLC process through Windsurf. + + Steps: + 1. Verify prerequisites. + 2. Create a temporary workspace directory. + 3. Copy vision.md and tech-env.md into the workspace. + 4. Copy AIDLC rules into the workspace (``aidlc-rules/``). + 5. Write ``AGENTS.md`` at the workspace root with AIDLC instructions + (Cascade reads this file automatically for directory-scoped context). + 6. Create ``.windsurf/workflows/aidlc-eval.md`` — a Cascade Workflow + that the operator triggers via ``/aidlc-eval`` in the chat panel. + 7. Write ``INSTRUCTIONS.md`` telling the operator what to do. + 8. 
Launch Windsurf pointed at the workspace. + 9. Poll the workspace for ``aidlc-docs/`` output until completion + or timeout. + 10. Normalize output into the evaluation-compatible layout. + 11. Return an :class:`AdapterResult`. + """ + # -- 1. Prerequisite check ----------------------------------------- + ok, msg = self.check_prerequisites() + if not ok: + return AdapterResult( + success=False, + output_dir=config.output_dir, + error=msg, + ) + + workspace_dir: Path | None = None + start_time = time.monotonic() + + try: + # -- 2. Create temp workspace ----------------------------------- + workspace_dir = Path(tempfile.mkdtemp(prefix="aidlc-windsurf-")) + logger.info("Windsurf workspace created at %s", workspace_dir) + + # -- 3. Copy input documents ------------------------------------ + shutil.copy2(config.vision_path, workspace_dir / "vision.md") + + if config.tech_env_path and config.tech_env_path.is_file(): + shutil.copy2(config.tech_env_path, workspace_dir / "tech-env.md") + + # -- 4. Copy AIDLC rules ---------------------------------------- + rules_dest = workspace_dir / "aidlc-rules" + if config.rules_path.is_dir(): + shutil.copytree(config.rules_path, rules_dest) + else: + # Single file — wrap it in a directory. + rules_dest.mkdir(parents=True, exist_ok=True) + shutil.copy2(config.rules_path, rules_dest / config.rules_path.name) + logger.info("AIDLC rules copied to %s", rules_dest) + + # -- 5. Write AGENTS.md ----------------------------------------- + agents_md_content = _build_agents_md(config) + agents_path = workspace_dir / _AGENTS_MD + agents_path.write_text(agents_md_content, encoding="utf-8") + logger.info( + "AGENTS.md written to %s (%d bytes)", + agents_path, + len(agents_md_content), + ) + + # -- 6. 
Create Cascade Workflow --------------------------------- + workflow_dir = workspace_dir / _WORKFLOW_DIR + workflow_dir.mkdir(parents=True, exist_ok=True) + workflow_path = workflow_dir / _WORKFLOW_FILENAME + workflow_content = _build_cascade_workflow(config) + workflow_path.write_text(workflow_content, encoding="utf-8") + logger.info("Cascade workflow written to %s", workflow_path) + + # -- 7. Write INSTRUCTIONS.md ----------------------------------- + instructions_content = _build_instructions_md() + instructions_path = workspace_dir / "INSTRUCTIONS.md" + instructions_path.write_text(instructions_content, encoding="utf-8") + logger.info("INSTRUCTIONS.md written to %s", instructions_path) + + # -- 8. Launch Windsurf ----------------------------------------- + logger.info( + "Launching Windsurf on workspace %s (mode=%s)", + workspace_dir, + self._automation_mode, + ) + windsurf_proc = self._launch_windsurf(workspace_dir) + + # -- 9. Monitor for output -------------------------------------- + # + # TODO: In "extester" or "playwright" mode, this is where GUI + # automation would: + # a. Wait for Windsurf to finish loading. + # b. Open the Cascade chat panel (Ctrl+Shift+L or Cmd+Shift+L). + # c. Type "/aidlc-eval" and press Enter to trigger the workflow. + # d. Optionally monitor the chat panel for completion signals. + # + # In semi-auto mode we simply poll the filesystem and rely on the + # human operator to trigger the workflow. + + logger.info( + "Monitoring workspace for aidlc-docs/ output " + "(timeout=%ds, poll=%ds, stable=%ds)", + config.timeout_seconds, + _POLL_INTERVAL_SECONDS, + _MIN_STABLE_SECONDS, + ) + + completed = self._poll_for_output( + workspace_dir=workspace_dir, + timeout_seconds=config.timeout_seconds, + start_time=start_time, + ) + + elapsed = time.monotonic() - start_time + + # -- 10. 
Normalize output --------------------------------------- + self._normalize(workspace_dir, config.output_dir, elapsed) + + aidlc_docs = _aidlc_docs_if_exists(config.output_dir) + + if completed: + logger.info( + "Windsurf run completed successfully in %.1fs", elapsed + ) + return AdapterResult( + success=True, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs, + workspace_dir=workspace_dir, + elapsed_seconds=elapsed, + extra=self._build_extra(windsurf_proc, completed=True), + ) + else: + # Timed out — salvage whatever partial output was produced. + partial_files = _count_aidlc_files(workspace_dir) + error_msg = ( + f"Timed out after {config.timeout_seconds}s waiting for " + f"aidlc-docs/ output. Partial files found: {partial_files}" + ) + logger.warning(error_msg) + return AdapterResult( + success=False, + output_dir=config.output_dir, + aidlc_docs_dir=aidlc_docs, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + extra=self._build_extra( + windsurf_proc, + completed=False, + partial_files=partial_files, + ), + ) + + except FileNotFoundError as exc: + elapsed = time.monotonic() - start_time + error_msg = f"Required file not found: {exc}" + logger.error(error_msg) + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + except Exception as exc: # noqa: BLE001 + elapsed = time.monotonic() - start_time + error_msg = f"Unexpected error during Windsurf run: {exc}" + logger.exception(error_msg) + + # Attempt to salvage any partial output. 
+ if workspace_dir and workspace_dir.is_dir(): + try: + self._normalize(workspace_dir, config.output_dir, elapsed) + except Exception: # noqa: BLE001 + logger.warning("Failed to normalize partial output", exc_info=True) + + return AdapterResult( + success=False, + output_dir=config.output_dir, + workspace_dir=workspace_dir, + error=error_msg, + elapsed_seconds=elapsed, + ) + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + + @staticmethod + def _launch_windsurf(workspace_dir: Path) -> subprocess.Popen: + """Launch Windsurf IDE pointed at the given workspace. + + The ``windsurf`` CLI opens the IDE as a detached GUI process. + We use ``Popen`` (non-blocking) because the CLI returns quickly + while the IDE continues running. + + TODO: For ExTester/Playwright automation, this method should also: + - Set ``ELECTRON_ENABLE_LOGGING=1`` for debug output. + - Potentially pass ``--disable-gpu`` for headless CI environments. + - Pass ``--extensions-dir`` to load a test-driver extension. + """ + cmd = [_WINDSURF_CLI, str(workspace_dir)] + logger.info("Launching: %s", " ".join(cmd)) + + # nosec B603 - Executing user's Windsurf IDE with validated workspace path + # nosemgrep: dangerous-subprocess-use-audit + proc = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # Give the IDE a moment to start initializing. + # nosemgrep: arbitrary-sleep - Required delay for IDE initialization + time.sleep(2) + return proc + + def _poll_for_output( + self, + workspace_dir: Path, + timeout_seconds: int, + start_time: float, + ) -> bool: + """Poll the workspace for aidlc-docs/ output until completion or timeout. 
+ + Completion is declared when: + - The ``aidlc-docs/`` directory exists, AND + - Key sentinel files from both inception and construction phases are + present, AND + - No new files have been written for ``_MIN_STABLE_SECONDS`` (indicating + Cascade has likely finished generating output). + + Returns: + True if output appears complete, False if we timed out. + """ + last_change_time: float | None = None + last_file_count = 0 + + while True: + elapsed = time.monotonic() - start_time + if elapsed >= timeout_seconds: + return False + + docs_dir = workspace_dir / _AIDLC_DOCS_DIR + + if not docs_dir.is_dir(): + # nosemgrep: arbitrary-sleep - Polling IDE for output directory creation + time.sleep(_POLL_INTERVAL_SECONDS) + continue + + # Count current files. + current_files = list(docs_dir.rglob("*")) + current_file_count = sum(1 for f in current_files if f.is_file()) + + if current_file_count != last_file_count: + # New files appeared — reset the stability timer. + last_change_time = time.monotonic() + last_file_count = current_file_count + logger.debug( + "aidlc-docs: %d files detected (%.0fs elapsed)", + current_file_count, + elapsed, + ) + + # Check for sentinel files indicating all phases are done. 
+ sentinels_present = _check_sentinels(workspace_dir) + + if sentinels_present and last_change_time is not None: + stable_duration = time.monotonic() - last_change_time + if stable_duration >= _MIN_STABLE_SECONDS: + logger.info( + "Output appears complete: %d files, stable for %.0fs, " + "all sentinels present.", + current_file_count, + stable_duration, + ) + return True + + # nosemgrep: arbitrary-sleep - Polling IDE for stable output state + time.sleep(_POLL_INTERVAL_SECONDS) + + @staticmethod + def _normalize( + workspace_dir: Path, + output_dir: Path, + elapsed: float, + ) -> Path: + """Delegate to the shared normalizer.""" + return normalize_output( + source_dir=workspace_dir, + output_dir=output_dir, + adapter_name="windsurf", + model_hint="ide:windsurf-cascade", + elapsed_seconds=elapsed, + ) + + @staticmethod + def _build_extra( + proc: subprocess.Popen, + *, + completed: bool, + partial_files: int = 0, + ) -> dict: + """Build the ``extra`` dict for :class:`AdapterResult`.""" + extra: dict = { + "automation_mode": "semi-auto", + "windsurf_pid": proc.pid, + "completed_naturally": completed, + } + if not completed: + extra["partial_aidlc_files"] = partial_files + return extra + + +# ---------------------------------------------------------------------- # +# Module-level helpers +# ---------------------------------------------------------------------- # + + +def _build_agents_md(config: AdapterConfig) -> str: + """Build the ``AGENTS.md`` content for Cascade. + + Cascade automatically reads ``AGENTS.md`` from the workspace root to + understand directory-scoped instructions. We use it to inject the + AIDLC rules and context so Cascade is primed to follow the process + when the operator triggers the workflow. + """ + # Read AIDLC rules content. + if config.rules_path.is_dir(): + # Concatenate all markdown files in the rules directory. 
+ rule_files = sorted(config.rules_path.rglob("*.md")) + rules_text = "\n\n---\n\n".join( + f.read_text(encoding="utf-8") for f in rule_files + ) + else: + rules_text = config.rules_path.read_text(encoding="utf-8") + + return f"""\ +# AGENTS.md — AIDLC Evaluation Workspace + +This workspace is configured for an AIDLC (AI Development Life Cycle) evaluation +run. Cascade should follow the AIDLC process precisely when prompted. + +## Workspace Structure + +- `vision.md` — The application vision document. +- `tech-env.md` — Technical environment specification. +- `aidlc-rules/` — Complete AIDLC rules (inception + construction phases). +- `.windsurf/workflows/aidlc-eval.md` — Cascade Workflow for the evaluation. +- `INSTRUCTIONS.md` — Human-readable instructions for the operator. + +## AIDLC Rules Summary + +The complete AIDLC rules are in the `aidlc-rules/` directory. Key principles: + +1. Follow the **Inception** phase first (requirements, plans, application design). +2. Then follow the **Construction** phase (build plans, code generation, tests). +3. All documentation goes in `aidlc-docs/` with the prescribed directory structure. +4. Maintain `aidlc-docs/aidlc-state.md` and `aidlc-docs/audit.md` throughout. +5. Generate complete, working code with full test coverage. +6. Do not skip phases or documents. + +## Rules Content + +{rules_text} +""" + + +def _build_cascade_workflow(config: AdapterConfig) -> str: + """Build the Cascade Workflow definition. + + Cascade Workflows are markdown files in ``.windsurf/workflows/`` that + define reusable, triggerable prompt sequences. They are invoked via + slash commands in the Cascade chat panel (e.g., ``/aidlc-eval``). + + The workflow name is derived from the filename (``aidlc-eval.md`` becomes + ``/aidlc-eval``). + """ + # Use custom prompt if provided, otherwise the standard AIDLC prompt. 
+ if config.prompt_template: + prompt_body = config.prompt_template + else: + prompt_body = render_prompt( + vision_path="vision.md", + tech_env_path="tech-env.md", + ) + + return f"""\ +--- +name: AIDLC Evaluation +description: > + Execute the full AIDLC (AI Development Life Cycle) process — inception + through construction — generating all required documents and application + source code. +tags: + - aidlc + - evaluation +--- + +# AIDLC Evaluation Workflow + +This workflow executes the complete AIDLC process for the application described +in `vision.md` and `tech-env.md`. + +## Steps + +1. Read the workspace context: `AGENTS.md`, `vision.md`, `tech-env.md`, and + all files in `aidlc-rules/`. +2. Execute the AIDLC process as described below. +3. Generate all required documents in `aidlc-docs/`. +4. Generate the application source code and tests in the project root. +5. Ensure all tests pass. + +## Prompt + +{prompt_body} +""" + + +def _build_instructions_md() -> str: + """Build the ``INSTRUCTIONS.md`` for the human operator.""" + return """\ +# AIDLC Evaluation — Windsurf Instructions + +This workspace has been prepared for an AIDLC evaluation run. Follow these +steps to execute the evaluation: + +## Quick Start + +1. **Windsurf should already be open** with this workspace loaded. +2. Open the **Cascade** chat panel: + - macOS: `Cmd + Shift + L` + - Windows/Linux: `Ctrl + Shift + L` +3. In the chat input, type: + ``` + /aidlc-eval + ``` +4. Press **Enter** to trigger the AIDLC evaluation workflow. +5. Cascade will begin executing the full AIDLC process. This may take + 30-120 minutes depending on the project complexity. +6. **Do not close Windsurf** until Cascade finishes. The harness is + monitoring this workspace for output. + +## What Happens Next + +- Cascade will read `vision.md`, `tech-env.md`, and `aidlc-rules/`. +- It will create `aidlc-docs/` with all inception and construction documents. 
+- It will generate application source code and tests in the project root. +- The evaluation harness is polling this directory for output and will + detect completion automatically. + +## Troubleshooting + +- If `/aidlc-eval` is not recognized, Cascade may need a moment to index + the `.windsurf/workflows/` directory. Close and reopen the chat panel, + then try again. +- If Cascade stalls, you can paste the prompt from `AGENTS.md` directly + into the chat as a fallback. +- Check the evaluation harness terminal for progress logs. + +## Files in This Workspace + +| File | Purpose | +|-------------------------------------------|------------------------------------| +| `vision.md` | Application vision document | +| `tech-env.md` | Technical environment spec | +| `aidlc-rules/` | AIDLC process rules | +| `AGENTS.md` | Cascade directory-scoped rules | +| `.windsurf/workflows/aidlc-eval.md` | Cascade Workflow definition | +| `INSTRUCTIONS.md` | This file | +""" + + +def _check_sentinels(workspace_dir: Path) -> bool: + """Check whether key AIDLC output sentinel files/directories exist. + + We require at least one file from each major section (inception, + construction, tracking) to consider the output "complete." + """ + # At least one inception sentinel must exist. + inception_ok = any( + (workspace_dir / s).exists() for s in _INCEPTION_SENTINELS + ) + # At least one construction sentinel must exist. + construction_ok = any( + (workspace_dir / s).exists() for s in _CONSTRUCTION_SENTINELS + ) + # Both tracking files should exist. 
+ tracking_ok = all( + (workspace_dir / s).exists() for s in _TRACKING_SENTINELS + ) + + return inception_ok and construction_ok and tracking_ok + + +def _count_aidlc_files(workspace_dir: Path) -> int: + """Count the number of files in aidlc-docs/ (if it exists).""" + docs_dir = workspace_dir / _AIDLC_DOCS_DIR + if not docs_dir.is_dir(): + return 0 + return sum(1 for f in docs_dir.rglob("*") if f.is_file()) + + +def _aidlc_docs_if_exists(output_dir: Path) -> Path | None: + """Return the aidlc-docs path if it was produced, else ``None``.""" + docs = output_dir / "aidlc-docs" + return docs if docs.is_dir() else None diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/normalizer.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/normalizer.py new file mode 100644 index 00000000..ac1b4594 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/normalizer.py @@ -0,0 +1,167 @@ +"""Output normalization — map IDE workspace output to evaluation-compatible layout.""" + +from __future__ import annotations + +import shutil +from datetime import UTC, datetime +from pathlib import Path + +import yaml + + +def normalize_output( + source_dir: Path, + output_dir: Path, + adapter_name: str, + model_hint: str = "", + elapsed_seconds: float = 0.0, +) -> Path: + """Normalize IDE output into the run folder layout expected by run_evaluation.py. + + The expected layout is: + <output_dir>/ + run-meta.yaml + run-metrics.yaml + aidlc-docs/ + workspace/ + + Args: + source_dir: The IDE's workspace directory containing generated files. + output_dir: Where to write the normalized output. + adapter_name: Name of the IDE adapter (e.g., "cursor"). + model_hint: Optional model identifier for run-meta. + elapsed_seconds: Wall clock time for the run. + + Returns: + Path to the output_dir. 
+ """ + output_dir.mkdir(parents=True, exist_ok=True) + + # Copy aidlc-docs if present in source + src_docs = source_dir / "aidlc-docs" + dst_docs = output_dir / "aidlc-docs" + if src_docs.is_dir(): + if dst_docs.exists(): + shutil.rmtree(dst_docs) + shutil.copytree(src_docs, dst_docs) + + # Copy workspace — everything except aidlc-docs and aidlc-rules + dst_workspace = output_dir / "workspace" + dst_workspace.mkdir(exist_ok=True) + skip = {"aidlc-docs", "aidlc-rules", ".git", ".venv", "node_modules", "__pycache__"} + for item in source_dir.iterdir(): + if item.name in skip: + continue + dst = dst_workspace / item.name + if item.is_dir(): + if dst.exists(): + shutil.rmtree(dst) + shutil.copytree(item, dst) + else: + shutil.copy2(item, dst) + + # Generate run-meta.yaml + now = datetime.now(UTC).isoformat(timespec="seconds") + meta = { + "run_folder": str(output_dir), + "started_at": now, + "completed_at": now, + "status": "completed", + "execution_time_ms": int(elapsed_seconds * 1000), + "total_handoffs": 0, + "node_history": [], + "config": { + "executor_model": model_hint or f"ide:{adapter_name}", + "simulator_model": "human", + "aws_region": "", + }, + } + meta_path = output_dir / "run-meta.yaml" + with open(meta_path, "w", encoding="utf-8") as f: + yaml.safe_dump(meta, f, default_flow_style=False, sort_keys=False) + + # Generate minimal run-metrics.yaml + metrics = { + "tokens": { + "total": { + "input_tokens": 0, + "output_tokens": 0, + "total_tokens": 0, + }, + }, + "timing": { + "total_wall_clock_ms": int(elapsed_seconds * 1000), + "handoffs": [], + }, + "artifacts": { + "workspace": _count_workspace_files(dst_workspace), + "aidlc_docs": _count_doc_files(dst_docs) if dst_docs.is_dir() else {}, + }, + "errors": {}, + } + metrics_path = output_dir / "run-metrics.yaml" + with open(metrics_path, "w", encoding="utf-8") as f: + yaml.safe_dump(metrics, f, default_flow_style=False, sort_keys=False) + + return output_dir + + +def _count_workspace_files(workspace: 
Path) -> dict: + """Count files in the workspace by category.""" + if not workspace.is_dir(): + return {} + + source_exts = {".py", ".js", ".ts", ".jsx", ".tsx", ".rs", ".go", ".java"} + test_patterns = {"test_", "_test.", ".test.", ".spec."} + config_exts = {".yaml", ".yml", ".json", ".toml", ".cfg", ".ini"} + + source = test = config = other = 0 + total_lines = 0 + + for f in workspace.rglob("*"): + if not f.is_file(): + continue + ext = f.suffix.lower() + name = f.name.lower() + + is_test = any(p in name for p in test_patterns) + if is_test and ext in source_exts: + test += 1 + elif ext in source_exts: + source += 1 + try: + total_lines += len(f.read_text(errors="replace").splitlines()) + except OSError: + pass + elif ext in config_exts: + config += 1 + else: + other += 1 + + return { + "source_files": source, + "test_files": test, + "config_files": config, + "other_files": other, + "total_files": source + test + config + other, + "total_lines_of_code": total_lines, + } + + +def _count_doc_files(docs_dir: Path) -> dict: + """Count AIDLC doc files by phase.""" + inception = construction = other = 0 + for f in docs_dir.rglob("*.md"): + rel = str(f.relative_to(docs_dir)) + if rel.startswith("inception"): + inception += 1 + elif rel.startswith("construction"): + construction += 1 + else: + other += 1 + return { + "inception_files": inception, + "construction_files": construction, + "other_files": other, + "total_files": inception + construction + other, + } diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/orchestrator.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/orchestrator.py new file mode 100644 index 00000000..24ad9732 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/orchestrator.py @@ -0,0 +1,117 @@ +"""Run orchestration — invoke an IDE adapter then run the evaluation pipeline.""" + +from __future__ import annotations + +import shutil +import subprocess +import sys +from pathlib 
import Path + +import yaml + +from ide_harness.adapter import AdapterConfig, AdapterResult, IDEAdapter +from ide_harness.normalizer import normalize_output + + +REPO_ROOT = Path(__file__).resolve().parents[4] # packages/ide-harness/src/ide_harness -> repo root + + +def run_ide_evaluation( + adapter: IDEAdapter, + vision_path: Path, + output_dir: Path, + golden_docs: Path, + rules_path: Path, + tech_env_path: Path | None = None, + openapi_path: Path | None = None, + baseline_path: Path | None = None, + profile: str | None = None, + region: str | None = None, + scorer_model: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0", + report_format: str = "both", + prompt_template: str | None = None, + timeout_seconds: int = 7200, + use_sandbox: bool = True, +) -> tuple[AdapterResult, int]: + """Run the full IDE evaluation pipeline. + + Steps: + 1. Check adapter prerequisites + 2. Run the adapter to generate AIDLC outputs + 3. Normalize the output to the expected run folder layout + 4. Invoke run_evaluation.py --evaluate-only to score the output + + Returns: + (adapter_result, eval_exit_code) + """ + # 1. Check prerequisites + ok, msg = adapter.check_prerequisites() + if not ok: + print(f"[ERROR] {adapter.name} prerequisites not met: {msg}", file=sys.stderr) + return AdapterResult( + success=False, + output_dir=output_dir, + error=f"Prerequisites not met: {msg}", + ), 1 + + print(f"[OK] {adapter.name} prerequisites met: {msg}") + + # 2. Run the adapter + config = AdapterConfig( + vision_path=vision_path, + tech_env_path=tech_env_path, + rules_path=rules_path, + output_dir=output_dir, + prompt_template=prompt_template, + timeout_seconds=timeout_seconds, + ) + + print(f"\nRunning {adapter.name} adapter...") + result = adapter.run(config) + + if not result.success: + print(f"[FAILED] {adapter.name}: {result.error}", file=sys.stderr) + return result, 1 + + print(f"[OK] {adapter.name} completed in {result.elapsed_seconds:.0f}s") + + # 3. 
Verify aidlc-docs were produced + aidlc_docs = result.aidlc_docs_dir or output_dir / "aidlc-docs" + if not aidlc_docs.is_dir(): + print(f"[ERROR] No aidlc-docs directory found at {aidlc_docs}", file=sys.stderr) + result.success = False + result.error = "No aidlc-docs produced" + return result, 1 + + doc_files = [f for f in aidlc_docs.rglob("*.md") + if f.name not in ("aidlc-state.md", "audit.md")] + if not doc_files: + print(f"[WARN] aidlc-docs exists but contains no substantive documents") + + # 4. Run evaluation pipeline (stages 2-6) + eval_cmd = [ + sys.executable, str(REPO_ROOT / "run_evaluation.py"), + "--evaluate-only", str(aidlc_docs), + "--golden", str(golden_docs), + "--scorer-model", scorer_model, + "--report-format", report_format, + ] + if profile: + eval_cmd += ["--profile", profile] + if region: + eval_cmd += ["--region", region] + if openapi_path and openapi_path.is_file(): + eval_cmd += ["--openapi", str(openapi_path)] + if baseline_path and baseline_path.is_file(): + eval_cmd += ["--baseline", str(baseline_path)] + if use_sandbox: + eval_cmd.append("--sandbox") + else: + eval_cmd.append("--no-sandbox") + + print(f"\nRunning evaluation pipeline on {adapter.name} output...") + # nosec B603 - Executing trusted framework evaluation script with validated args + # nosemgrep: dangerous-subprocess-use-audit + eval_result = subprocess.run(eval_cmd) + + return result, eval_result.returncode diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py new file mode 100644 index 00000000..8cd7aa57 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/prompt_template.py @@ -0,0 +1,58 @@ +"""Standard AIDLC prompt template for IDE AI assistants.""" + +AIDLC_PROMPT = """\ +You are tasked with building an application following the AIDLC (AI Development \ +Life Cycle) process. 
The AIDLC rules are provided in the `aidlc-rules/` directory. + +Please read the vision document at `vision.md` and the technical environment \ +specification at `tech-env.md`, then follow the complete AIDLC process: + +## INCEPTION PHASE + +1. Read the AIDLC rules for the inception phase from `aidlc-rules/` +2. Create requirements analysis: + - `aidlc-docs/inception/requirements/requirements.md` + - `aidlc-docs/inception/requirements/requirement-verification-questions.md` +3. Create plans: + - `aidlc-docs/inception/plans/application-design-plan.md` + - `aidlc-docs/inception/plans/execution-plan.md` +4. Create application design: + - `aidlc-docs/inception/application-design/components.md` + - `aidlc-docs/inception/application-design/component-methods.md` + - `aidlc-docs/inception/application-design/component-dependency.md` + - `aidlc-docs/inception/application-design/services.md` + +## CONSTRUCTION PHASE + +1. Read the AIDLC rules for the construction phase +2. Create build plans and test instructions: + - `aidlc-docs/construction/plans/` (code generation plan) + - `aidlc-docs/construction/build-and-test/build-instructions.md` + - `aidlc-docs/construction/build-and-test/unit-test-instructions.md` + - `aidlc-docs/construction/build-and-test/integration-test-instructions.md` + - `aidlc-docs/construction/build-and-test/build-and-test-summary.md` +3. Generate the application source code and tests in the project root +4. Ensure all unit tests pass + +## TRACKING + +- Create and maintain `aidlc-docs/aidlc-state.md` tracking progress through \ +each phase +- Create and maintain `aidlc-docs/audit.md` with an audit trail of actions taken + +Follow every AIDLC rule precisely. Do not skip phases or documents. Generate \ +complete, working code with full test coverage. +""" + + +def render_prompt(vision_path: str = "vision.md", tech_env_path: str = "tech-env.md") -> str: + r"""Render the AIDLC prompt with customized file paths. 
+ + Only replaces backtick-delimited references (``\`vision.md\```) so that + prose mentions are left intact. + """ + return ( + AIDLC_PROMPT + .replace("`vision.md`", f"`{vision_path}`") + .replace("`tech-env.md`", f"`{tech_env_path}`") + ) diff --git a/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/registry.py b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/registry.py new file mode 100644 index 00000000..1f0f6b2d --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/src/ide_harness/registry.py @@ -0,0 +1,43 @@ +"""Adapter registry — discover and instantiate IDE adapters by name.""" + +from __future__ import annotations + +from ide_harness.adapter import IDEAdapter + + +# Lazy imports to avoid pulling in adapter-specific deps at import time +_ADAPTER_MAP: dict[str, str] = { + "kiro": "ide_harness.adapters.kiro.KiroAdapter", + "cursor": "ide_harness.adapters.cursor.CursorAdapter", + "cline": "ide_harness.adapters.cline.ClineAdapter", + "copilot": "ide_harness.adapters.copilot.CopilotAdapter", + "windsurf": "ide_harness.adapters.windsurf.WindsurfAdapter", + "antigravity": "ide_harness.adapters.antigravity.AntigravityAdapter", +} + + +def list_adapters() -> list[str]: + """Return sorted list of registered adapter names.""" + return sorted(_ADAPTER_MAP.keys()) + + +def get_adapter(name: str) -> IDEAdapter: + """Instantiate an adapter by name. + + Raises KeyError if the adapter is not registered. + Raises ImportError if the adapter module cannot be loaded. + """ + key = name.lower().strip() + if key not in _ADAPTER_MAP: + raise KeyError( + f"Unknown adapter '{name}'. 
Available: {', '.join(list_adapters())}" + ) + + fqn = _ADAPTER_MAP[key] + module_path, class_name = fqn.rsplit(".", 1) + + import importlib + # nosemgrep: non-literal-import - module_path validated against _ADAPTER_MAP whitelist + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + return cls() diff --git a/scripts/aidlc-evaluator/packages/ide-harness/tests/test_normalizer.py b/scripts/aidlc-evaluator/packages/ide-harness/tests/test_normalizer.py new file mode 100644 index 00000000..bb761ce8 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/ide-harness/tests/test_normalizer.py @@ -0,0 +1,68 @@ +"""Tests for output normalization.""" + +from pathlib import Path + +import yaml + +from ide_harness.normalizer import normalize_output + + +def test_normalize_creates_run_meta(tmp_path: Path) -> None: + """normalize_output should create run-meta.yaml.""" + source = tmp_path / "source" + source.mkdir() + (source / "aidlc-docs").mkdir() + (source / "aidlc-docs" / "test.md").write_text("# Test") + + output = tmp_path / "output" + normalize_output(source, output, adapter_name="test", elapsed_seconds=120.5) + + meta_path = output / "run-meta.yaml" + assert meta_path.exists() + meta = yaml.safe_load(meta_path.read_text()) + assert meta["status"] == "completed" + assert meta["execution_time_ms"] == 120500 + assert meta["config"]["executor_model"] == "ide:test" + + +def test_normalize_copies_aidlc_docs(tmp_path: Path) -> None: + """normalize_output should copy aidlc-docs to output.""" + source = tmp_path / "source" + (source / "aidlc-docs" / "inception").mkdir(parents=True) + (source / "aidlc-docs" / "inception" / "requirements.md").write_text("# Reqs") + + output = tmp_path / "output" + normalize_output(source, output, adapter_name="test") + + assert (output / "aidlc-docs" / "inception" / "requirements.md").exists() + + +def test_normalize_copies_workspace_files(tmp_path: Path) -> None: + """normalize_output should copy non-aidlc files to 
workspace/.""" + source = tmp_path / "source" + source.mkdir() + (source / "main.py").write_text("print('hello')") + (source / "tests").mkdir() + (source / "tests" / "test_main.py").write_text("def test_it(): pass") + + output = tmp_path / "output" + normalize_output(source, output, adapter_name="test") + + assert (output / "workspace" / "main.py").exists() + assert (output / "workspace" / "tests" / "test_main.py").exists() + + +def test_normalize_creates_metrics(tmp_path: Path) -> None: + """normalize_output should create run-metrics.yaml with file counts.""" + source = tmp_path / "source" + source.mkdir() + (source / "app.py").write_text("x = 1\ny = 2\n") + + output = tmp_path / "output" + normalize_output(source, output, adapter_name="test", elapsed_seconds=60) + + metrics_path = output / "run-metrics.yaml" + assert metrics_path.exists() + metrics = yaml.safe_load(metrics_path.read_text()) + assert metrics["timing"]["total_wall_clock_ms"] == 60000 + assert metrics["artifacts"]["workspace"]["source_files"] == 1 diff --git a/scripts/aidlc-evaluator/packages/nonfunctional/pyproject.toml b/scripts/aidlc-evaluator/packages/nonfunctional/pyproject.toml new file mode 100644 index 00000000..00c4fe8e --- /dev/null +++ b/scripts/aidlc-evaluator/packages/nonfunctional/pyproject.toml @@ -0,0 +1,19 @@ +[project] +name = "aidlc-nonfunctional" +version = "0.1.0" +description = "NFR evaluation — token consumption, execution time, cross-model consistency" +requires-python = ">=3.13" +dependencies = [ + "aidlc-shared", + "pyyaml>=6.0", +] + +[tool.uv.sources] +aidlc-shared = { workspace = true } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/nonfunctional"] diff --git a/scripts/aidlc-evaluator/packages/nonfunctional/src/nonfunctional/__init__.py b/scripts/aidlc-evaluator/packages/nonfunctional/src/nonfunctional/__init__.py new file mode 100644 index 00000000..4bc2e262 --- /dev/null +++ 
b/scripts/aidlc-evaluator/packages/nonfunctional/src/nonfunctional/__init__.py @@ -0,0 +1 @@ +"""NFR evaluation — token consumption, execution time, cross-model consistency.""" diff --git a/scripts/aidlc-evaluator/packages/qualitative/pyproject.toml b/scripts/aidlc-evaluator/packages/qualitative/pyproject.toml new file mode 100644 index 00000000..59694d92 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "aidlc-qualitative" +version = "0.1.0" +description = "Semantic evaluation — score AIDLC document outputs for similarity of intent and design" +requires-python = ">=3.13" +dependencies = [ + "aidlc-shared", + "pyyaml>=6.0", +] + +[project.optional-dependencies] +llm = [ + "boto3>=1.35.0", +] +dev = [ + "pytest>=8.0.0", +] + +[tool.uv.sources] +aidlc-shared = { workspace = true } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/qualitative"] diff --git a/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/__init__.py b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/__init__.py new file mode 100644 index 00000000..45be92a1 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/__init__.py @@ -0,0 +1,18 @@ +"""Semantic evaluation — score AIDLC document outputs for similarity of intent and design. + +Default scorer uses Amazon Bedrock (LlmScorer). The HeuristicScorer is available +for offline/unit test scenarios but does not provide true semantic evaluation. 
+""" + +from qualitative.comparator import compare_runs +from qualitative.models import ComparisonResult, DocumentScore, PhaseScore +from qualitative.scorer import HeuristicScorer, LlmScorer + +__all__ = [ + "compare_runs", + "ComparisonResult", + "DocumentScore", + "PhaseScore", + "LlmScorer", + "HeuristicScorer", +] diff --git a/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/__main__.py b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/__main__.py new file mode 100644 index 00000000..8b7b47bf --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/__main__.py @@ -0,0 +1,118 @@ +"""CLI entry point for qualitative (semantic) evaluation. + +Usage: + python -m qualitative compare \ + --reference test_cases/sci-calc/golden-aidlc-docs \ + --candidate runs/20260213T194046-.../aidlc-docs \ + --profile default \ + --output comparison-results.yaml +""" + +from __future__ import annotations + +import argparse +import io +import sys +from pathlib import Path + +from qualitative.comparator import compare_runs + +if sys.stdout.encoding and sys.stdout.encoding.lower().replace("-", "") != "utf8": + sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") + sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace") + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="qualitative", + description="Semantic evaluation of AIDLC document outputs via Bedrock", + ) + sub = parser.add_subparsers(dest="command") + + compare = sub.add_parser( + "compare", + help="Compare candidate aidlc-docs against a golden reference using Bedrock", + ) + compare.add_argument( + "--reference", required=True, type=Path, + help="Path to reference aidlc-docs directory (golden baseline)", + ) + compare.add_argument( + "--candidate", required=True, type=Path, + help="Path to candidate aidlc-docs directory (run to evaluate)", + ) + compare.add_argument( + "--output", "-o", type=Path, 
default=None, + help="Write results YAML to this path", + ) + compare.add_argument( + "--profile", default=None, + help="AWS profile for Bedrock access (default: from environment)", + ) + compare.add_argument( + "--region", default=None, + help="AWS region (default: from environment)", + ) + compare.add_argument( + "--model-id", default="us.anthropic.claude-sonnet-4-5-20250929-v1:0", + help="Bedrock model ID for semantic scoring", + ) + + args = parser.parse_args() + + if args.command != "compare": + parser.print_help() + sys.exit(1) + + if not args.reference.is_dir(): + print(f"Error: reference path does not exist: {args.reference}", file=sys.stderr) + sys.exit(1) + if not args.candidate.is_dir(): + print(f"Error: candidate path does not exist: {args.candidate}", file=sys.stderr) + sys.exit(1) + + print(f"Reference: {args.reference}") + print(f"Candidate: {args.candidate}") + print(f"Scorer: Bedrock LLM ({args.model_id})") + print(f"Profile: {args.profile or '(from environment)'}") + print(f"Region: {args.region or '(from environment)'}") + print() + + result = compare_runs( + reference_path=args.reference, + candidate_path=args.candidate, + output_path=args.output, + aws_profile=args.profile, + aws_region=args.region, + model_id=args.model_id, + ) + + print() + print("=" * 60) + print(f"Overall Score: {result.overall_score:.4f}") + print("=" * 60) + for ps in result.phase_scores: + print(f"\n Phase: {ps.phase}") + print(f" Intent: {ps.avg_intent:.4f}") + print(f" Design: {ps.avg_design:.4f}") + print(f" Completeness: {ps.avg_completeness:.4f}") + print(f" Overall: {ps.avg_overall:.4f}") + for ds in ps.document_scores: + print(f" {ds.relative_path}: " + f"intent={ds.intent_similarity:.2f} " + f"design={ds.design_similarity:.2f} " + f"complete={ds.completeness:.2f}") + if ds.notes: + print(f" {ds.notes}") + + if result.unmatched_reference: + print(f"\n Unmatched in reference: {result.unmatched_reference}") + if result.unmatched_candidate: + print(f"\n Unmatched in 
candidate: {result.unmatched_candidate}") + + if args.output: + print(f"\nResults written to: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/comparator.py b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/comparator.py new file mode 100644 index 00000000..6e7e1e16 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/comparator.py @@ -0,0 +1,96 @@ +"""Comparison orchestration — load, pair, score, and aggregate results.""" + +from __future__ import annotations + +from collections import defaultdict +from pathlib import Path + +import yaml +from shared.io import atomic_yaml_dump + +from qualitative.document import load_documents, pair_documents +from qualitative.models import ComparisonResult, PhaseScore +from qualitative.scorer import LlmScorer, Scorer + + +def compare_runs( + reference_path: Path, + candidate_path: Path, + scorer: Scorer | None = None, + output_path: Path | None = None, + *, + aws_profile: str | None = None, + aws_region: str | None = None, + model_id: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0", +) -> ComparisonResult: + """Compare AIDLC document outputs between a reference and candidate run. + + Uses Bedrock (LlmScorer) by default for true semantic evaluation. Pass an + explicit scorer to override (e.g. HeuristicScorer for offline/unit tests). + + Args: + reference_path: Path to the reference aidlc-docs directory (golden baseline). + candidate_path: Path to the candidate aidlc-docs directory (run to evaluate). + scorer: Scorer implementation. Defaults to LlmScorer via Bedrock. + output_path: If provided, write results as YAML to this path. + aws_profile: AWS profile for Bedrock access (used when scorer is None). + aws_region: AWS region for Bedrock (used when scorer is None). + model_id: Bedrock model ID for scoring (used when scorer is None). + + Returns: + ComparisonResult with per-document and per-phase scores. 
+ """ + if scorer is None: + scorer = LlmScorer( + model_id=model_id, + region=aws_region, + profile=aws_profile, + ) + + ref_docs = load_documents(reference_path) + cand_docs = load_documents(candidate_path) + + paired, unmatched_ref, unmatched_cand = pair_documents(ref_docs, cand_docs) + + phase_documents: dict[str, list] = defaultdict(list) + for pair in paired: + print(f" Scoring: {pair.relative_path} ({pair.phase})") + doc_score = scorer.score(pair) + phase_documents[pair.phase].append(doc_score) + + # Build phase scores from all phases found in the documents rather + # than a hardcoded list, so new AIDLC phases are not silently dropped. + # Preserve a stable ordering: known phases first, then any extras. + _KNOWN_PHASE_ORDER = ("inception", "construction", "other") + ordered_phases = [p for p in _KNOWN_PHASE_ORDER if p in phase_documents] + ordered_phases += sorted(p for p in phase_documents if p not in _KNOWN_PHASE_ORDER) + + phase_scores = [] + for phase in ordered_phases: + ps = PhaseScore(phase=phase, document_scores=phase_documents[phase]) + phase_scores.append(ps) + + # Store paths relative to cwd so YAML output never leaks absolute paths. 
+ try: + rel_ref = reference_path.resolve().relative_to(Path.cwd().resolve()) + except ValueError: + rel_ref = reference_path + try: + rel_cand = candidate_path.resolve().relative_to(Path.cwd().resolve()) + except ValueError: + rel_cand = candidate_path + + result = ComparisonResult( + reference_path=str(rel_ref), + candidate_path=str(rel_cand), + phase_scores=phase_scores, + unmatched_reference=unmatched_ref, + unmatched_candidate=unmatched_cand, + ) + result.compute_overall() + + if output_path is not None: + output_path.parent.mkdir(parents=True, exist_ok=True) + atomic_yaml_dump(result.to_dict(), output_path) + + return result diff --git a/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/document.py b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/document.py new file mode 100644 index 00000000..e0c43251 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/document.py @@ -0,0 +1,92 @@ +"""Document loading and pairing for AIDLC output comparison.""" + +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +_SKIP_FILES = frozenset({"aidlc-state.md", "audit.md"}) + + +@dataclass +class AidlcDocument: + """A single AIDLC markdown document with its phase and content.""" + + relative_path: str + phase: str + content: str + + +def classify_phase(relative_path: str) -> str: + """Determine the AIDLC phase from a document's relative path. + + Returns 'inception', 'construction', or 'other'. + """ + parts = Path(relative_path).parts + if parts and parts[0] == "inception": + return "inception" + if parts and parts[0] == "construction": + return "construction" + return "other" + + +def load_documents(aidlc_docs_path: Path) -> list[AidlcDocument]: + """Load all markdown documents from an aidlc-docs directory. + + Skips workflow-internal files (aidlc-state.md, audit.md) that track + process state rather than design intent. 
+ """ + if not aidlc_docs_path.is_dir(): + return [] + + docs: list[AidlcDocument] = [] + for md_file in sorted(aidlc_docs_path.rglob("*.md")): + relative = md_file.relative_to(aidlc_docs_path).as_posix() + if md_file.name in _SKIP_FILES: + continue + try: + content = md_file.read_text(encoding="utf-8") + except (OSError, UnicodeDecodeError): + continue + if not content.strip(): + continue + phase = classify_phase(relative) + docs.append(AidlcDocument(relative_path=relative, phase=phase, content=content)) + return docs + + +@dataclass +class DocumentPair: + """A matched pair of reference and candidate documents at the same relative path.""" + + relative_path: str + phase: str + reference: AidlcDocument + candidate: AidlcDocument + + +def pair_documents( + reference_docs: list[AidlcDocument], + candidate_docs: list[AidlcDocument], +) -> tuple[list[DocumentPair], list[str], list[str]]: + """Pair reference and candidate documents by relative path. + + Returns (paired, unmatched_reference_paths, unmatched_candidate_paths). 
+ """ + ref_by_path = {d.relative_path: d for d in reference_docs} + cand_by_path = {d.relative_path: d for d in candidate_docs} + + paired: list[DocumentPair] = [] + for path, ref_doc in ref_by_path.items(): + if path in cand_by_path: + paired.append(DocumentPair( + relative_path=path, + phase=ref_doc.phase, + reference=ref_doc, + candidate=cand_by_path[path], + )) + + unmatched_ref = sorted(set(ref_by_path) - set(cand_by_path)) + unmatched_cand = sorted(set(cand_by_path) - set(ref_by_path)) + + return paired, unmatched_ref, unmatched_cand diff --git a/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/models.py b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/models.py new file mode 100644 index 00000000..77306d89 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/models.py @@ -0,0 +1,98 @@ +"""Data models for qualitative comparison results.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class DocumentScore: + """Similarity scores for a single document pair (reference vs candidate).""" + + relative_path: str + phase: str + intent_similarity: float + design_similarity: float + completeness: float + overall: float = 0.0 + notes: str = "" + + def __post_init__(self) -> None: + if self.overall == 0.0: + self.overall = ( + self.intent_similarity * 0.4 + + self.design_similarity * 0.4 + + self.completeness * 0.2 + ) + + +@dataclass +class PhaseScore: + """Aggregated scores for an AIDLC phase (inception or construction).""" + + phase: str + document_scores: list[DocumentScore] = field(default_factory=list) + avg_intent: float = 0.0 + avg_design: float = 0.0 + avg_completeness: float = 0.0 + avg_overall: float = 0.0 + + def compute_averages(self) -> None: + if not self.document_scores: + return + n = len(self.document_scores) + self.avg_intent = sum(d.intent_similarity for d in self.document_scores) / n + self.avg_design = sum(d.design_similarity for d in 
self.document_scores) / n + self.avg_completeness = sum(d.completeness for d in self.document_scores) / n + self.avg_overall = sum(d.overall for d in self.document_scores) / n + + +@dataclass +class ComparisonResult: + """Full comparison result across all phases and documents.""" + + reference_path: str + candidate_path: str + phase_scores: list[PhaseScore] = field(default_factory=list) + unmatched_reference: list[str] = field(default_factory=list) + unmatched_candidate: list[str] = field(default_factory=list) + overall_score: float = 0.0 + + def compute_overall(self) -> None: + for ps in self.phase_scores: + ps.compute_averages() + scored_phases = [ps for ps in self.phase_scores if ps.document_scores] + if scored_phases: + self.overall_score = sum(ps.avg_overall for ps in scored_phases) / len(scored_phases) + + def to_dict(self) -> dict: + """Serialize to a plain dict suitable for YAML output.""" + self.compute_overall() + return { + "reference_path": self.reference_path, + "candidate_path": self.candidate_path, + "overall_score": round(self.overall_score, 4), + "phases": [ + { + "phase": ps.phase, + "avg_intent": round(ps.avg_intent, 4), + "avg_design": round(ps.avg_design, 4), + "avg_completeness": round(ps.avg_completeness, 4), + "avg_overall": round(ps.avg_overall, 4), + "documents": [ + { + "path": ds.relative_path, + "intent_similarity": round(ds.intent_similarity, 4), + "design_similarity": round(ds.design_similarity, 4), + "completeness": round(ds.completeness, 4), + "overall": round(ds.overall, 4), + "notes": ds.notes, + } + for ds in ps.document_scores + ], + } + for ps in self.phase_scores + ], + "unmatched_reference": self.unmatched_reference, + "unmatched_candidate": self.unmatched_candidate, + } diff --git a/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/scorer.py b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/scorer.py new file mode 100644 index 00000000..9b5cd192 --- /dev/null +++ 
b/scripts/aidlc-evaluator/packages/qualitative/src/qualitative/scorer.py @@ -0,0 +1,235 @@ +"""Scoring implementations for document similarity evaluation. + +Provides a Scorer protocol and two implementations: +- HeuristicScorer: fast, deterministic, no external dependencies +- LlmScorer: uses Bedrock for deeper semantic understanding (requires boto3) +""" + +from __future__ import annotations + +import json +import logging +import re +from collections import Counter +from typing import Protocol + +from qualitative.document import DocumentPair +from qualitative.models import DocumentScore + +_STOPWORDS = frozenset({ + "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", + "of", "with", "by", "from", "is", "are", "was", "were", "be", "been", + "being", "have", "has", "had", "do", "does", "did", "will", "would", + "could", "should", "may", "might", "shall", "can", "this", "that", + "these", "those", "it", "its", "not", "no", "as", "if", "then", + "than", "so", "up", "out", "about", +}) + + +class Scorer(Protocol): + """Protocol for document pair scoring implementations.""" + + def score(self, pair: DocumentPair) -> DocumentScore: ... 
+ + +# --------------------------------------------------------------------------- +# Heuristic scorer — deterministic, no LLM required +# --------------------------------------------------------------------------- + +def _tokenize(text: str) -> list[str]: + """Lowercase tokenization with stopword removal.""" + words = re.findall(r"[a-z][a-z0-9_-]*", text.lower()) + return [w for w in words if w not in _STOPWORDS and len(w) > 1] + + +def _extract_headings(text: str) -> list[str]: + """Extract markdown heading text (any level).""" + return [m.group(1).strip().lower() for m in re.finditer(r"^#+\s+(.+)$", text, re.MULTILINE)] + + +def _extract_identifiers(text: str) -> set[str]: + """Extract likely technical identifiers (CamelCase, snake_case, paths).""" + camel = set(re.findall(r"\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b", text)) + snake = set(re.findall(r"\b[a-z][a-z0-9]*(?:_[a-z0-9]+)+\b", text)) + paths = set(re.findall(r"\b\w+(?:/\w+)+(?:\.\w+)?\b", text)) + return {s.lower() for s in camel | snake | paths} + + +def _cosine_similarity(a: Counter, b: Counter) -> float: + """Cosine similarity between two term-frequency counters.""" + if not a or not b: + return 0.0 + overlap = sum(a[k] * b[k] for k in a if k in b) + mag_a = sum(v * v for v in a.values()) ** 0.5 + mag_b = sum(v * v for v in b.values()) ** 0.5 + if mag_a == 0 or mag_b == 0: + return 0.0 + return overlap / (mag_a * mag_b) + + +def _jaccard_similarity(a: set, b: set) -> float: + """Jaccard similarity between two sets.""" + if not a and not b: + return 1.0 + if not a or not b: + return 0.0 + return len(a & b) / len(a | b) + + +class HeuristicScorer: + """Fast, deterministic scorer using text similarity heuristics. 
+ + Evaluates three dimensions: + - Intent: term-frequency cosine similarity of document body text + - Design: Jaccard similarity of technical identifiers and heading structure + - Completeness: fraction of reference headings present in the candidate + """ + + def score(self, pair: DocumentPair) -> DocumentScore: + ref_tokens = Counter(_tokenize(pair.reference.content)) + cand_tokens = Counter(_tokenize(pair.candidate.content)) + intent = _cosine_similarity(ref_tokens, cand_tokens) + + ref_ids = _extract_identifiers(pair.reference.content) + cand_ids = _extract_identifiers(pair.candidate.content) + ref_headings = set(_extract_headings(pair.reference.content)) + cand_headings = set(_extract_headings(pair.candidate.content)) + id_sim = _jaccard_similarity(ref_ids, cand_ids) + heading_sim = _jaccard_similarity(ref_headings, cand_headings) + design = 0.6 * id_sim + 0.4 * heading_sim + + if ref_headings: + completeness = len(ref_headings & cand_headings) / len(ref_headings) + else: + completeness = 1.0 if not cand_headings else 0.0 + + return DocumentScore( + relative_path=pair.relative_path, + phase=pair.phase, + intent_similarity=round(intent, 4), + design_similarity=round(design, 4), + completeness=round(completeness, 4), + ) + + +# --------------------------------------------------------------------------- +# LLM scorer — requires boto3 and Bedrock access +# --------------------------------------------------------------------------- + +_LLM_PROMPT_TEMPLATE = """\ +You are an expert evaluator comparing two AIDLC (AI-Driven Development Life Cycle) documents. + +The REFERENCE document represents the golden baseline. The CANDIDATE document is from a new run. +Both documents were produced by the same AIDLC phase: {phase}. + +Score the CANDIDATE against the REFERENCE on three dimensions (each 0.0 to 1.0): + +1. **Intent Similarity**: Do both documents capture the same goals, requirements, and purpose? 
+ - 1.0 = identical intent, same requirements and objectives + - 0.5 = partially overlapping intent, some requirements differ + - 0.0 = completely different intent + +2. **Design Similarity**: Are the architectural decisions, component structures, and technical approaches similar? + - 1.0 = same architecture, same components, same patterns + - 0.5 = similar high-level design but different details + - 0.0 = completely different design approach + +3. **Completeness**: Does the candidate cover the same topics and sections as the reference? + - 1.0 = all reference topics fully covered + - 0.5 = major topics covered but some gaps + - 0.0 = most reference topics missing + +Respond with ONLY a JSON object (no markdown fences): +{{"intent_similarity": <float>, "design_similarity": <float>, "completeness": <float>, "notes": "<brief explanation>"}} + +--- REFERENCE DOCUMENT ({doc_path}) --- +{reference_content} + +--- CANDIDATE DOCUMENT ({doc_path}) --- +{candidate_content} +""" + + +logger = logging.getLogger(__name__) + + +class LlmScorer: + """Scorer that uses an LLM via Amazon Bedrock for semantic evaluation. + + Requires boto3 and valid AWS credentials configured for Bedrock access. + + If a single document fails (malformed LLM response, transient Bedrock + error), the scorer falls back to ``HeuristicScorer`` for that document + and continues with the remaining pairs rather than aborting the entire + qualitative evaluation. 
+ """ + + def __init__( + self, + model_id: str = "us.anthropic.claude-sonnet-4-5-20250929-v1:0", + region: str | None = None, + profile: str | None = None, + max_tokens: int = 512, + ) -> None: + import boto3 + from botocore.config import Config as BotoConfig + + session_kwargs: dict = {} + if region: + session_kwargs["region_name"] = region + if profile: + session_kwargs["profile_name"] = profile + session = boto3.Session(**session_kwargs) + client_config = BotoConfig( + read_timeout=300, + connect_timeout=30, + retries={"max_attempts": 10, "mode": "adaptive"}, + ) + self._client = session.client("bedrock-runtime", config=client_config) + self._model_id = model_id + self._max_tokens = max_tokens + self._fallback = HeuristicScorer() + + def score(self, pair: DocumentPair) -> DocumentScore: + try: + return self._score_llm(pair) + except Exception: + logger.warning( + "LLM scoring failed for %s — falling back to heuristic scorer", + pair.relative_path, + exc_info=True, + ) + result = self._fallback.score(pair) + result.notes = f"[fallback: heuristic] {result.notes or ''}".strip() + return result + + def _score_llm(self, pair: DocumentPair) -> DocumentScore: + prompt = _LLM_PROMPT_TEMPLATE.format( + phase=pair.phase, + doc_path=pair.relative_path, + reference_content=pair.reference.content[:15_000], + candidate_content=pair.candidate.content[:15_000], + ) + + response = self._client.converse( + modelId=self._model_id, + messages=[{"role": "user", "content": [{"text": prompt}]}], + inferenceConfig={"maxTokens": self._max_tokens, "temperature": 0.0}, + ) + + body = response["output"]["message"]["content"][0]["text"] + body = body.strip() + if body.startswith("```"): + body = re.sub(r"^```\w*\n?", "", body) + body = re.sub(r"\n?```$", "", body) + + parsed = json.loads(body) + + return DocumentScore( + relative_path=pair.relative_path, + phase=pair.phase, + intent_similarity=float(parsed["intent_similarity"]), + design_similarity=float(parsed["design_similarity"]), + 
completeness=float(parsed["completeness"]), + notes=parsed.get("notes", ""), + ) diff --git a/scripts/aidlc-evaluator/packages/qualitative/tests/test_comparator.py b/scripts/aidlc-evaluator/packages/qualitative/tests/test_comparator.py new file mode 100644 index 00000000..0166e604 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/tests/test_comparator.py @@ -0,0 +1,157 @@ +"""Tests for the comparison orchestrator. + +All tests use HeuristicScorer explicitly so they run offline without Bedrock. +The default scorer in compare_runs() is LlmScorer (Bedrock) — these tests +override that to keep the unit test suite fast and credential-free. +""" + +from __future__ import annotations + +from pathlib import Path + +import yaml + +from qualitative.comparator import compare_runs +from qualitative.models import ComparisonResult +from qualitative.scorer import HeuristicScorer + +_HEURISTIC = HeuristicScorer() + + +def _create_aidlc_docs(base: Path, docs: dict[str, str]) -> Path: + """Helper to create a mock aidlc-docs directory tree.""" + for rel_path, content in docs.items(): + fp = base / rel_path + fp.parent.mkdir(parents=True, exist_ok=True) + fp.write_text(content) + return base + + +class TestCompareRuns: + def test_identical_runs(self, tmp_path: Path): + content = { + "inception/requirements/requirements.md": ( + "# Requirements\n## FR-001: Arithmetic\nThe API shall add numbers.\n" + ), + "construction/plans/plan.md": ( + "# Code Plan\n## Step 1: Setup\nCreate project structure.\n" + ), + } + ref = _create_aidlc_docs(tmp_path / "ref", content) + cand = _create_aidlc_docs(tmp_path / "cand", content) + + result = compare_runs(ref, cand, scorer=_HEURISTIC) + assert isinstance(result, ComparisonResult) + assert result.overall_score > 0.9 + assert len(result.phase_scores) == 2 + assert result.unmatched_reference == [] + assert result.unmatched_candidate == [] + + def test_unmatched_documents_tracked(self, tmp_path: Path): + ref_content = { + 
"inception/requirements/requirements.md": "# Reqs\nContent.\n", + "inception/design/extra.md": "# Extra\nOnly in reference.\n", + } + cand_content = { + "inception/requirements/requirements.md": "# Reqs\nContent.\n", + "inception/design/new-doc.md": "# New\nOnly in candidate.\n", + } + ref = _create_aidlc_docs(tmp_path / "ref", ref_content) + cand = _create_aidlc_docs(tmp_path / "cand", cand_content) + + result = compare_runs(ref, cand, scorer=_HEURISTIC) + assert "inception/design/extra.md" in result.unmatched_reference + assert "inception/design/new-doc.md" in result.unmatched_candidate + + def test_empty_candidate(self, tmp_path: Path): + ref_content = {"inception/reqs.md": "# Reqs\nContent.\n"} + ref = _create_aidlc_docs(tmp_path / "ref", ref_content) + cand = tmp_path / "cand" + cand.mkdir() + + result = compare_runs(ref, cand, scorer=_HEURISTIC) + assert result.overall_score == 0.0 + assert len(result.unmatched_reference) == 1 + + def test_yaml_output(self, tmp_path: Path): + content = {"inception/reqs.md": "# Requirements\nFR-001: Add numbers.\n"} + ref = _create_aidlc_docs(tmp_path / "ref", content) + cand = _create_aidlc_docs(tmp_path / "cand", content) + out = tmp_path / "results" / "comparison.yaml" + + compare_runs(ref, cand, scorer=_HEURISTIC, output_path=out) + + assert out.exists() + with open(out) as f: + data = yaml.safe_load(f) + assert "overall_score" in data + assert "phases" in data + assert len(data["phases"]) > 0 + assert "documents" in data["phases"][0] + + def test_to_dict_structure(self, tmp_path: Path): + content = { + "inception/reqs.md": "# Requirements\nStuff.\n", + "construction/plan.md": "# Plan\nSteps.\n", + } + ref = _create_aidlc_docs(tmp_path / "ref", content) + cand = _create_aidlc_docs(tmp_path / "cand", content) + + result = compare_runs(ref, cand, scorer=_HEURISTIC) + d = result.to_dict() + + assert isinstance(d["overall_score"], float) + assert isinstance(d["phases"], list) + for phase_data in d["phases"]: + assert "phase" 
in phase_data + assert "avg_intent" in phase_data + assert "avg_design" in phase_data + assert "avg_completeness" in phase_data + for doc_data in phase_data["documents"]: + assert "path" in doc_data + assert "intent_similarity" in doc_data + assert "design_similarity" in doc_data + assert "completeness" in doc_data + + def test_phase_ordering(self, tmp_path: Path): + content = { + "construction/plan.md": "# Plan\n", + "inception/reqs.md": "# Reqs\n", + } + ref = _create_aidlc_docs(tmp_path / "ref", content) + cand = _create_aidlc_docs(tmp_path / "cand", content) + + result = compare_runs(ref, cand, scorer=_HEURISTIC) + phases = [ps.phase for ps in result.phase_scores] + assert phases == ["inception", "construction"] + + +class TestCompareRunsWithRealData: + """Integration tests using the golden baseline — HeuristicScorer only (no Bedrock).""" + + def test_self_comparison_golden(self): + golden = Path(__file__).resolve().parents[3] / "test_cases" / "sci-calc" / "golden-aidlc-docs" + if not golden.is_dir(): + return + + result = compare_runs(golden, golden, scorer=_HEURISTIC) + assert result.overall_score > 0.95 + assert result.unmatched_reference == [] + assert result.unmatched_candidate == [] + assert len(result.phase_scores) >= 2 + + def test_cross_run_comparison(self): + golden = Path(__file__).resolve().parents[3] / "test_cases" / "sci-calc" / "golden-aidlc-docs" + run1_docs = ( + Path(__file__).resolve().parents[3] + / "runs" + / "20260213T194046-9412bc326d7f4fd09990b9aafecbf026" + / "aidlc-docs" + ) + if not golden.is_dir() or not run1_docs.is_dir(): + return + + result = compare_runs(golden, run1_docs, scorer=_HEURISTIC) + assert result.overall_score > 0.3 + for ps in result.phase_scores: + assert ps.avg_intent > 0.0 diff --git a/scripts/aidlc-evaluator/packages/qualitative/tests/test_document.py b/scripts/aidlc-evaluator/packages/qualitative/tests/test_document.py new file mode 100644 index 00000000..a12401be --- /dev/null +++ 
b/scripts/aidlc-evaluator/packages/qualitative/tests/test_document.py @@ -0,0 +1,128 @@ +"""Tests for document loading and pairing.""" + +from __future__ import annotations + +from pathlib import Path + +from qualitative.document import ( + AidlcDocument, + classify_phase, + load_documents, + pair_documents, +) + + +class TestClassifyPhase: + def test_inception_path(self): + assert classify_phase("inception/requirements/requirements.md") == "inception" + + def test_construction_path(self): + assert classify_phase("construction/plans/code-gen-plan.md") == "construction" + + def test_root_file(self): + assert classify_phase("some-doc.md") == "other" + + def test_nested_inception(self): + assert classify_phase("inception/application-design/components.md") == "inception" + + +class TestLoadDocuments: + def test_loads_markdown_files(self, tmp_path: Path): + inc = tmp_path / "inception" / "requirements" + inc.mkdir(parents=True) + (inc / "requirements.md").write_text("# Requirements\nFR-001: Do stuff\n") + con = tmp_path / "construction" / "plans" + con.mkdir(parents=True) + (con / "plan.md").write_text("# Code Plan\nStep 1\n") + + docs = load_documents(tmp_path) + assert len(docs) == 2 + paths = {d.relative_path for d in docs} + assert "inception/requirements/requirements.md" in paths + assert "construction/plans/plan.md" in paths + + def test_skips_aidlc_state_and_audit(self, tmp_path: Path): + (tmp_path / "aidlc-state.md").write_text("state tracking") + (tmp_path / "audit.md").write_text("audit log") + (tmp_path / "real-doc.md").write_text("# Real content") + + docs = load_documents(tmp_path) + assert len(docs) == 1 + assert docs[0].relative_path == "real-doc.md" + + def test_skips_empty_files(self, tmp_path: Path): + (tmp_path / "empty.md").write_text("") + (tmp_path / "whitespace.md").write_text(" \n ") + (tmp_path / "real.md").write_text("# Content") + + docs = load_documents(tmp_path) + assert len(docs) == 1 + + def test_nonexistent_directory(self, tmp_path: 
Path): + docs = load_documents(tmp_path / "does-not-exist") + assert docs == [] + + def test_phase_assignment(self, tmp_path: Path): + inc = tmp_path / "inception" + inc.mkdir() + (inc / "reqs.md").write_text("# Reqs") + con = tmp_path / "construction" + con.mkdir() + (con / "plan.md").write_text("# Plan") + (tmp_path / "other.md").write_text("# Other") + + docs = load_documents(tmp_path) + phases = {d.relative_path: d.phase for d in docs} + assert phases["inception/reqs.md"] == "inception" + assert phases["construction/plan.md"] == "construction" + assert phases["other.md"] == "other" + + +class TestPairDocuments: + def _make_doc(self, path: str, content: str = "content") -> AidlcDocument: + return AidlcDocument(relative_path=path, phase=classify_phase(path), content=content) + + def test_perfect_match(self): + ref = [self._make_doc("inception/reqs.md"), self._make_doc("construction/plan.md")] + cand = [self._make_doc("inception/reqs.md"), self._make_doc("construction/plan.md")] + paired, unmatched_ref, unmatched_cand = pair_documents(ref, cand) + assert len(paired) == 2 + assert unmatched_ref == [] + assert unmatched_cand == [] + + def test_unmatched_reference(self): + ref = [self._make_doc("inception/reqs.md"), self._make_doc("inception/extra.md")] + cand = [self._make_doc("inception/reqs.md")] + paired, unmatched_ref, unmatched_cand = pair_documents(ref, cand) + assert len(paired) == 1 + assert unmatched_ref == ["inception/extra.md"] + assert unmatched_cand == [] + + def test_unmatched_candidate(self): + ref = [self._make_doc("inception/reqs.md")] + cand = [self._make_doc("inception/reqs.md"), self._make_doc("inception/new.md")] + paired, unmatched_ref, unmatched_cand = pair_documents(ref, cand) + assert len(paired) == 1 + assert unmatched_ref == [] + assert unmatched_cand == ["inception/new.md"] + + def test_no_overlap(self): + ref = [self._make_doc("inception/a.md")] + cand = [self._make_doc("inception/b.md")] + paired, unmatched_ref, unmatched_cand = 
pair_documents(ref, cand) + assert len(paired) == 0 + assert unmatched_ref == ["inception/a.md"] + assert unmatched_cand == ["inception/b.md"] + + def test_empty_inputs(self): + paired, unmatched_ref, unmatched_cand = pair_documents([], []) + assert paired == [] + assert unmatched_ref == [] + assert unmatched_cand == [] + + def test_pair_preserves_content(self): + ref = [self._make_doc("inception/reqs.md", "reference content")] + cand = [self._make_doc("inception/reqs.md", "candidate content")] + paired, _, _ = pair_documents(ref, cand) + assert paired[0].reference.content == "reference content" + assert paired[0].candidate.content == "candidate content" diff --git a/scripts/aidlc-evaluator/packages/qualitative/tests/test_scorer.py b/scripts/aidlc-evaluator/packages/qualitative/tests/test_scorer.py new file mode 100644 index 00000000..8a0dc30b --- /dev/null +++ b/scripts/aidlc-evaluator/packages/qualitative/tests/test_scorer.py @@ -0,0 +1,169 @@ +"""Tests for the heuristic scorer.""" + +from __future__ import annotations + +from qualitative.document import AidlcDocument, DocumentPair +from qualitative.scorer import ( + HeuristicScorer, + _cosine_similarity, + _extract_headings, + _extract_identifiers, + _jaccard_similarity, + _tokenize, +) +from collections import Counter + + +class TestTokenize: + def test_basic_tokenization(self): + tokens = _tokenize("The API shall provide arithmetic operations") + assert "api" in tokens + assert "arithmetic" in tokens + assert "operations" in tokens + assert "the" not in tokens + assert "shall" not in tokens + + def test_removes_stopwords(self): + tokens = _tokenize("a the and or but in on at to for of with") + assert tokens == [] + + def test_removes_short_tokens(self): + tokens = _tokenize("I a x go API test") + assert "api" in tokens + assert "test" in tokens + assert "go" in tokens + assert "x" not in tokens + + def test_handles_code_identifiers(self): + tokens = _tokenize("math_engine routes/arithmetic pyproject.toml") + 
assert "math_engine" in tokens + assert "arithmetic" in tokens + + +class TestExtractHeadings: + def test_extracts_all_levels(self): + text = "# Title\n## Section\n### Subsection\nBody text" + headings = _extract_headings(text) + assert "title" in headings + assert "section" in headings + assert "subsection" in headings + + def test_no_headings(self): + assert _extract_headings("just body text\nno headings") == [] + + def test_strips_whitespace(self): + headings = _extract_headings("# Spaced Heading \n") + assert headings == ["spaced heading"] + + +class TestExtractIdentifiers: + def test_camel_case(self): + ids = _extract_identifiers("Use the MathEngine and ResponseModel classes") + assert "mathengine" in ids + assert "responsemodel" in ids + + def test_snake_case(self): + ids = _extract_identifiers("call math_engine and run_tests") + assert "math_engine" in ids + assert "run_tests" in ids + + def test_paths(self): + ids = _extract_identifiers("see src/sci_calc/routes/arithmetic.py") + assert any("src" in i and "arithmetic" in i for i in ids) + + +class TestCosineSimilarity: + def test_identical_counters(self): + c = Counter({"api": 3, "math": 2}) + assert _cosine_similarity(c, c) > 0.99 + + def test_disjoint_counters(self): + a = Counter({"api": 1, "math": 1}) + b = Counter({"dog": 1, "cat": 1}) + assert _cosine_similarity(a, b) == 0.0 + + def test_partial_overlap(self): + a = Counter({"api": 2, "math": 1, "test": 1}) + b = Counter({"api": 1, "math": 3, "route": 1}) + sim = _cosine_similarity(a, b) + assert 0.0 < sim < 1.0 + + def test_empty_counter(self): + assert _cosine_similarity(Counter(), Counter({"a": 1})) == 0.0 + + +class TestJaccardSimilarity: + def test_identical_sets(self): + s = {"a", "b", "c"} + assert _jaccard_similarity(s, s) == 1.0 + + def test_disjoint_sets(self): + assert _jaccard_similarity({"a"}, {"b"}) == 0.0 + + def test_both_empty(self): + assert _jaccard_similarity(set(), set()) == 1.0 + + def test_one_empty(self): + assert 
_jaccard_similarity(set(), {"a"}) == 0.0 + + +class TestHeuristicScorer: + def _make_pair(self, ref_content: str, cand_content: str, path: str = "inception/reqs.md"): + return DocumentPair( + relative_path=path, + phase="inception", + reference=AidlcDocument(relative_path=path, phase="inception", content=ref_content), + candidate=AidlcDocument(relative_path=path, phase="inception", content=cand_content), + ) + + def test_identical_documents(self): + content = "# Requirements\n## FR-001: Arithmetic\nThe API shall add numbers.\n" + pair = self._make_pair(content, content) + score = HeuristicScorer().score(pair) + assert score.intent_similarity > 0.95 + assert score.design_similarity > 0.95 + assert score.completeness == 1.0 + assert score.overall > 0.95 + + def test_completely_different_documents(self): + ref = "# Database Schema\n## Tables\nusers, products, orders\n" + cand = "# Network Protocol\n## Packets\nTCP, UDP, ICMP\n" + pair = self._make_pair(ref, cand) + score = HeuristicScorer().score(pair) + assert score.intent_similarity < 0.3 + assert score.completeness < 0.3 + + def test_similar_but_not_identical(self): + ref = ( + "# Requirements\n## FR-001: Arithmetic Operations\n" + "The API shall provide add, subtract, multiply, divide.\n" + "## FR-002: Trigonometry\nThe API shall provide sin, cos, tan.\n" + ) + cand = ( + "# Requirements\n## FR-001: Arithmetic Operations\n" + "The API provides addition, subtraction, multiplication, division.\n" + "## FR-002: Trigonometry\nThe API provides sine, cosine, tangent.\n" + ) + pair = self._make_pair(ref, cand) + score = HeuristicScorer().score(pair) + assert score.intent_similarity > 0.3 + assert score.completeness == 1.0 + + def test_missing_sections_reduces_completeness(self): + ref = "# Requirements\n## Section A\ncontent\n## Section B\ncontent\n## Section C\ncontent\n" + cand = "# Requirements\n## Section A\ncontent\n" + pair = self._make_pair(ref, cand) + score = HeuristicScorer().score(pair) + assert 
score.completeness <= 0.5 + + def test_scores_in_valid_range(self): + pair = self._make_pair("# Doc\nSome content here.\n", "# Doc\nOther content here.\n") + score = HeuristicScorer().score(pair) + for val in [score.intent_similarity, score.design_similarity, score.completeness, score.overall]: + assert 0.0 <= val <= 1.0 + + def test_relative_path_preserved(self): + pair = self._make_pair("content", "content", "construction/plans/plan.md") + score = HeuristicScorer().score(pair) + assert score.relative_path == "construction/plans/plan.md" + assert score.phase == "inception" # phase comes from pair, not path diff --git a/scripts/aidlc-evaluator/packages/quantitative/pyproject.toml b/scripts/aidlc-evaluator/packages/quantitative/pyproject.toml new file mode 100644 index 00000000..5233ce57 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/pyproject.toml @@ -0,0 +1,16 @@ +[project] +name = "aidlc-quantitative" +version = "0.1.0" +description = "Code evaluation — linting, security scanning, and code organization analysis" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", + "defusedxml>=0.7.1", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/quantitative"] diff --git a/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/__init__.py b/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/__init__.py new file mode 100644 index 00000000..9dfde155 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/__init__.py @@ -0,0 +1,23 @@ +"""Code evaluation — linting, security scanning, and code organization analysis. 
def main() -> None:
    """CLI entry point: parse arguments and run the ``analyze`` subcommand.

    Exits with status 1 when no subcommand is given, the workspace directory
    does not exist, or no recognizable project is found in it.
    """
    parser = argparse.ArgumentParser(
        prog="quantitative",
        description="Run lint and security analysis on generated code",
    )
    subcommands = parser.add_subparsers(dest="command")

    analyze_cmd = subcommands.add_parser("analyze", help="Analyze a workspace directory")
    analyze_cmd.add_argument("workspace", type=Path, help="Path to workspace directory")
    analyze_cmd.add_argument(
        "--output", "-o", type=Path, default=None,
        help="Write quality-report.yaml to this path",
    )
    analyze_cmd.add_argument(
        "--pmd-path", type=str, default=None,
        help="Path to PMD executable for duplication analysis (default: search PATH)",
    )

    args = parser.parse_args()

    # Any command other than "analyze" (including none) shows usage and fails.
    if args.command != "analyze":
        parser.print_help()
        sys.exit(1)

    if not args.workspace.is_dir():
        print(f"Error: workspace not found: {args.workspace}", file=sys.stderr)
        sys.exit(1)

    report = scan_workspace(args.workspace, pmd_path=args.pmd_path)
    if report is None:
        print("No recognizable project found in workspace.", file=sys.stderr)
        sys.exit(1)

    print_report(report)
    if args.output is not None:
        write_report(report, args.output)
        print(f"\nResults written to: {args.output}")
def run_ruff(project_root: Path) -> ToolResult:
    """Run the ruff linter on *project_root* and parse its JSON findings.

    Returns an unavailable ``ToolResult`` when ruff cannot be resolved on
    PATH or via ``uv run``; a ``ToolResult`` with ``error="timed out"`` on
    timeout; otherwise one finding per ruff diagnostic.
    """
    version = _tool_version("ruff", cwd=project_root)
    if version is None:
        return ToolResult(tool="ruff", version=None, available=False,
                          error="ruff not found on PATH or via uv")

    prefix = _resolve_cmd("ruff")
    if not prefix:
        return ToolResult(tool="ruff", version=None, available=False,
                          error="ruff not found on PATH or via uv")

    try:
        result = _run_tool(
            prefix + ["check", "--output-format=json", "--no-fix", "."],
            cwd=project_root,
        )
    except subprocess.TimeoutExpired:
        return ToolResult(tool="ruff", version=version, available=True,
                          error="timed out")

    findings: list[LintFinding] = []
    try:
        items = json.loads(result.stdout) if result.stdout.strip() else []
        for item in items:
            # BUG FIX: ruff emits "code": null for syntax errors; calling
            # .startswith() on None raised AttributeError, which is not in
            # the except clause below and aborted the whole scan.  Same
            # null-guard convention as run_eslint's ruleId handling.
            code = item.get("code") or "?"
            sev = "error" if code.startswith("E") else "warning"
            raw_path = item.get("filename", "?")
            try:
                rel_path = str(Path(raw_path).relative_to(project_root))
            except ValueError:
                rel_path = raw_path  # already relative, or outside the root
            location = item.get("location") or {}
            findings.append(LintFinding(
                file=rel_path,
                line=location.get("row", 0),
                column=location.get("column", 0),
                code=code,
                message=item.get("message", ""),
                severity=sev,
            ))
    except (json.JSONDecodeError, KeyError):
        pass  # unparseable output -> report zero findings rather than crash

    return ToolResult(
        tool="ruff", version=version, available=True,
        exit_code=result.returncode, findings=findings,
    )
def run_eslint(project_root: Path) -> ToolResult:
    """Run eslint on *project_root* and parse its JSON findings.

    Falls back to ``npx eslint`` when eslint itself is not on PATH; returns
    an unavailable ``ToolResult`` when neither eslint nor npx is found.
    """
    version = _tool_version("eslint")
    if version is None:
        npx = shutil.which("npx")
        if npx is None:
            return ToolResult(tool="eslint", version=None, available=False,
                              error="eslint/npx not found on PATH")
        cmd = ["npx", "eslint", ".", "--format=json"]
    else:
        cmd = ["eslint", ".", "--format=json"]

    # CONSISTENCY FIX: compute the reported version once.  The original
    # timeout branch returned version=None (on the npx path) while the
    # success branch returned `version or "npx"`.
    reported_version = version or "npx"

    try:
        result = _run_tool(cmd, cwd=project_root)
    except subprocess.TimeoutExpired:
        return ToolResult(tool="eslint", version=reported_version,
                          available=True, error="timed out")

    findings: list[LintFinding] = []
    try:
        items = json.loads(result.stdout) if result.stdout.strip() else []
        for file_result in items:
            for msg in file_result.get("messages", []):
                # eslint severity: 2 = error, 1 = warning.
                sev = "error" if msg.get("severity", 1) == 2 else "warning"
                findings.append(LintFinding(
                    file=file_result.get("filePath", "?"),
                    line=msg.get("line", 0),
                    column=msg.get("column", 0),
                    # ruleId is null for parse errors.
                    code=msg.get("ruleId", "?") or "parse-error",
                    message=msg.get("message", ""),
                    severity=sev,
                ))
    except (json.JSONDecodeError, KeyError):
        pass  # unparseable output -> report zero findings

    return ToolResult(
        tool="eslint", version=reported_version, available=True,
        exit_code=result.returncode, findings=findings,
    )
# Map semgrep's severity labels onto our low/medium/high scale.
_SEMGREP_SEVERITY_MAP = {
    "ERROR": "high",
    "WARNING": "medium",
    "INFO": "low",
}


def run_semgrep(project_root: Path) -> ToolResult:
    """Run the semgrep security scanner and return parsed findings.

    Resolves semgrep on PATH or via ``uv run``; scans with the registry
    "auto" ruleset and converts each JSON result into a SecurityFinding.
    """
    version = _tool_version("semgrep", cwd=project_root)
    if version is None:
        return ToolResult(tool="semgrep", version=None, available=False,
                          error="semgrep not found on PATH or via uv")

    prefix = _resolve_cmd("semgrep")
    if not prefix:
        return ToolResult(tool="semgrep", version=None, available=False,
                          error="semgrep not found on PATH or via uv")

    try:
        result = _run_tool(
            prefix + ["scan", "--config", "auto", "--json", str(project_root)],
            cwd=project_root,
        )
    except subprocess.TimeoutExpired:
        return ToolResult(tool="semgrep", version=version, available=True,
                          error="timed out")

    findings: list[SecurityFinding] = []
    try:
        payload = json.loads(result.stdout) if result.stdout.strip() else {}
        for entry in payload.get("results", []):
            extra = entry.get("extra", {})
            metadata = extra.get("metadata", {})

            severity = _SEMGREP_SEVERITY_MAP.get(
                extra.get("severity", "WARNING"), "medium")

            reported_path = entry.get("path", "?")
            try:
                relative = str(Path(reported_path).relative_to(project_root))
            except ValueError:
                relative = reported_path

            cwes = metadata.get("cwe", [])
            findings.append(SecurityFinding(
                file=relative,
                line=entry.get("start", {}).get("line", 0),
                code=entry.get("check_id", "?"),
                message=extra.get("message", ""),
                severity=severity,
                confidence=metadata.get("confidence", "MEDIUM").lower(),
                cwe=cwes[0] if cwes else None,
            ))
    except (json.JSONDecodeError, KeyError):
        pass

    return ToolResult(
        tool="semgrep", version=version, available=True,
        exit_code=result.returncode, findings=findings,
    )
"--minimum-tokens", str(min_tokens), + "--dir", ".", + "--language", cpd_lang, + "--format", "xml", + "--no-fail-on-violation", + *exclude_args], + cwd=project_root, + ) + except subprocess.TimeoutExpired: + return ToolResult(tool="pmd-cpd", version=None, available=True, + error="timed out") + + findings: list[DuplicationFinding] = [] + try: + if result.stdout.strip(): + root = ET.fromstring(result.stdout) + ns = root.tag.split("}")[0] + "}" if root.tag.startswith("{") else "" + for dup in root.findall(f"{ns}duplication"): + lines = int(dup.get("lines", 0)) + tokens = int(dup.get("tokens", 0)) + files = [] + for f_elem in dup.findall(f"{ns}file"): + raw_path = f_elem.get("path", "?") + try: + rel = str(Path(raw_path).relative_to(project_root)) + except ValueError: + rel = raw_path + files.append({ + "file": rel, + "line": int(f_elem.get("line", 0)), + "endline": int(f_elem.get("endline", 0)), + }) + codefragment_elem = dup.find(f"{ns}codefragment") + codefragment = (codefragment_elem.text or "").strip() if codefragment_elem is not None else "" + findings.append(DuplicationFinding( + files=files, + tokens=tokens, + lines=lines, + codefragment=codefragment[:500], + )) + except ET.ParseError: + pass + + return ToolResult( + tool="pmd-cpd", version=None, available=True, + exit_code=result.returncode, findings=findings, + ) diff --git a/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/models.py b/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/models.py new file mode 100644 index 00000000..fad7adb6 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/models.py @@ -0,0 +1,85 @@ +"""Data models for code quality analysis results.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class LintFinding: + file: str + line: int + column: int + code: str + message: str + severity: str # "error", "warning", "info" + + +@dataclass +class SecurityFinding: + file: str + line: 
int + code: str + message: str + severity: str # "high", "medium", "low" + confidence: str # "high", "medium", "low" + cwe: str | None = None + + +@dataclass +class DuplicationFinding: + files: list[dict] # each: {"file": str, "line": int, "endline": int} + tokens: int = 0 + lines: int = 0 + codefragment: str = "" + + +@dataclass +class ToolResult: + tool: str + version: str | None + available: bool + exit_code: int | None = None + error: str | None = None + findings: list = field(default_factory=list) + + +@dataclass +class QualityReport: + project_type: str + project_root: str + lint: ToolResult | None = None + security: ToolResult | None = None + semgrep: ToolResult | None = None + duplication: ToolResult | None = None + summary: dict = field(default_factory=dict) + + def compute_summary(self) -> None: + s: dict = {} + if self.lint and self.lint.available: + findings = self.lint.findings + s["lint_total"] = len(findings) + s["lint_errors"] = sum(1 for f in findings if f.severity == "error") + s["lint_warnings"] = sum(1 for f in findings if f.severity == "warning") + + sec_findings: list = [] + has_security_tool = False + if self.security and self.security.available: + sec_findings.extend(self.security.findings) + has_security_tool = True + if self.semgrep and self.semgrep.available: + sec_findings.extend(self.semgrep.findings) + has_security_tool = True + if has_security_tool: + s["security_total"] = len(sec_findings) + s["security_high"] = sum(1 for f in sec_findings if f.severity == "high") + s["security_medium"] = sum(1 for f in sec_findings if f.severity == "medium") + s["security_low"] = sum(1 for f in sec_findings if f.severity == "low") + + if self.duplication and self.duplication.available: + dup_findings = self.duplication.findings + s["duplication_blocks"] = len(dup_findings) + s["duplication_lines"] = sum(f.lines for f in dup_findings) + s["duplication_tokens"] = sum(f.tokens for f in dup_findings) + + self.summary = s diff --git 
a/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/scanner.py b/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/scanner.py new file mode 100644 index 00000000..d9613760 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/src/quantitative/scanner.py @@ -0,0 +1,170 @@ +"""Orchestrator — detect project type, run appropriate analyzers, produce report.""" + +from __future__ import annotations + +from dataclasses import asdict +from pathlib import Path + +import yaml + +from quantitative.analyzers import ( + run_bandit, + run_cpd, + run_eslint, + run_npm_audit, + run_ruff, + run_semgrep, +) +from quantitative.models import QualityReport, ToolResult + +_PYTHON_MARKERS = ("pyproject.toml", "setup.py", "setup.cfg", "requirements.txt") +_NODE_MARKERS = ("package.json",) + +_MAX_SEARCH_DEPTH = 3 +_SKIP_DIRS = frozenset({ + ".venv", "venv", ".env", "env", "node_modules", + "__pycache__", ".pytest_cache", ".ruff_cache", ".mypy_cache", + ".git", ".hg", ".svn", "target", "dist", "build", + ".tox", ".nox", ".cache", +}) + + +def _detect_project(workspace: Path) -> tuple[str, Path] | None: + """BFS for a project root with a recognizable marker. + + Returns (project_type, project_root) or None. 
+ """ + if not workspace.is_dir(): + return None + + def _check(d: Path) -> str | None: + for m in _PYTHON_MARKERS: + if (d / m).exists(): + return "python" + for m in _NODE_MARKERS: + if (d / m).exists(): + return "node" + return None + + pt = _check(workspace) + if pt: + return pt, workspace + + queue: list[tuple[Path, int]] = [(workspace, 0)] + while queue: + current, depth = queue.pop(0) + if depth >= _MAX_SEARCH_DEPTH: + continue + try: + children = sorted( + p for p in current.iterdir() + if p.is_dir() and not p.name.startswith(".") and p.name not in _SKIP_DIRS + ) + except OSError: + continue + for child in children: + pt = _check(child) + if pt: + return pt, child + queue.append((child, depth + 1)) + + return None + + +def _run_python_analyzers( + project_root: Path, + pmd_path: str | None = None, +) -> tuple[ToolResult, ToolResult, ToolResult, ToolResult]: + lint = run_ruff(project_root) + security = run_bandit(project_root) + semgrep = run_semgrep(project_root) + duplication = run_cpd(project_root, language="python", pmd_path=pmd_path) + return lint, security, semgrep, duplication + + +def _run_node_analyzers( + project_root: Path, + pmd_path: str | None = None, +) -> tuple[ToolResult, ToolResult, ToolResult, ToolResult]: + lint = run_eslint(project_root) + security = run_npm_audit(project_root) + semgrep = run_semgrep(project_root) + duplication = run_cpd(project_root, language="node", pmd_path=pmd_path) + return lint, security, semgrep, duplication + + +def scan_workspace(workspace: Path, pmd_path: str | None = None) -> QualityReport | None: + """Run lint + security + duplication analysis on a workspace directory. + + Returns a QualityReport, or None if no recognizable project is found. 
+ """ + detection = _detect_project(workspace) + if detection is None: + return None + + project_type, project_root = detection + + if project_type == "python": + lint, security, semgrep, duplication = _run_python_analyzers(project_root, pmd_path=pmd_path) + elif project_type == "node": + lint, security, semgrep, duplication = _run_node_analyzers(project_root, pmd_path=pmd_path) + else: + return None + + report = QualityReport( + project_type=project_type, + project_root=str(project_root.relative_to(workspace)) + if project_root != workspace else ".", + lint=lint, + security=security, + semgrep=semgrep, + duplication=duplication, + ) + report.compute_summary() + return report + + +def write_report(report: QualityReport, output_path: Path) -> None: + """Serialize a QualityReport to YAML.""" + data = asdict(report) + with open(output_path, "w", encoding="utf-8") as f: + yaml.dump(data, f, default_flow_style=False, sort_keys=False) + + +def print_report(report: QualityReport) -> None: + """Print a human-readable summary to stdout.""" + print(f"\nProject type: {report.project_type}") + print(f"Project root: {report.project_root}") + + if report.lint: + _print_tool("Linter", report.lint) + if report.security: + _print_tool("Security (bandit)", report.security) + if report.semgrep: + _print_tool("Security (semgrep)", report.semgrep) + if report.duplication: + _print_tool("Duplication (CPD)", report.duplication) + + if report.summary: + print(f"\nSummary:") + for k, v in report.summary.items(): + print(f" {k}: {v}") + + +def _print_tool(label: str, result: ToolResult) -> None: + if not result.available: + print(f"\n{label} ({result.tool}): NOT AVAILABLE — {result.error}") + return + count = len(result.findings) + status = "clean" if count == 0 else f"{count} finding(s)" + print(f"\n{label} ({result.tool} {result.version}): {status}") + if result.error: + print(f" Error: {result.error}") + for f in result.findings[:20]: + if hasattr(f, "file"): + print(f" {f.file}:{f.line} 
[{f.code}] {f.message}") + elif hasattr(f, "files"): + locs = ", ".join(f"{e['file']}:{e['line']}" for e in f.files) + print(f" {f.lines} lines across {locs}") + if count > 20: + print(f" ... and {count - 20} more") diff --git a/scripts/aidlc-evaluator/packages/quantitative/tests/test_analyzers.py b/scripts/aidlc-evaluator/packages/quantitative/tests/test_analyzers.py new file mode 100644 index 00000000..95de9fdb --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/tests/test_analyzers.py @@ -0,0 +1,170 @@ +"""Tests for analyzer JSON parsers. + +These tests mock subprocess.run to avoid requiring ruff/bandit/eslint +to be installed, and verify the parsing logic handles real tool output. +""" + +import json +from pathlib import Path +from unittest.mock import patch, MagicMock + +from quantitative.analyzers import run_ruff, run_bandit, run_eslint, run_npm_audit + + +def _mock_run(stdout="", stderr="", returncode=0): + result = MagicMock() + result.stdout = stdout + result.stderr = stderr + result.returncode = returncode + return result + + +# --------------------------------------------------------------------------- +# ruff +# --------------------------------------------------------------------------- + +class TestRuff: + def test_not_installed(self): + with patch("quantitative.analyzers.shutil.which", return_value=None): + result = run_ruff(Path(".")) + assert not result.available + assert "ruff not found" in result.error + + def test_clean_output(self): + with ( + patch("quantitative.analyzers.shutil.which", return_value="/usr/bin/ruff"), + patch("quantitative.analyzers._tool_version", return_value="0.8.0"), + patch("quantitative.analyzers._run_tool", return_value=_mock_run(stdout="[]")), + ): + result = run_ruff(Path(".")) + assert result.available + assert len(result.findings) == 0 + + def test_findings_parsed(self): + items = [ + { + "filename": "app.py", + "location": {"row": 10, "column": 5}, + "code": "E501", + "message": "Line too long", + }, + 
{ + "filename": "utils.py", + "location": {"row": 3, "column": 1}, + "code": "W291", + "message": "Trailing whitespace", + }, + ] + with ( + patch("quantitative.analyzers.shutil.which", return_value="/usr/bin/ruff"), + patch("quantitative.analyzers._tool_version", return_value="0.8.0"), + patch("quantitative.analyzers._run_tool", + return_value=_mock_run(stdout=json.dumps(items), returncode=1)), + ): + result = run_ruff(Path(".")) + assert len(result.findings) == 2 + assert result.findings[0].code == "E501" + assert result.findings[0].severity == "error" + assert result.findings[1].code == "W291" + assert result.findings[1].severity == "warning" + + +# --------------------------------------------------------------------------- +# bandit +# --------------------------------------------------------------------------- + +class TestBandit: + def test_not_installed(self): + with patch("quantitative.analyzers.shutil.which", return_value=None): + result = run_bandit(Path(".")) + assert not result.available + + def test_clean_output(self): + with ( + patch("quantitative.analyzers.shutil.which", return_value="/usr/bin/bandit"), + patch("quantitative.analyzers._tool_version", return_value="1.7.0"), + patch("quantitative.analyzers._run_tool", + return_value=_mock_run(stdout=json.dumps({"results": []}))), + ): + result = run_bandit(Path(".")) + assert result.available + assert len(result.findings) == 0 + + def test_findings_parsed(self): + data = { + "results": [ + { + "filename": "app.py", + "line_number": 42, + "test_id": "B608", + "issue_text": "Possible SQL injection", + "issue_severity": "HIGH", + "issue_confidence": "MEDIUM", + "issue_cwe": {"id": 89, "link": "https://cwe.mitre.org/data/definitions/89.html"}, + }, + ] + } + with ( + patch("quantitative.analyzers.shutil.which", return_value="/usr/bin/bandit"), + patch("quantitative.analyzers._tool_version", return_value="1.7.0"), + patch("quantitative.analyzers._run_tool", + return_value=_mock_run(stdout=json.dumps(data), 
returncode=1)), + ): + result = run_bandit(Path(".")) + assert len(result.findings) == 1 + f = result.findings[0] + assert f.code == "B608" + assert f.severity == "high" + assert f.cwe == "CWE-89" + + +# --------------------------------------------------------------------------- +# eslint +# --------------------------------------------------------------------------- + +class TestEslint: + def test_not_installed(self): + with patch("quantitative.analyzers.shutil.which", return_value=None): + result = run_eslint(Path(".")) + assert not result.available + + def test_findings_parsed(self): + items = [ + { + "filePath": "/app/index.js", + "messages": [ + {"severity": 2, "ruleId": "no-unused-vars", "message": "'x' is unused", "line": 5, "column": 1}, + {"severity": 1, "ruleId": "semi", "message": "Missing semicolon", "line": 10, "column": 20}, + ], + } + ] + with ( + patch("quantitative.analyzers.shutil.which", return_value="/usr/bin/eslint"), + patch("quantitative.analyzers._tool_version", return_value="8.0.0"), + patch("quantitative.analyzers._run_tool", + return_value=_mock_run(stdout=json.dumps(items), returncode=1)), + ): + result = run_eslint(Path(".")) + assert len(result.findings) == 2 + assert result.findings[0].severity == "error" + assert result.findings[1].severity == "warning" + + +# --------------------------------------------------------------------------- +# npm audit +# --------------------------------------------------------------------------- + +class TestNpmAudit: + def test_not_installed(self): + with patch("quantitative.analyzers.shutil.which", return_value=None): + result = run_npm_audit(Path(".")) + assert not result.available + + def test_no_lockfile(self, tmp_path): + with ( + patch("quantitative.analyzers.shutil.which", return_value="/usr/bin/npm"), + patch("quantitative.analyzers._tool_version", return_value="10.0.0"), + ): + result = run_npm_audit(tmp_path) + assert result.available + assert result.error == "no package-lock.json found" diff 
--git a/scripts/aidlc-evaluator/packages/quantitative/tests/test_models.py b/scripts/aidlc-evaluator/packages/quantitative/tests/test_models.py new file mode 100644 index 00000000..701f6e4f --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/tests/test_models.py @@ -0,0 +1,64 @@ +"""Tests for quantitative data models.""" + +from quantitative.models import LintFinding, QualityReport, SecurityFinding, ToolResult + + +def test_compute_summary_lint_only(): + report = QualityReport( + project_type="python", + project_root=".", + lint=ToolResult( + tool="ruff", version="0.8.0", available=True, exit_code=1, + findings=[ + LintFinding("a.py", 1, 1, "E501", "line too long", "error"), + LintFinding("b.py", 2, 1, "W291", "trailing whitespace", "warning"), + LintFinding("c.py", 3, 1, "E302", "expected 2 blank lines", "error"), + ], + ), + ) + report.compute_summary() + assert report.summary["lint_total"] == 3 + assert report.summary["lint_errors"] == 2 + assert report.summary["lint_warnings"] == 1 + + +def test_compute_summary_security_only(): + report = QualityReport( + project_type="python", + project_root=".", + security=ToolResult( + tool="bandit", version="1.7.0", available=True, exit_code=1, + findings=[ + SecurityFinding("s.py", 10, "B101", "assert used", "low", "high"), + SecurityFinding("s.py", 20, "B608", "SQL injection", "high", "medium"), + ], + ), + ) + report.compute_summary() + assert report.summary["security_total"] == 2 + assert report.summary["security_high"] == 1 + assert report.summary["security_low"] == 1 + + +def test_compute_summary_both(): + report = QualityReport( + project_type="python", + project_root=".", + lint=ToolResult(tool="ruff", version="0.8.0", available=True, exit_code=0, findings=[]), + security=ToolResult(tool="bandit", version="1.7.0", available=True, exit_code=0, findings=[]), + ) + report.compute_summary() + assert report.summary["lint_total"] == 0 + assert report.summary["lint_errors"] == 0 + assert 
report.summary["security_total"] == 0 + assert report.summary["security_high"] == 0 + + +def test_compute_summary_unavailable_tool(): + report = QualityReport( + project_type="python", + project_root=".", + lint=ToolResult(tool="ruff", version=None, available=False, error="not found"), + ) + report.compute_summary() + assert "lint_total" not in report.summary diff --git a/scripts/aidlc-evaluator/packages/quantitative/tests/test_scanner.py b/scripts/aidlc-evaluator/packages/quantitative/tests/test_scanner.py new file mode 100644 index 00000000..eed01558 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/quantitative/tests/test_scanner.py @@ -0,0 +1,91 @@ +"""Tests for the scanner orchestrator.""" + +from pathlib import Path +from unittest.mock import patch + +from quantitative.models import ToolResult +from quantitative.scanner import scan_workspace, write_report, _detect_project + +import yaml + + +class TestDetectProject: + def test_python_at_root(self, tmp_path): + (tmp_path / "pyproject.toml").write_text("[project]\nname = 'x'\n") + result = _detect_project(tmp_path) + assert result is not None + assert result[0] == "python" + assert result[1] == tmp_path + + def test_python_nested(self, tmp_path): + nested = tmp_path / "app" + nested.mkdir() + (nested / "package.json").write_text("{}") + result = _detect_project(tmp_path) + assert result is not None + assert result[0] == "node" + assert result[1] == nested + + def test_empty_workspace(self, tmp_path): + assert _detect_project(tmp_path) is None + + def test_skips_venv(self, tmp_path): + venv = tmp_path / ".venv" + venv.mkdir() + (venv / "pyproject.toml").write_text("[project]\nname='x'\n") + assert _detect_project(tmp_path) is None + + def test_skips_node_modules(self, tmp_path): + nm = tmp_path / "node_modules" + nm.mkdir() + (nm / "package.json").write_text("{}") + assert _detect_project(tmp_path) is None + + +class TestScanWorkspace: + def test_no_project(self, tmp_path): + assert scan_workspace(tmp_path) 
is None + + def test_python_project(self, tmp_path): + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") + (tmp_path / "src").mkdir() + + mock_lint = ToolResult(tool="ruff", version="0.8.0", available=True, exit_code=0, findings=[]) + mock_sec = ToolResult(tool="bandit", version="1.7.0", available=True, exit_code=0, findings=[]) + + with ( + patch("quantitative.scanner.run_ruff", return_value=mock_lint), + patch("quantitative.scanner.run_bandit", return_value=mock_sec), + ): + report = scan_workspace(tmp_path) + + assert report is not None + assert report.project_type == "python" + assert report.lint.tool == "ruff" + assert report.security.tool == "bandit" + assert report.summary["lint_total"] == 0 + assert report.summary["security_total"] == 0 + + +class TestWriteReport: + def test_roundtrip(self, tmp_path): + mock_lint = ToolResult(tool="ruff", version="0.8.0", available=True, exit_code=0, findings=[]) + mock_sec = ToolResult(tool="bandit", version="1.7.0", available=True, exit_code=0, findings=[]) + + with ( + patch("quantitative.scanner.run_ruff", return_value=mock_lint), + patch("quantitative.scanner.run_bandit", return_value=mock_sec), + ): + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") + report = scan_workspace(tmp_path) + + out = tmp_path / "quality-report.yaml" + write_report(report, out) + + with open(out) as f: + data = yaml.safe_load(f) + + assert data["project_type"] == "python" + assert data["lint"]["tool"] == "ruff" + assert data["security"]["tool"] == "bandit" + assert data["summary"]["lint_total"] == 0 diff --git a/scripts/aidlc-evaluator/packages/reporting/pyproject.toml b/scripts/aidlc-evaluator/packages/reporting/pyproject.toml new file mode 100644 index 00000000..d9514afa --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/pyproject.toml @@ -0,0 +1,15 @@ +[project] +name = "aidlc-reporting" +version = "0.1.0" +description = "Consolidated report generation aggregating all evaluation dimensions" 
+requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/reporting"] diff --git a/scripts/aidlc-evaluator/packages/reporting/src/reporting/__init__.py b/scripts/aidlc-evaluator/packages/reporting/src/reporting/__init__.py new file mode 100644 index 00000000..04286451 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/src/reporting/__init__.py @@ -0,0 +1,42 @@ +"""Consolidated report generation aggregating all evaluation dimensions. + +Usage: + from reporting import collect, write_markdown, write_html + data = collect(Path("runs/20260218T.../")) + write_markdown(data, Path("runs/.../report.md")) + write_html(data, Path("runs/.../report.html")) + +CLI: + python -m reporting generate <run-folder> [--format markdown|html|both] +""" + +from reporting.baseline import ( + BaselineMetrics, + ComparisonResult, + compare, + compare_run_to_baseline, + extract_baseline, + load_baseline, + promote, + write_baseline, +) +from reporting.collector import ReportData, collect +from reporting.render_html import render_html, write_html +from reporting.render_md import render_markdown, write_markdown + +__all__ = [ + "BaselineMetrics", + "ComparisonResult", + "ReportData", + "collect", + "compare", + "compare_run_to_baseline", + "extract_baseline", + "load_baseline", + "promote", + "render_html", + "render_markdown", + "write_baseline", + "write_html", + "write_markdown", +] diff --git a/scripts/aidlc-evaluator/packages/reporting/src/reporting/__main__.py b/scripts/aidlc-evaluator/packages/reporting/src/reporting/__main__.py new file mode 100644 index 00000000..b11a7f61 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/src/reporting/__main__.py @@ -0,0 +1,146 @@ +"""CLI entry point: python -m reporting <command>.""" + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +from 
reporting.baseline import ( + compare_run_to_baseline, + load_baseline, + promote, +) +from reporting.collector import collect +from reporting.render_html import write_html +from reporting.render_md import write_markdown + + +def cmd_generate(args: argparse.Namespace) -> None: + if not args.run_folder.is_dir(): + print(f"Error: run folder not found: {args.run_folder}", file=sys.stderr) + sys.exit(1) + + out_dir = args.output_dir or args.run_folder + out_dir.mkdir(parents=True, exist_ok=True) + + data = collect(args.run_folder) + + if args.baseline and args.baseline.is_file(): + from reporting.baseline import compare, extract_baseline, load_baseline as _lb + current = extract_baseline(data) + golden = _lb(args.baseline) + data.comparison = compare(current, golden) + + if args.format in ("markdown", "both"): + md_path = out_dir / "report.md" + write_markdown(data, md_path) + print(f" Markdown: {md_path}") + + if args.format in ("html", "both"): + html_path = out_dir / "report.html" + write_html(data, html_path) + print(f" HTML: {html_path}") + + +def cmd_promote(args: argparse.Namespace) -> None: + if not args.run_folder.is_dir(): + print(f"Error: run folder not found: {args.run_folder}", file=sys.stderr) + sys.exit(1) + + golden_path = args.output + baseline = promote(args.run_folder, golden_path) + run_name = Path(baseline.run_folder).name + print(f" Promoted: {run_name}") + print(f" Baseline: {golden_path}") + print(f" Tests: {baseline.tests_passed}/{baseline.tests_total}") + print(f" Contract: {baseline.contract_passed}/{baseline.contract_total}") + print(f" Lint: {baseline.lint_total} ({baseline.lint_errors} errors)") + print(f" Quality: {baseline.qualitative_score:.4f}") + + +def cmd_compare(args: argparse.Namespace) -> None: + if not args.run_folder.is_dir(): + print(f"Error: run folder not found: {args.run_folder}", file=sys.stderr) + sys.exit(1) + if not args.baseline.is_file(): + print(f"Error: baseline not found: {args.baseline}", file=sys.stderr) + 
sys.exit(1) + + result = compare_run_to_baseline(args.run_folder, args.baseline) + + golden_name = Path(result.golden_run).name if result.golden_run else "unknown" + print(f" Baseline: {golden_name}") + print(f" Improved: {result.improved}") + print(f" Regressed: {result.regressed}") + print(f" Unchanged: {result.unchanged}") + print() + + for d in result.deltas: + if d.direction == "regressed": + icon = "[-]" + elif d.direction == "improved": + icon = "[+]" + else: + icon = "[ ]" + golden_str = f"{d.golden}" if d.golden is not None else "---" + current_str = f"{d.current}" if d.current is not None else "---" + print(f" {icon} {d.name:<20} {golden_str:>12} -> {current_str:>12} ({d.direction})") + + if result.regressed > 0: + sys.exit(1) + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="reporting", + description="AIDLC evaluation reporting and baseline management", + ) + sub = parser.add_subparsers(dest="command") + + # ── generate ─────────────────────────────────────────────── + gen = sub.add_parser("generate", help="Generate consolidated report") + gen.add_argument("run_folder", type=Path, help="Path to the run folder") + gen.add_argument( + "--format", "-f", choices=["markdown", "html", "both"], default="both", + help="Output format (default: both)", + ) + gen.add_argument( + "--output-dir", "-o", type=Path, default=None, + help="Output directory (default: the run folder)", + ) + gen.add_argument( + "--baseline", "-b", type=Path, default=None, + help="Path to golden.yaml for baseline comparison", + ) + + # ── promote ──────────────────────────────────────────────── + prom = sub.add_parser("promote", help="Promote a run as a golden baseline") + prom.add_argument("run_folder", type=Path, help="Path to the run folder to promote") + prom.add_argument( + "--output", "-o", type=Path, required=True, + help="Where to write golden.yaml", + ) + + # ── compare ──────────────────────────────────────────────── + comp = sub.add_parser("compare", 
help="Compare a run against a golden baseline") + comp.add_argument("run_folder", type=Path, help="Path to the run folder") + comp.add_argument( + "--baseline", "-b", type=Path, required=True, + help="Path to golden.yaml", + ) + + args = parser.parse_args() + if args.command == "generate": + cmd_generate(args) + elif args.command == "promote": + cmd_promote(args) + elif args.command == "compare": + cmd_compare(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/packages/reporting/src/reporting/baseline.py b/scripts/aidlc-evaluator/packages/reporting/src/reporting/baseline.py new file mode 100644 index 00000000..9c8de0fb --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/src/reporting/baseline.py @@ -0,0 +1,453 @@ +"""Golden baseline: promote a run's metrics and compare against them. + +A golden.yaml captures the key numeric metrics from a run so future runs +can be compared for regressions or improvements without re-reading all +the individual YAML artifacts. 
"""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import yaml

from reporting.collector import ReportData, collect


@dataclass
class BaselineMetrics:
    """Flat numeric snapshot of a run's key evaluation metrics.

    Every field is a plain scalar so the whole object round-trips through
    ``write_baseline`` / ``load_baseline`` (a nested golden.yaml layout)
    and can be diffed metric-by-metric in ``compare``.
    """

    # Identity
    run_folder: str = ""
    promoted_at: str = ""
    executor_model: str = ""
    simulator_model: str = ""

    # Execution (aggregate)
    wall_clock_ms: int = 0
    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0
    handoffs: int = 0

    # Execution (per-agent tokens)
    executor_input_tokens: int = 0
    executor_output_tokens: int = 0
    executor_total_tokens: int = 0
    simulator_input_tokens: int = 0
    simulator_output_tokens: int = 0
    simulator_total_tokens: int = 0

    # Execution (repeated context and API totals)
    repeated_context_input_tokens: int = 0
    repeated_context_output_tokens: int = 0
    repeated_context_total_tokens: int = 0
    api_total_input_tokens: int = 0
    api_total_output_tokens: int = 0
    api_total_total_tokens: int = 0

    # Context size
    context_size_max: int = 0
    context_size_avg: int = 0
    context_size_median: int = 0

    # Artifacts
    source_files: int = 0
    test_files: int = 0
    total_files: int = 0
    lines_of_code: int = 0
    doc_files: int = 0

    # Unit tests
    tests_passed: int = 0
    tests_failed: int = 0
    tests_total: int = 0
    tests_pass_pct: float = 0.0
    coverage_pct: float | None = None  # None = coverage not reported for the run

    # Contract tests
    contract_passed: int = 0
    contract_failed: int = 0
    contract_total: int = 0

    # Code quality
    lint_errors: int = 0
    lint_warnings: int = 0
    lint_total: int = 0
    security_total: int = 0
    security_high: int = 0
    duplication_blocks: int = 0

    # Qualitative
    qualitative_score: float = 0.0
    inception_score: float = 0.0
    construction_score: float = 0.0


def extract_baseline(data: ReportData) -> BaselineMetrics:
    """Extract a flat BaselineMetrics from a fully-collected ReportData."""
    # Mandatory sections (meta/metrics) are set in the constructor;
    # promoted_at is stamped now, at promotion time, not at run time.
    b = BaselineMetrics(
        run_folder=data.meta.run_folder,
        promoted_at=datetime.now(UTC).isoformat(timespec="seconds"),
        executor_model=data.meta.executor_model,
        simulator_model=data.meta.simulator_model,
        wall_clock_ms=data.metrics.wall_clock_ms,
        total_tokens=data.metrics.total_tokens.total_tokens,
        input_tokens=data.metrics.total_tokens.input_tokens,
        output_tokens=data.metrics.total_tokens.output_tokens,
        handoffs=data.meta.total_handoffs,
        executor_input_tokens=data.metrics.executor_tokens.input_tokens,
        executor_output_tokens=data.metrics.executor_tokens.output_tokens,
        executor_total_tokens=data.metrics.executor_tokens.total_tokens,
        simulator_input_tokens=data.metrics.simulator_tokens.input_tokens,
        simulator_output_tokens=data.metrics.simulator_tokens.output_tokens,
        simulator_total_tokens=data.metrics.simulator_tokens.total_tokens,
        repeated_context_input_tokens=data.metrics.repeated_context_tokens.input_tokens,
        repeated_context_output_tokens=data.metrics.repeated_context_tokens.output_tokens,
        repeated_context_total_tokens=data.metrics.repeated_context_tokens.total_tokens,
        api_total_input_tokens=data.metrics.api_total_tokens.input_tokens,
        api_total_output_tokens=data.metrics.api_total_tokens.output_tokens,
        api_total_total_tokens=data.metrics.api_total_tokens.total_tokens,
        source_files=data.metrics.artifacts.source_files,
        test_files=data.metrics.artifacts.test_files,
        total_files=data.metrics.artifacts.total_files,
        lines_of_code=data.metrics.artifacts.total_lines_of_code,
        doc_files=data.metrics.artifacts.total_doc_files,
    )

    # Optional sections: absent artifacts simply leave the zero defaults.
    if data.metrics.context_size_total:
        b.context_size_max = data.metrics.context_size_total.max_tokens
        b.context_size_avg = data.metrics.context_size_total.avg_tokens
        b.context_size_median = data.metrics.context_size_total.median_tokens

    if data.tests:
        b.tests_passed = data.tests.passed
        b.tests_failed = data.tests.failed
        b.tests_total = data.tests.total
        b.tests_pass_pct = data.tests.pass_pct
        b.coverage_pct = data.tests.coverage_pct

    if data.contracts:
        b.contract_passed = data.contracts.passed
        b.contract_failed = data.contracts.failed
        b.contract_total = data.contracts.total

    if data.quality:
        b.lint_errors = data.quality.lint_errors
        b.lint_warnings = data.quality.lint_warnings
        b.lint_total = data.quality.lint_total
        b.security_total = data.quality.security_total
        b.security_high = data.quality.security_high
        b.duplication_blocks = data.quality.duplication_blocks

    if data.qualitative:
        b.qualitative_score = data.qualitative.overall_score
        # Only the "inception" and "construction" phases get dedicated
        # fields; other phase names are folded into overall_score only.
        for phase in data.qualitative.phases:
            if phase.phase == "inception":
                b.inception_score = phase.avg_overall
            elif phase.phase == "construction":
                b.construction_score = phase.avg_overall

    return b


def write_baseline(baseline: BaselineMetrics, path: Path) -> None:
    """Write a golden.yaml file.

    The flat BaselineMetrics is re-nested into grouped sections
    (execution / context_size / artifacts / ...) for readability;
    ``load_baseline`` performs the inverse mapping.
    """
    d: dict[str, Any] = {
        "run_folder": baseline.run_folder,
        "promoted_at": baseline.promoted_at,
        "executor_model": baseline.executor_model,
        "simulator_model": baseline.simulator_model,
        "execution": {
            "wall_clock_ms": baseline.wall_clock_ms,
            "total_tokens": baseline.total_tokens,
            "input_tokens": baseline.input_tokens,
            "output_tokens": baseline.output_tokens,
            "handoffs": baseline.handoffs,
            "executor": {
                "input_tokens": baseline.executor_input_tokens,
                "output_tokens": baseline.executor_output_tokens,
                "total_tokens": baseline.executor_total_tokens,
            },
            "simulator": {
                "input_tokens": baseline.simulator_input_tokens,
                "output_tokens": baseline.simulator_output_tokens,
                "total_tokens": baseline.simulator_total_tokens,
            },
            "repeated_context": {
                "input_tokens": baseline.repeated_context_input_tokens,
                "output_tokens": baseline.repeated_context_output_tokens,
                "total_tokens": baseline.repeated_context_total_tokens,
            },
            "api_total": {
                "input_tokens": baseline.api_total_input_tokens,
                "output_tokens": baseline.api_total_output_tokens,
                "total_tokens": baseline.api_total_total_tokens,
            },
        },
        "context_size": {
            "max_tokens": baseline.context_size_max,
            "avg_tokens": baseline.context_size_avg,
            "median_tokens": baseline.context_size_median,
        },
        "artifacts": {
            "source_files": baseline.source_files,
            "test_files": baseline.test_files,
            "total_files": baseline.total_files,
            "lines_of_code": baseline.lines_of_code,
            "doc_files": baseline.doc_files,
        },
        "unit_tests": {
            "passed": baseline.tests_passed,
            "failed": baseline.tests_failed,
            "total": baseline.tests_total,
            "pass_pct": baseline.tests_pass_pct,
            "coverage_pct": baseline.coverage_pct,
        },
        "contract_tests": {
            "passed": baseline.contract_passed,
            "failed": baseline.contract_failed,
            "total": baseline.contract_total,
        },
        "code_quality": {
            "lint_errors": baseline.lint_errors,
            "lint_warnings": baseline.lint_warnings,
            "lint_total": baseline.lint_total,
            "security_total": baseline.security_total,
            "security_high": baseline.security_high,
            "duplication_blocks": baseline.duplication_blocks,
        },
        "qualitative": {
            "overall_score": baseline.qualitative_score,
            "inception_score": baseline.inception_score,
            "construction_score": baseline.construction_score,
        },
    }
    path.parent.mkdir(parents=True, exist_ok=True)
    # sort_keys=False keeps the section order above in the emitted YAML.
    with open(path, "w", encoding="utf-8") as f:
        yaml.safe_dump(d, f, default_flow_style=False, sort_keys=False)


def load_baseline(path: Path) -> BaselineMetrics:
    """Read a golden.yaml into a BaselineMetrics.

    Missing sections/keys fall back to the dataclass defaults, so older
    golden files with fewer metrics still load.
    """
    with open(path, encoding="utf-8") as f:
        d = yaml.safe_load(f) or {}
    ex = d.get("execution", {})
    ex_agent = ex.get("executor", {})
    si_agent = ex.get("simulator", {})
    repeated = ex.get("repeated_context", {})
    api_tot = ex.get("api_total", {})
    ctx = d.get("context_size", {})
    art = d.get("artifacts", {})
    ut = d.get("unit_tests", {})
    ct = d.get("contract_tests", {})
    cq = d.get("code_quality", {})
    ql = d.get("qualitative", {})
    return BaselineMetrics(
        run_folder=d.get("run_folder", ""),
        promoted_at=d.get("promoted_at", ""),
        executor_model=d.get("executor_model", ""),
        simulator_model=d.get("simulator_model", ""),
        wall_clock_ms=ex.get("wall_clock_ms", 0),
        total_tokens=ex.get("total_tokens", 0),
        input_tokens=ex.get("input_tokens", 0),
        output_tokens=ex.get("output_tokens", 0),
        handoffs=ex.get("handoffs", 0),
        executor_input_tokens=ex_agent.get("input_tokens", 0),
        executor_output_tokens=ex_agent.get("output_tokens", 0),
        executor_total_tokens=ex_agent.get("total_tokens", 0),
        simulator_input_tokens=si_agent.get("input_tokens", 0),
        simulator_output_tokens=si_agent.get("output_tokens", 0),
        simulator_total_tokens=si_agent.get("total_tokens", 0),
        repeated_context_input_tokens=repeated.get("input_tokens", 0),
        repeated_context_output_tokens=repeated.get("output_tokens", 0),
        repeated_context_total_tokens=repeated.get("total_tokens", 0),
        api_total_input_tokens=api_tot.get("input_tokens", 0),
        api_total_output_tokens=api_tot.get("output_tokens", 0),
        api_total_total_tokens=api_tot.get("total_tokens", 0),
        context_size_max=ctx.get("max_tokens", 0),
        context_size_avg=ctx.get("avg_tokens", 0),
        context_size_median=ctx.get("median_tokens", 0),
        source_files=art.get("source_files", 0),
        test_files=art.get("test_files", 0),
        total_files=art.get("total_files", 0),
        lines_of_code=art.get("lines_of_code", 0),
        doc_files=art.get("doc_files", 0),
        tests_passed=ut.get("passed", 0),
        tests_failed=ut.get("failed", 0),
        tests_total=ut.get("total", 0),
        tests_pass_pct=ut.get("pass_pct", 0.0),
        coverage_pct=ut.get("coverage_pct"),
        contract_passed=ct.get("passed", 0),
        contract_failed=ct.get("failed", 0),
        contract_total=ct.get("total", 0),
        lint_errors=cq.get("lint_errors", 0),
        lint_warnings=cq.get("lint_warnings", 0),
        lint_total=cq.get("lint_total", 0),
        security_total=cq.get("security_total", 0),
        security_high=cq.get("security_high", 0),
        duplication_blocks=cq.get("duplication_blocks", 0),
        qualitative_score=ql.get("overall_score", 0),
        inception_score=ql.get("inception_score", 0),
        construction_score=ql.get("construction_score", 0),
    )
def promote(run_folder: Path, golden_path: Path) -> BaselineMetrics:
    """Collect a run's data and write it as a golden baseline."""
    data = collect(run_folder)
    baseline = extract_baseline(data)
    write_baseline(baseline, golden_path)
    return baseline


def promote_for_scenario(run_folder: Path, scenario_path: Path) -> BaselineMetrics:
    """Promote a run as the golden baseline for a scenario.

    Writes ``golden.yaml`` into *scenario_path* (the test-case directory).
    If the scenario has a ``scenario.yaml`` manifest, the ``golden_baseline``
    field is used to determine the filename; otherwise defaults to
    ``golden.yaml``.
    """
    golden_name = "golden.yaml"
    manifest = scenario_path / "scenario.yaml"
    if manifest.is_file():
        # Manifest may be empty -> safe_load returns None; treat as {}.
        with open(manifest, encoding="utf-8") as f:
            data = yaml.safe_load(f) or {}
        golden_name = data.get("golden_baseline", golden_name)

    golden_path = scenario_path / golden_name
    return promote(run_folder, golden_path)


# ── Comparison ────────────────────────────────────────────────────────────


@dataclass
class MetricDelta:
    """A single metric compared between current run and golden baseline."""
    name: str
    category: str
    current: float | int | None
    golden: float | int | None
    delta: float | None = None
    pct_change: float | None = None
    direction: str = "unchanged"  # "improved", "regressed", "unchanged", "new"
    higher_is_better: bool = True


@dataclass
class ComparisonResult:
    """Full comparison of a run against a golden baseline."""
    golden_run: str = ""
    golden_promoted_at: str = ""
    current_run: str = ""
    improved: int = 0
    regressed: int = 0
    unchanged: int = 0
    deltas: list[MetricDelta] = field(default_factory=list)
float | int | None, golden: float | int | None, + higher_is_better: bool, tolerance: float = 0.001) -> tuple[str, float | None, float | None]: + """Return (direction, delta, pct_change).""" + if current is None or golden is None: + return ("new" if golden is None else "unchanged"), None, None + delta = float(current) - float(golden) + pct = (delta / float(golden) * 100) if golden != 0 else (100.0 if delta != 0 else 0.0) + if abs(delta) <= tolerance: + return "unchanged", delta, pct + if higher_is_better: + return ("improved" if delta > 0 else "regressed"), delta, pct + else: + return ("improved" if delta < 0 else "regressed"), delta, pct + + +def compare(current: BaselineMetrics, golden: BaselineMetrics) -> ComparisonResult: + """Compare current run metrics against a golden baseline.""" + result = ComparisonResult( + golden_run=golden.run_folder, + golden_promoted_at=golden.promoted_at, + current_run=current.run_folder, + ) + + metrics_spec: list[tuple[str, str, Any, Any, bool]] = [ + # (name, category, current_val, golden_val, higher_is_better) + ("Tests Pass %", "Unit Tests", current.tests_pass_pct, golden.tests_pass_pct, True), + ("Tests Failed", "Unit Tests", current.tests_failed, golden.tests_failed, False), + ("Coverage %", "Unit Tests", current.coverage_pct, golden.coverage_pct, True), + ("Contract Passed", "Contract Tests", current.contract_passed, golden.contract_passed, True), + ("Contract Failed", "Contract Tests", current.contract_failed, golden.contract_failed, False), + ("Contract Total", "Contract Tests", current.contract_total, golden.contract_total, True), + ("Lint Errors", "Code Quality", current.lint_errors, golden.lint_errors, False), + ("Lint Warnings", "Code Quality", current.lint_warnings, golden.lint_warnings, False), + ("Lint Total", "Code Quality", current.lint_total, golden.lint_total, False), + ("Security Findings", "Code Quality", current.security_total, golden.security_total, False), + ("Security High", "Code Quality", 
current.security_high, golden.security_high, False), + ("Duplication Blocks", "Code Quality", current.duplication_blocks, golden.duplication_blocks, False), + ("Qualitative Score", "Qualitative", current.qualitative_score, golden.qualitative_score, True), + ("Inception Score", "Qualitative", current.inception_score, golden.inception_score, True), + ("Construction Score", "Qualitative", current.construction_score, golden.construction_score, True), + ("Source Files", "Artifacts", current.source_files, golden.source_files, True), + ("Test Files", "Artifacts", current.test_files, golden.test_files, True), + ("Lines of Code", "Artifacts", current.lines_of_code, golden.lines_of_code, True), + ("Doc Files", "Artifacts", current.doc_files, golden.doc_files, True), + ("Total Tokens", "Execution", current.total_tokens, golden.total_tokens, False), + ("Executor Input Tokens", "Execution", current.executor_input_tokens, golden.executor_input_tokens, False), + ("Executor Total Tokens", "Execution", current.executor_total_tokens, golden.executor_total_tokens, False), + ("Simulator Input Tokens", "Execution", current.simulator_input_tokens, golden.simulator_input_tokens, False), + ("Simulator Total Tokens", "Execution", current.simulator_total_tokens, golden.simulator_total_tokens, False), + ("Repeated Context Tokens", "Execution", current.repeated_context_total_tokens, golden.repeated_context_total_tokens, False), + ("API Total Tokens", "Execution", current.api_total_total_tokens, golden.api_total_total_tokens, False), + ("Wall Clock (ms)", "Execution", current.wall_clock_ms, golden.wall_clock_ms, False), + ("Handoffs", "Execution", current.handoffs, golden.handoffs, False), + ("Context Size Max", "Context Size", current.context_size_max, golden.context_size_max, False), + ("Context Size Avg", "Context Size", current.context_size_avg, golden.context_size_avg, False), + ("Context Size Median", "Context Size", current.context_size_median, golden.context_size_median, False), + ] + 
+ for name, category, cur, gld, hib in metrics_spec: + direction, delta, pct = _classify(cur, gld, hib) + result.deltas.append(MetricDelta( + name=name, category=category, + current=cur, golden=gld, + delta=delta, pct_change=pct, + direction=direction, higher_is_better=hib, + )) + if direction == "improved": + result.improved += 1 + elif direction == "regressed": + result.regressed += 1 + else: + result.unchanged += 1 + + return result + + +def compare_run_to_baseline(run_folder: Path, golden_path: Path) -> ComparisonResult: + """Convenience: collect a run, load a baseline, and compare.""" + data = collect(run_folder) + current = extract_baseline(data) + golden = load_baseline(golden_path) + return compare(current, golden) + + +def compare_run_for_scenario(run_folder: Path, scenario_path: Path) -> ComparisonResult: + """Compare a run against the golden baseline for a scenario. + + Reads the ``golden_baseline`` filename from ``scenario.yaml`` in + *scenario_path*, falling back to ``golden.yaml``. 
diff --git a/scripts/aidlc-evaluator/packages/reporting/src/reporting/collector.py b/scripts/aidlc-evaluator/packages/reporting/src/reporting/collector.py
new file mode 100644
index 00000000..fefc1e71
--- /dev/null
+++ b/scripts/aidlc-evaluator/packages/reporting/src/reporting/collector.py
"""Collect all run artifacts into a unified report data structure."""

from __future__ import annotations

from dataclasses import dataclass, field
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

import yaml


@dataclass
class RunMeta:
    """Identity and configuration of one run, read from run-meta.yaml."""
    run_folder: str = ""
    started_at: str = ""
    completed_at: str = ""
    status: str = ""
    execution_time_ms: int = 0
    total_handoffs: int = 0
    node_history: list[str] = field(default_factory=list)
    executor_model: str = ""
    simulator_model: str = ""
    aws_region: str = ""
    rules_source: str = ""
    rules_repo: str = ""
    rules_ref: str = ""
    rules_local_path: str = ""
    vision_file: str = ""
    tech_env_file: str = ""


@dataclass
class TokenUsage:
    """Token counts for one accounting bucket (total, per-agent, etc.)."""
    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0


@dataclass
class HandoffTiming:
    """Duration of a single agent handoff, from run-metrics timing data."""
    handoff: int = 0
    node_id: str = ""
    duration_ms: int = 0


@dataclass
class Artifacts:
    """File counts for the workspace and the aidlc_docs tree."""
    source_files: int = 0
    test_files: int = 0
    config_files: int = 0
    total_files: int = 0
    total_lines_of_code: int = 0
    inception_files: int = 0
    construction_files: int = 0
    total_doc_files: int = 0


@dataclass
class ContextSizeStats:
    """Summary statistics over sampled context sizes, in tokens."""
    min_tokens: int = 0
    max_tokens: int = 0
    avg_tokens: int = 0
    median_tokens: int = 0
    sample_count: int = 0


@dataclass
class RunMetrics:
    """Quantitative execution metrics, read from run-metrics.yaml."""
    total_tokens: TokenUsage = field(default_factory=TokenUsage)
    executor_tokens: TokenUsage = field(default_factory=TokenUsage)
    simulator_tokens: TokenUsage = field(default_factory=TokenUsage)
    repeated_context_tokens: TokenUsage = field(default_factory=TokenUsage)
    api_total_tokens: TokenUsage = field(default_factory=TokenUsage)
    wall_clock_ms: int = 0
    handoffs: list[HandoffTiming] = field(default_factory=list)
    artifacts: Artifacts = field(default_factory=Artifacts)
    errors: dict[str, int] = field(default_factory=dict)
    # None when the run predates context-size sampling.
    context_size_total: ContextSizeStats | None = None
    context_size_executor: ContextSizeStats | None = None
    context_size_simulator: ContextSizeStats | None = None


@dataclass
class TestResults:
    """Unit-test outcome, read from test-results.yaml."""
    status: str = ""
    install_ok: bool = False
    test_ok: bool = False
    passed: int = 0
    failed: int = 0
    errors: int = 0
    total: int = 0
    pass_pct: float = 0.0
    coverage_pct: float | None = None  # None when no coverage in output


@dataclass
class LintFinding:
    """One lint diagnostic from quality-report.yaml."""
    file: str = ""
    line: int = 0
    code: str = ""
    message: str = ""
    severity: str = ""


@dataclass
class QualityReport:
    """Aggregated lint / security / semgrep / duplication results."""
    project_type: str = ""
    lint_tool: str = ""
    lint_version: str = ""
    lint_available: bool = False
    lint_findings: list[LintFinding] = field(default_factory=list)
    lint_total: int = 0
    lint_errors: int = 0
    lint_warnings: int = 0
    security_tool: str = ""
    security_available: bool = False
    security_total: int = 0
    security_high: int = 0
    semgrep_tool: str = ""
    semgrep_available: bool = False
    semgrep_total: int = 0
    semgrep_high: int = 0
    duplication_tool: str = ""
    duplication_available: bool = False
    duplication_blocks: int = 0
    duplication_lines: int = 0


@dataclass
class ContractCase:
    """One contract-test case from contract-test-results.yaml."""
    name: str = ""
    path: str = ""
    method: str = ""
    passed: bool = False
    expected_status: int = 0
    actual_status: int | None = None
    failures: list[str] = field(default_factory=list)
    latency_ms: float | None = None
    error: str | None = None


@dataclass
class ContractResults:
    """Summary plus per-case detail of the contract-test run."""
    total: int = 0
    passed: int = 0
    failed: int = 0
    errors: int = 0
    server_started: bool = False
    server_error: str | None = None
    cases: list[ContractCase] = field(default_factory=list)


@dataclass
class DocScore:
    """Similarity scores for a single document comparison."""
    path: str = ""
    intent: float = 0.0
    design: float = 0.0
    completeness: float = 0.0
    overall: float = 0.0
    notes: str = ""


@dataclass
class PhaseScore:
    """Averaged document scores for one lifecycle phase."""
    phase: str = ""
    avg_intent: float = 0.0
    avg_design: float = 0.0
    avg_completeness: float = 0.0
    avg_overall: float = 0.0
    documents: list[DocScore] = field(default_factory=list)


@dataclass
class QualitativeResults:
    """Qualitative doc comparison, read from qualitative-comparison.yaml."""
    overall_score: float = 0.0
    phases: list[PhaseScore] = field(default_factory=list)
    unmatched_reference: list[str] = field(default_factory=list)
    unmatched_candidate: list[str] = field(default_factory=list)


@dataclass
class ReportData:
    """All data needed to render a consolidated report."""
    meta: RunMeta = field(default_factory=RunMeta)
    metrics: RunMetrics = field(default_factory=RunMetrics)
    tests: TestResults | None = None
    quality: QualityReport | None = None
    contracts: ContractResults | None = None
    qualitative: QualitativeResults | None = None
    comparison: Any | None = None  # ComparisonResult when baseline exists
    generated_at: str = ""


def _load_yaml(path: Path) -> dict | None:
    """Load a YAML file, or return None when the artifact is absent."""
    if not path.exists():
        return None
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)


def _parse_coverage(test_output: str) -> float | None:
    """Extract coverage percentage from pytest output.

    Tries an explicit "Total coverage: NN%" line first, then falls back
    to a coverage.py "TOTAL ... NN%" table row; None when neither matches.
    """
    import re
    m = re.search(r"Total coverage:\s*([\d.]+)%", test_output)
    if m:
        return float(m.group(1))
    m = re.search(r"TOTAL\s+\d+\s+\d+\s+(\d+)%", test_output)
    if m:
        return float(m.group(1))
    return None


def _parse_context_stats(d: dict) -> ContextSizeStats:
    """Parse a context_size stats dict from YAML into a ContextSizeStats."""
    return ContextSizeStats(
        min_tokens=d.get("min_tokens", 0),
        max_tokens=d.get("max_tokens", 0),
        avg_tokens=d.get("avg_tokens", 0),
        median_tokens=d.get("median_tokens", 0),
        sample_count=d.get("sample_count", 0),
    )


def collect(run_folder: Path) -> ReportData:
    """Read all YAML artifacts from a run folder into a ReportData.

    Every artifact file is optional: missing files leave the
    corresponding ReportData section at its default (None or empty).
    """
    report = ReportData(generated_at=datetime.now(UTC).isoformat(timespec="seconds"))

    # ── run-meta.yaml ────────────────────────────────────────────
    raw = _load_yaml(run_folder / "run-meta.yaml")
    if raw:
        cfg = raw.get("config", {})
        report.meta = RunMeta(
            run_folder=raw.get("run_folder", str(run_folder)),
            started_at=raw.get("started_at", ""),
            completed_at=raw.get("completed_at", ""),
            status=raw.get("status", ""),
            execution_time_ms=raw.get("execution_time_ms", 0),
            total_handoffs=raw.get("total_handoffs", 0),
            node_history=raw.get("node_history", []),
            executor_model=cfg.get("executor_model", ""),
            simulator_model=cfg.get("simulator_model", ""),
            aws_region=cfg.get("aws_region", ""),
            rules_source=cfg.get("rules_source", ""),
            # "or ''" collapses explicit nulls in the YAML to empty strings.
            rules_repo=cfg.get("rules_repo") or "",
            rules_ref=cfg.get("rules_ref") or "",
            rules_local_path=cfg.get("rules_local_path") or "",
            vision_file=raw.get("vision_file", ""),
            tech_env_file=raw.get("tech_env_file", ""),
        )

    # ── run-metrics.yaml ─────────────────────────────────────────
    raw = _load_yaml(run_folder / "run-metrics.yaml")
    if raw:
        tok = raw.get("tokens", {})
        tot = tok.get("total", {})
        pa = tok.get("per_agent", {})
        ex = pa.get("executor", {})
        si = pa.get("simulator", {})
        repeated = tok.get("repeated_context", {})
        api_tot = tok.get("api_total", {})
        timing = raw.get("timing", {})
        art_ws = raw.get("artifacts", {}).get("workspace", {})
        art_doc = raw.get("artifacts", {}).get("aidlc_docs", {})
        errs = raw.get("errors", {})

        handoffs = []
        for h in timing.get("handoffs", []):
            handoffs.append(HandoffTiming(
                handoff=h.get("handoff", 0),
                node_id=h.get("node_id", ""),
                duration_ms=h.get("duration_ms", 0),
            ))

        report.metrics = RunMetrics(
            total_tokens=TokenUsage(tot.get("input_tokens", 0), tot.get("output_tokens", 0), tot.get("total_tokens", 0)),
            executor_tokens=TokenUsage(ex.get("input_tokens", 0), ex.get("output_tokens", 0), ex.get("total_tokens", 0)),
            simulator_tokens=TokenUsage(si.get("input_tokens", 0), si.get("output_tokens", 0), si.get("total_tokens", 0)),
            repeated_context_tokens=TokenUsage(repeated.get("input_tokens", 0), repeated.get("output_tokens", 0), repeated.get("total_tokens", 0)),
            api_total_tokens=TokenUsage(api_tot.get("input_tokens", 0), api_tot.get("output_tokens", 0), api_tot.get("total_tokens", 0)),
            wall_clock_ms=timing.get("total_wall_clock_ms", 0),
            handoffs=handoffs,
            artifacts=Artifacts(
                source_files=art_ws.get("source_files", 0),
                test_files=art_ws.get("test_files", 0),
                config_files=art_ws.get("config_files", 0),
                total_files=art_ws.get("total_files", 0),
                total_lines_of_code=art_ws.get("total_lines_of_code", 0),
                inception_files=art_doc.get("inception_files", 0),
                construction_files=art_doc.get("construction_files", 0),
                total_doc_files=art_doc.get("total_files", 0),
            ),
            # Keep only integer counters; drop the "details" entry.
            errors={k: v for k, v in errs.items() if k != "details" and isinstance(v, int)},
        )

        # Context size stats (may be absent in older runs)
        ctx = raw.get("context_size", {})
        if ctx:
            report.metrics.context_size_total = _parse_context_stats(ctx.get("total", {}))
            ctx_pa = ctx.get("per_agent", {})
            if "executor" in ctx_pa:
                report.metrics.context_size_executor = _parse_context_stats(ctx_pa["executor"])
            if "simulator" in ctx_pa:
                report.metrics.context_size_simulator = _parse_context_stats(ctx_pa["simulator"])

    # ── test-results.yaml ────────────────────────────────────────
    raw = _load_yaml(run_folder / "test-results.yaml")
    if raw:
        parsed = raw.get("test", {}).get("parsed_results", {})
        test_output = raw.get("test", {}).get("output", "")
        # "or 0" also covers explicit nulls from the YAML.
        _passed = parsed.get("passed") or 0
        _total = parsed.get("total") or 0
        report.tests = TestResults(
            status=raw.get("status", ""),
            install_ok=raw.get("install", {}).get("success", False),
            test_ok=raw.get("test", {}).get("success", False),
            passed=_passed,
            failed=parsed.get("failed") or 0,
            errors=parsed.get("errors") or 0,
            total=_total,
            pass_pct=(_passed / _total * 100) if _total > 0 else 0.0,
            coverage_pct=_parse_coverage(test_output),
        )

    # ── quality-report.yaml ──────────────────────────────────────
    raw = _load_yaml(run_folder / "quality-report.yaml")
    if raw:
        lint = raw.get("lint", {})
        sec = raw.get("security", {})
        sem = raw.get("semgrep", {})
        dup = raw.get("duplication", {})
        summary = raw.get("summary", {})
        findings = []
        for f in lint.get("findings", []):
            findings.append(LintFinding(
                # Store only the basename; full paths stay in the YAML.
                file=Path(f.get("file", "")).name,
                line=f.get("line", 0),
                code=f.get("code", ""),
                message=f.get("message", ""),
                severity=f.get("severity", ""),
            ))
        report.quality = QualityReport(
            project_type=raw.get("project_type", ""),
            lint_tool=lint.get("tool", ""),
            lint_version=lint.get("version") or "",
            lint_available=lint.get("available", False),
            lint_findings=findings,
            lint_total=summary.get("lint_total", 0),
            lint_errors=summary.get("lint_errors", 0),
            lint_warnings=summary.get("lint_warnings", 0),
            security_tool=sec.get("tool", ""),
            security_available=sec.get("available", False),
            security_total=summary.get("security_total", 0),
            security_high=summary.get("security_high", 0),
            semgrep_tool=sem.get("tool", ""),
            semgrep_available=sem.get("available", False),
            # Semgrep totals are derived from the findings list itself.
            semgrep_total=len(sem.get("findings", [])),
            semgrep_high=sum(1 for f in sem.get("findings", []) if f.get("severity") == "high"),
            duplication_tool=dup.get("tool", ""),
            duplication_available=dup.get("available", False),
            duplication_blocks=summary.get("duplication_blocks", 0),
            duplication_lines=summary.get("duplication_lines", 0),
        )

    # ── contract-test-results.yaml ───────────────────────────────
    raw = _load_yaml(run_folder / "contract-test-results.yaml")
    if raw:
        cases = []
        for c in raw.get("cases", []):
            cases.append(ContractCase(
                name=c.get("name", ""),
                path=c.get("path", ""),
                method=c.get("method", ""),
                passed=c.get("passed", False),
                expected_status=c.get("expected_status", 0),
                actual_status=c.get("actual_status"),
                failures=c.get("failures", []),
                latency_ms=c.get("latency_ms"),
                error=c.get("error"),
            ))
        report.contracts = ContractResults(
            total=raw.get("total", 0),
            passed=raw.get("passed", 0),
            failed=raw.get("failed", 0),
            errors=raw.get("errors", 0),
            server_started=raw.get("server_started", False),
            server_error=raw.get("server_error"),
            cases=cases,
        )

    # ── qualitative-comparison.yaml ──────────────────────────────
    raw = _load_yaml(run_folder / "qualitative-comparison.yaml")
    if raw:
        phases = []
        for p in raw.get("phases", []):
            docs = []
            for d in p.get("documents", []):
                docs.append(DocScore(
                    path=d.get("path", ""),
                    intent=d.get("intent_similarity", 0),
                    design=d.get("design_similarity", 0),
                    completeness=d.get("completeness", 0),
                    overall=d.get("overall", 0),
                    notes=d.get("notes", ""),
                ))
            phases.append(PhaseScore(
                phase=p.get("phase", ""),
                avg_intent=p.get("avg_intent", 0),
                avg_design=p.get("avg_design", 0),
                avg_completeness=p.get("avg_completeness", 0),
                avg_overall=p.get("avg_overall", 0),
                documents=docs,
            ))
        report.qualitative = QualitativeResults(
            overall_score=raw.get("overall_score", 0),
            phases=phases,
            unmatched_reference=raw.get("unmatched_reference", []),
            unmatched_candidate=raw.get("unmatched_candidate", []),
        )

    return report
diff --git a/scripts/aidlc-evaluator/packages/reporting/src/reporting/render_html.py b/scripts/aidlc-evaluator/packages/reporting/src/reporting/render_html.py
new file mode 100644
index 00000000..362044de
--- /dev/null
+++ b/scripts/aidlc-evaluator/packages/reporting/src/reporting/render_html.py
"""Render a
ReportData into a self-contained HTML report with modern styling.""" + +from __future__ import annotations + +import html as html_mod +from pathlib import Path + +from reporting.collector import ReportData + +CSS = """ +:root { + --bg: #0f172a; --surface: #1e293b; --surface2: #334155; + --text: #e2e8f0; --text2: #94a3b8; --border: #475569; + --green: #22c55e; --green-bg: #052e16; --green-border: #166534; + --red: #ef4444; --red-bg: #450a0a; --red-border: #991b1b; + --yellow: #eab308; --yellow-bg: #422006; --yellow-border: #854d0e; + --blue: #3b82f6; --blue-bg: #172554; --blue-border: #1d4ed8; + --purple: #a855f7; --accent: #38bdf8; +} +*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; } +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; + background: var(--bg); color: var(--text); line-height: 1.6; + max-width: 1100px; margin: 0 auto; padding: 2rem 1.5rem; +} +h1 { font-size: 2rem; font-weight: 700; margin-bottom: .25rem; } +h2 { + font-size: 1.25rem; font-weight: 600; color: var(--accent); + margin: 2.5rem 0 1rem; padding-bottom: .5rem; border-bottom: 1px solid var(--border); +} +h3 { font-size: 1.05rem; font-weight: 600; margin: 1.5rem 0 .75rem; } +.subtitle { color: var(--text2); font-size: .9rem; margin-bottom: 2rem; } +code { + font-family: 'JetBrains Mono', 'Fira Code', monospace; font-size: .85em; + background: var(--surface2); padding: .15em .4em; border-radius: 4px; +} + +/* ── Cards ── */ +.card-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); gap: 1rem; margin-bottom: 1.5rem; } +.card { + background: var(--surface); border: 1px solid var(--border); border-radius: 12px; + padding: 1.25rem; transition: border-color .2s; +} +.card:hover { border-color: var(--accent); } +.card-label { font-size: .75rem; text-transform: uppercase; letter-spacing: .08em; color: var(--text2); margin-bottom: .5rem; } +.card-value { font-size: 1.75rem; font-weight: 700; } +.card-detail 
{ font-size: .8rem; color: var(--text2); margin-top: .25rem; } + +/* ── Badges ── */ +.badge { + display: inline-flex; align-items: center; gap: .35rem; + padding: .25rem .75rem; border-radius: 999px; font-size: .8rem; font-weight: 600; +} +.badge-pass { background: var(--green-bg); color: var(--green); border: 1px solid var(--green-border); } +.badge-fail { background: var(--red-bg); color: var(--red); border: 1px solid var(--red-border); } +.badge-warn { background: var(--yellow-bg); color: var(--yellow); border: 1px solid var(--yellow-border); } +.badge-info { background: var(--blue-bg); color: var(--blue); border: 1px solid var(--blue-border); } + +/* ── Progress bar ── */ +.progress-wrap { width: 100%; background: var(--surface2); border-radius: 6px; overflow: hidden; height: 10px; } +.progress-bar { height: 100%; border-radius: 6px; transition: width .4s ease; } +.progress-green { background: linear-gradient(90deg, #16a34a, #22c55e); } +.progress-yellow { background: linear-gradient(90deg, #ca8a04, #eab308); } +.progress-red { background: linear-gradient(90deg, #dc2626, #ef4444); } + +/* ── Tables ── */ +table { width: 100%; border-collapse: collapse; margin-bottom: 1.5rem; font-size: .875rem; } +th { text-align: left; padding: .6rem .75rem; background: var(--surface); color: var(--text2); + font-weight: 600; font-size: .75rem; text-transform: uppercase; letter-spacing: .05em; + border-bottom: 2px solid var(--border); } +td { padding: .55rem .75rem; border-bottom: 1px solid var(--surface2); } +tr:hover td { background: var(--surface); } +.num { text-align: right; font-variant-numeric: tabular-nums; } +.pass-icon::before { content: '\\2714'; color: var(--green); margin-right: .3rem; } +.fail-icon::before { content: '\\2718'; color: var(--red); margin-right: .3rem; } + +/* ── Accordion ── */ +details { margin: .5rem 0; } +details summary { + cursor: pointer; padding: .5rem .75rem; background: var(--surface); + border-radius: 8px; font-size: .85rem; color: 
var(--text2); + transition: background .2s; +} +details summary:hover { background: var(--surface2); } +details[open] summary { border-radius: 8px 8px 0 0; } +details .detail-body { background: var(--surface); padding: .75rem; border-radius: 0 0 8px 8px; + font-size: .82rem; line-height: 1.65; color: var(--text2); } + +/* ── Score ring ── */ +.score-ring { display: inline-flex; align-items: center; gap: .75rem; } +.ring-container { position: relative; width: 80px; height: 80px; } +.ring-container svg { transform: rotate(-90deg); } +.ring-container circle { fill: none; stroke-width: 6; } +.ring-bg { stroke: var(--surface2); } +.ring-fg { stroke-linecap: round; transition: stroke-dashoffset .6s ease; } +.ring-label { + position: absolute; inset: 0; display: flex; align-items: center; justify-content: center; + font-size: 1.1rem; font-weight: 700; +} + +/* ── Phase bar chart ── */ +.phase-bars { display: flex; gap: 2rem; margin: 1rem 0; } +.phase-bar-group { flex: 1; } +.phase-bar-title { font-size: .8rem; font-weight: 600; margin-bottom: .5rem; text-transform: capitalize; } +.bar-row { display: flex; align-items: center; gap: .5rem; margin: .35rem 0; } +.bar-row-label { width: 80px; font-size: .75rem; color: var(--text2); text-align: right; } +.bar-track { flex: 1; height: 8px; background: var(--surface2); border-radius: 4px; overflow: hidden; } +.bar-fill { height: 100%; border-radius: 4px; } +.bar-val { width: 35px; font-size: .75rem; font-weight: 600; } + +.footer { margin-top: 3rem; padding-top: 1rem; border-top: 1px solid var(--border); + color: var(--text2); font-size: .75rem; text-align: center; } + +/* ── Comparison ── */ +.cmp-summary { display: flex; gap: 1.5rem; margin-bottom: 1.5rem; } +.cmp-stat { text-align: center; } +.cmp-stat-val { font-size: 1.75rem; font-weight: 700; } +.cmp-stat-label { font-size: .75rem; color: var(--text2); text-transform: uppercase; letter-spacing: .05em; } +.delta-improved { color: var(--green); } +.delta-regressed { color: 
var(--red); } +.delta-unchanged { color: var(--text2); } +.delta-dot { display: inline-block; width: 8px; height: 8px; border-radius: 50%; margin-right: .4rem; vertical-align: middle; } +.dot-improved { background: var(--green); } +.dot-regressed { background: var(--red); } +.dot-unchanged { background: var(--text2); } +""" + + +def _esc(s: str) -> str: + return html_mod.escape(str(s)) + + +def _ms_to_human(ms: int) -> str: + secs = ms / 1000 + if secs < 60: + return f"{secs:.0f}s" + mins = secs / 60 + if mins < 60: + return f"{mins:.1f}m" + return f"{mins / 60:.1f}h" + + +def _fmt_tokens(n: int) -> str: + if n >= 1_000_000: + return f"{n / 1_000_000:.1f}M" + if n >= 1_000: + return f"{n / 1_000:.0f}K" + return str(n) + + +def _score_color(score: float) -> str: + if score >= 0.8: + return "var(--green)" + if score >= 0.6: + return "var(--yellow)" + return "var(--red)" + + +def _progress_class(ratio: float) -> str: + if ratio >= 0.9: + return "progress-green" + if ratio >= 0.7: + return "progress-yellow" + return "progress-red" + + +def _fmt_val_html(v: float | int | None) -> str: + if v is None: + return "---" + if isinstance(v, float): + return f"{v:.4f}" if v < 10 else f"{v:,.0f}" + return f"{v:,}" + + +def _score_ring(score: float, size: int = 80) -> str: + r = (size - 6) / 2 + circ = 2 * 3.14159 * r + offset = circ * (1 - score) + color = _score_color(score) + return f"""<div class="ring-container" style="width:{size}px;height:{size}px"> + <svg width="{size}" height="{size}" viewBox="0 0 {size} {size}"> + <circle class="ring-bg" cx="{size//2}" cy="{size//2}" r="{r}"/> + <circle class="ring-fg" cx="{size//2}" cy="{size//2}" r="{r}" + stroke="{color}" stroke-dasharray="{circ:.1f}" stroke-dashoffset="{offset:.1f}"/> + </svg> + <div class="ring-label" style="color:{color}">{score:.0%}</div> +</div>""" + + +def _badge(label: str, cls: str) -> str: + return f'<span class="badge badge-{cls}">{_esc(label)}</span>' + + +def _fmt_delta_val(delta: float, metric_name: str) 
-> str: + """Format a delta value with appropriate units for the metric.""" + sign = "+" if delta > 0 else "" + if metric_name == "Wall Clock (ms)": + abs_ms = abs(delta) + if abs_ms >= 60_000: + return f"{sign}{delta / 60_000:.1f}m" + return f"{sign}{delta / 1_000:.1f}s" + if "Tokens" in metric_name: + abs_t = abs(delta) + if abs_t >= 1_000_000: + return f"{sign}{delta / 1_000_000:.2f}M" + if abs_t >= 1_000: + return f"{sign}{delta / 1_000:.1f}k" + return f"{sign}{int(delta)}" + if isinstance(delta, float) and not delta.is_integer(): + return f"{sign}{delta:.3f}" + return f"{sign}{int(delta)}" + + +def _delta_tag(cmp, metric_name: str) -> str: + """Return an HTML snippet showing the delta vs golden for a named metric.""" + if cmp is None: + return "" + for d in cmp.deltas: + if d.name == metric_name and d.delta is not None and abs(d.delta) > 0.001: + val = _fmt_delta_val(d.delta, metric_name) + cls = f"delta-{d.direction}" + return f' <span class="{cls}" style="font-size:.7rem;font-weight:600">{val} vs golden</span>' + return ' <span class="delta-unchanged" style="font-size:.7rem">= golden</span>' + + +def render_html(data: ReportData) -> str: + out: list[str] = [] + w = out.append + + run_name = Path(data.meta.run_folder).name if data.meta.run_folder else "unknown" + cmp = data.comparison + + w("<!DOCTYPE html>") + w(f'<html lang="en"><head><meta charset="utf-8">') + w(f'<meta name="viewport" content="width=device-width,initial-scale=1">') + w(f"<title>AIDLC Report — {_esc(run_name)}") + w(f"") + w(f"") + w(f"") + w("") + + # ── Header ───────────────────────────────────────────────── + w(f"

AIDLC Evaluation Report

") + w(f'
{_esc(run_name)} · {_esc(data.generated_at)}
') + + # ── Test metadata ────────────────────────────────────────── + w('') + w(f'') + w(f'') + if data.meta.rules_source == "git" and data.meta.rules_repo: + w(f'') + elif data.meta.rules_source == "local" and data.meta.rules_local_path: + w(f'') + elif data.meta.rules_source: + w(f'') + w('
Executor Model{_esc(data.meta.executor_model)}
Simulator Model{_esc(data.meta.simulator_model)}
Rules Source{_esc(data.meta.rules_repo)} @ {_esc(data.meta.rules_ref)}
Rules Sourcelocal: {_esc(data.meta.rules_local_path)}
Rules Source{_esc(data.meta.rules_source)}
') + + # ── Verdict Cards ────────────────────────────────────────── + test_ok = data.tests and data.tests.test_ok and data.tests.failed == 0 + contract_ok = data.contracts and data.contracts.failed == 0 and data.contracts.errors == 0 + qual_score = data.qualitative.overall_score if data.qualitative else 0 + + w('
') + + if data.tests: + t = data.tests + cls = "pass" if test_ok else "fail" + w(f'
Unit Tests
') + w(f'
{_badge(f"{t.pass_pct:.1f}% ({t.passed}/{t.total})", cls)}{_delta_tag(cmp, "Tests Pass %")}
') + if t.coverage_pct is not None: + w(f'
Coverage: {t.coverage_pct:.1f}%{_delta_tag(cmp, "Coverage %")}
') + w("
") + + if data.contracts: + ct = data.contracts + cls = "pass" if contract_ok else "fail" + w(f'
Contract Tests
') + w(f'
{_badge(f"{ct.passed}/{ct.total}", cls)}{_delta_tag(cmp, "Contract Passed")}
') + w(f'
API endpoints validated
') + w("
") + + if data.quality: + q = data.quality + q_ok = q.lint_errors == 0 and q.security_high == 0 + cls = "pass" if q_ok else "warn" + w(f'
Code Quality
') + w(f'
{_badge(f"{q.lint_total} lint / {q.security_total} security", cls)}{_delta_tag(cmp, "Lint Errors")}
') + w(f'
{q.lint_errors} errors, {q.security_high} high severity
') + w("
") + + if data.qualitative: + w(f'
Qualitative Score
') + w(f'
') + w(_score_ring(qual_score, 64)) + w(f'{_delta_tag(cmp, "Qualitative Score")}') + w(f"
") + + # Timing + tokens + w(f'
Execution Time
') + w(f'
{_ms_to_human(data.metrics.wall_clock_ms)}{_delta_tag(cmp, "Wall Clock (ms)")}
') + w(f'
{data.meta.total_handoffs} handoffs
') + w("
") + + w(f'
Total Tokens
') + w(f'
{_fmt_tokens(data.metrics.total_tokens.total_tokens)}{_delta_tag(cmp, "Total Tokens")}
') + w(f'
in: {_fmt_tokens(data.metrics.total_tokens.input_tokens)} / out: {_fmt_tokens(data.metrics.total_tokens.output_tokens)}
') + w("
") + + w("
") # card-grid + + # ── Run Overview ─────────────────────────────────────────── + w("

Run Overview

") + w("") + rows = [ + ("Status", f"{_esc(data.meta.status)}"), + ("Executor", f"{_esc(data.meta.executor_model)}"), + ("Simulator", f"{_esc(data.meta.simulator_model)}"), + ("Region", f"{_esc(data.meta.aws_region)}"), + ("Handoffs", f"{data.meta.total_handoffs} ({' → '.join(_esc(n) for n in data.meta.node_history)})"), + ] + for label, val in rows: + w(f"") + w("
{label}{val}
") + + # ── Handoff Timeline ─────────────────────────────────────── + if data.metrics.handoffs: + total_ms = data.metrics.wall_clock_ms or 1 + w("

Handoff Timeline

") + w('
') + colors = {"executor": "var(--blue)", "simulator": "var(--purple)"} + for h in data.metrics.handoffs: + pct = max(h.duration_ms / total_ms * 100, 2) + col = colors.get(h.node_id, "var(--accent)") + w(f'
{_esc(h.node_id[0].upper())}{h.handoff}
') + w("
") + w("") + for h in data.metrics.handoffs: + pct = h.duration_ms / total_ms * 100 + w(f'' + f'') + w("
#AgentDuration% of Total
{h.handoff}{_esc(h.node_id)}{_ms_to_human(h.duration_ms)}{pct:.1f}%
") + + # ── Token Breakdown ──────────────────────────────────────── + w("

Token Usage

") + w("

Unique Tokens by Agent

") + w("") + for name, tok in [("Executor", data.metrics.executor_tokens), ("Simulator", data.metrics.simulator_tokens), ("Total Unique", data.metrics.total_tokens)]: + w(f"" + f"") + w("
AgentInputOutputTotal
{name}{_fmt_tokens(tok.input_tokens)}{_fmt_tokens(tok.output_tokens)}{_fmt_tokens(tok.total_tokens)}
") + + # Show repeated context if present + if data.metrics.repeated_context_tokens.total_tokens > 0: + w("

Context Repetition

") + w("

Tokens re-sent across multiple conversation turns:

") + w("") + w(f"" + f"" + f"") + w(f"" + f"" + f"") + w("
CategoryInputOutputTotal
Repeated Context{_fmt_tokens(data.metrics.repeated_context_tokens.input_tokens)}{_fmt_tokens(data.metrics.repeated_context_tokens.output_tokens)}{_fmt_tokens(data.metrics.repeated_context_tokens.total_tokens)}
API Total{_fmt_tokens(data.metrics.api_total_tokens.input_tokens)}{_fmt_tokens(data.metrics.api_total_tokens.output_tokens)}{_fmt_tokens(data.metrics.api_total_tokens.total_tokens)}
") + + # ── Context Size ────────────────────────────────────────── + ctx_total = data.metrics.context_size_total + if ctx_total and ctx_total.sample_count > 0: + ctx_ex = data.metrics.context_size_executor + ctx_si = data.metrics.context_size_simulator + w("

Context Size (Input Tokens per Invocation)

") + w("" + "") + if ctx_ex and ctx_ex.sample_count > 0: + w(f"" + f"" + f"") + if ctx_si and ctx_si.sample_count > 0: + w(f"" + f"" + f"") + w(f"" + f"" + f"" + f"" + f"") + w("
AgentMinMaxAverageMedianSamples
Executor{_fmt_tokens(ctx_ex.min_tokens)}{_fmt_tokens(ctx_ex.max_tokens)}{_fmt_tokens(ctx_ex.avg_tokens)}{_fmt_tokens(ctx_ex.median_tokens)}{ctx_ex.sample_count}
Simulator{_fmt_tokens(ctx_si.min_tokens)}{_fmt_tokens(ctx_si.max_tokens)}{_fmt_tokens(ctx_si.avg_tokens)}{_fmt_tokens(ctx_si.median_tokens)}{ctx_si.sample_count}
Total{_fmt_tokens(ctx_total.min_tokens)}{_fmt_tokens(ctx_total.max_tokens)}{_fmt_tokens(ctx_total.avg_tokens)}{_fmt_tokens(ctx_total.median_tokens)}{ctx_total.sample_count}
") + + # ── Unit Tests ───────────────────────────────────────────── + if data.tests: + t = data.tests + w("

Unit Tests

") + ratio = t.passed / t.total if t.total else 0 + w(f'
') + w(f'{_badge(f"{t.pass_pct:.1f}% passed ({t.passed}/{t.total})", "pass" if test_ok else "fail")}') + if t.coverage_pct is not None: + cov_cls = "pass" if t.coverage_pct >= 90 else "warn" if t.coverage_pct >= 70 else "fail" + w(f'{_badge(f"{t.coverage_pct:.1f}% coverage", cov_cls)}') + w("
") + w(f'
') + + # ── Contract Tests ───────────────────────────────────────── + if data.contracts: + ct = data.contracts + w("

Contract Tests

") + ratio = ct.passed / ct.total if ct.total else 0 + w(f'
') + w(f'{_badge(f"{ct.passed}/{ct.total} passed", "pass" if contract_ok else "fail")}') + if ct.failed: + w(f'{_badge(f"{ct.failed} failed", "fail")}') + if ct.errors: + w(f'{_badge(f"{ct.errors} errors", "fail")}') + w("
") + w(f'
') + + groups: dict[str, list] = {} + for c in ct.cases: + parts = c.path.strip("/").split("/") + group = parts[2] if len(parts) >= 3 else parts[0] + groups.setdefault(group, []).append(c) + + for group_name, cases in groups.items(): + passed_g = sum(1 for c in cases if c.passed) + total_g = len(cases) + ok_g = passed_g == total_g + w(f"

{_esc(group_name.title())} {_badge(f'{passed_g}/{total_g}', 'pass' if ok_g else 'fail')}

") + w("") + for c in cases: + icon_cls = "pass-icon" if c.passed else "fail-icon" + status_str = str(c.actual_status) if c.actual_status else "---" + lat = f"{c.latency_ms:.0f}ms" if c.latency_ms else "---" + w(f'' + f'') + w("
TestMethodPathStatusLatency
{_esc(c.name)}{c.method}{_esc(c.path)}{status_str}{lat}
") + + # ── Code Quality ─────────────────────────────────────────── + if data.quality: + q = data.quality + w("

Code Quality

") + q_ok = q.lint_errors == 0 and q.security_high == 0 + w(f'
') + w(f'{_badge(f"{q.lint_errors} lint errors", "pass" if q.lint_errors == 0 else "fail")}') + w(f'{_badge(f"{q.lint_warnings} warnings", "warn" if q.lint_warnings else "pass")}') + w(f'{_badge(f"{q.security_total} security findings", "pass" if q.security_high == 0 else "fail")}') + if q.lint_available: + w(f'{_badge(f"{_esc(q.lint_tool)} {_esc(q.lint_version)}", "info")}') + if q.semgrep_available: + w(f'{_badge("semgrep", "info")}') + w("
") + + if q.lint_findings: + w("

Lint Findings

") + w("") + for f in q.lint_findings: + sev_cls = "fail" if f.severity == "error" else "warn" + w(f'' + f'' + f'') + w("
FileLineCodeMessageSeverity
{_esc(f.file)}{f.line}{_esc(f.code)}{_esc(f.message)}{_badge(f.severity, sev_cls)}
") + + if q.duplication_available: + dup_ok = q.duplication_blocks == 0 + w("

Code Duplication

") + w(f'
') + w(f'{_badge(f"{q.duplication_blocks} duplicate blocks", "pass" if dup_ok else "warn")}') + w(f'{_badge(f"{q.duplication_lines} duplicated lines", "info")}') + w("
") + + # ── Qualitative Evaluation ───────────────────────────────── + if data.qualitative: + ql = data.qualitative + w("

Qualitative Evaluation

") + w(f'
') + w(_score_ring(ql.overall_score)) + w(f'
Overall Score
' + f'
Semantic similarity to golden baseline
') + w("
") + + if ql.phases: + w('
') + for phase in ql.phases: + w(f'
{_esc(phase.phase)}
') + for dim, val in [("Intent", phase.avg_intent), ("Design", phase.avg_design), + ("Complete", phase.avg_completeness), ("Overall", phase.avg_overall)]: + col = _score_color(val) + w(f'
{dim}
' + f'
' + f'
{val:.2f}
') + w("
") + w("
") + + for phase in ql.phases: + w(f"

{_esc(phase.phase.title())} Phase — Documents

") + w("" + "") + for d in phase.documents: + name = Path(d.path).name + w(f'' + f'' + f'' + f'' + f'') + w("
DocumentIntentDesignCompletenessOverall
{_esc(name)}{d.intent:.2f}{d.design:.2f}{d.completeness:.2f}{d.overall:.2f}
") + + for d in phase.documents: + if d.notes: + name = Path(d.path).name + w(f'
{_esc(name)} — {d.overall:.2f}') + w(f'
{_esc(d.notes)}
') + + # ── Artifacts ────────────────────────────────────────────── + art = data.metrics.artifacts + if art.total_files > 0: + w("

Generated Artifacts

") + w('
') + for label, val in [("Source Files", art.source_files), ("Test Files", art.test_files), + ("Config Files", art.config_files), ("Total Files", art.total_files), + ("Lines of Code", f"{art.total_lines_of_code:,}"), + ("AIDLC Docs", art.total_doc_files)]: + w(f'
{label}
{val}
') + w("
") + + # ── Baseline Comparison ────────────────────────────────────── + if data.comparison: + cmp = data.comparison + golden_name = Path(cmp.golden_run).name if cmp.golden_run else "unknown" + + w("

Baseline Comparison

") + w(f'
' + f'vs golden {_esc(golden_name)}') + if cmp.golden_promoted_at: + w(f' · promoted {_esc(cmp.golden_promoted_at)}') + w("
") + + w('
') + w(f'
{cmp.improved}
Improved
') + w(f'
{cmp.regressed}
Regressed
') + w(f'
{cmp.unchanged}
Unchanged
') + w("
") + + current_cat = "" + for d in cmp.deltas: + if d.category != current_cat: + if current_cat: + w("") + current_cat = d.category + w(f"

{_esc(d.category)}

") + w("" + "") + + dot_cls = f"dot-{d.direction}" + delta_cls = f"delta-{d.direction}" + golden_str = _fmt_val_html(d.golden) + current_str = _fmt_val_html(d.current) + if d.delta is not None: + sign = "+" if d.delta > 0 else "" + if isinstance(d.delta, float) and not d.delta.is_integer(): + delta_str = f"{sign}{d.delta:.4f}" + else: + delta_str = f"{sign}{int(d.delta)}" + pct_str = f" ({d.pct_change:+.1f}%)" if d.pct_change is not None and abs(d.pct_change) >= 0.1 else "" + else: + delta_str = "---" + pct_str = "" + + w(f'' + f'' + f'' + f'' + f'' + f'') + + if current_cat: + w("
MetricGoldenCurrentDeltaChange
{_esc(d.name)}{golden_str}{current_str}{delta_str}{pct_str}{d.direction}
") + + # ── Footer ───────────────────────────────────────────────── + w(f'') + w("") + + return "\n".join(out) + + +def write_html(data: ReportData, output_path: Path) -> None: + html_str = render_html(data) + with open(output_path, "w", encoding="utf-8") as f: + f.write(html_str) diff --git a/scripts/aidlc-evaluator/packages/reporting/src/reporting/render_md.py b/scripts/aidlc-evaluator/packages/reporting/src/reporting/render_md.py new file mode 100644 index 00000000..3505e294 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/src/reporting/render_md.py @@ -0,0 +1,429 @@ +"""Render a ReportData into polished GitHub-flavoured Markdown.""" + +from __future__ import annotations + +from pathlib import Path + +from reporting.collector import ReportData + + +def _ms_to_human(ms: int) -> str: + secs = ms / 1000 + if secs < 60: + return f"{secs:.0f}s" + mins = secs / 60 + if mins < 60: + return f"{mins:.1f}m" + hrs = mins / 60 + return f"{hrs:.1f}h" + + +def _fmt_tokens(n: int) -> str: + if n >= 1_000_000: + return f"{n / 1_000_000:.1f}M" + if n >= 1_000: + return f"{n / 1_000:.0f}K" + return str(n) + + +def _pct_bar(value: float, width: int = 20) -> str: + filled = round(value * width) + return "\u2588" * filled + "\u2591" * (width - filled) + + +def _fmt_val(v: float | int | None) -> str: + if v is None: + return "---" + if isinstance(v, float): + return f"{v:.4f}" if v < 10 else f"{v:,.0f}" + return f"{v:,}" + + +def _status_icon(ok: bool) -> str: + return "\u2705" if ok else "\u274c" + + +def _fmt_delta_val(delta: float, metric_name: str) -> str: + """Format a delta value with appropriate units for the metric.""" + sign = "+" if delta > 0 else "" + if metric_name == "Wall Clock (ms)": + abs_ms = abs(delta) + if abs_ms >= 60_000: + return f"{sign}{delta / 60_000:.1f}m" + return f"{sign}{delta / 1_000:.1f}s" + if "Tokens" in metric_name: + abs_t = abs(delta) + if abs_t >= 1_000_000: + return f"{sign}{delta / 1_000_000:.2f}M" + if abs_t >= 1_000: + return 
f"{sign}{delta / 1_000:.1f}k" + return f"{sign}{int(delta)}" + if isinstance(delta, float) and not delta.is_integer(): + return f"{sign}{delta:.3f}" + return f"{sign}{int(delta)}" + + +def _md_delta(cmp, metric_name: str) -> str: + """Return a short inline delta string for the verdict table.""" + if cmp is None: + return "" + for d in cmp.deltas: + if d.name == metric_name and d.delta is not None and abs(d.delta) > 0.001: + val = _fmt_delta_val(d.delta, metric_name) + icon = "\U0001f7e2" if d.direction == "improved" else "\U0001f534" if d.direction == "regressed" else "\u26aa" + return f" {icon} _{val} vs golden_" + return " \u26aa _= golden_" + + +def render_markdown(data: ReportData) -> str: + lines: list[str] = [] + w = lines.append + + run_name = Path(data.meta.run_folder).name if data.meta.run_folder else "unknown" + cmp = data.comparison + + # ── Header ───────────────────────────────────────────────── + w(f"# AIDLC Evaluation Report") + w("") + w(f"> **Run:** `{run_name}`") + w(f"> **Generated:** {data.generated_at}") + w("") + + # ── Test metadata ────────────────────────────────────────── + w("| | |") + w("|---|---|") + w(f"| **Executor Model** | `{data.meta.executor_model}` |") + w(f"| **Simulator Model** | `{data.meta.simulator_model}` |") + if data.meta.rules_source == "git" and data.meta.rules_repo: + w(f"| **Rules Source** | `{data.meta.rules_repo}` @ `{data.meta.rules_ref}` |") + elif data.meta.rules_source == "local" and data.meta.rules_local_path: + w(f"| **Rules Source** | local: `{data.meta.rules_local_path}` |") + elif data.meta.rules_source: + w(f"| **Rules Source** | `{data.meta.rules_source}` |") + w("") + + # ── Verdict banner ───────────────────────────────────────── + test_ok = data.tests and data.tests.test_ok and data.tests.failed == 0 + contract_ok = data.contracts and data.contracts.failed == 0 and data.contracts.errors == 0 + qual_score = data.qualitative.overall_score if data.qualitative else 0 + overall_ok = (test_ok is not False) 
and (contract_ok is not False) and qual_score >= 0.7 + + w("## Verdict") + w("") + w(f"| Dimension | Result |") + w(f"|-----------|--------|") + if data.tests: + w(f"| Unit Tests | {_status_icon(test_ok)} **{data.tests.pass_pct:.1f}%** ({data.tests.passed}/{data.tests.total}){_md_delta(cmp, 'Tests Pass %')} |") + if data.contracts: + w(f"| Contract Tests | {_status_icon(contract_ok)} **{data.contracts.passed}/{data.contracts.total}** passed{_md_delta(cmp, 'Contract Passed')} |") + if data.quality: + q_ok = data.quality.lint_errors == 0 and data.quality.security_high == 0 + w(f"| Code Quality | {_status_icon(q_ok)} lint: {data.quality.lint_total} ({data.quality.lint_errors} errors), security: {data.quality.security_total} ({data.quality.security_high} high){_md_delta(cmp, 'Lint Errors')} |") + if data.qualitative: + w(f"| Qualitative Score | {'🟢' if qual_score >= 0.8 else '🟡' if qual_score >= 0.6 else '🔴'} **{qual_score:.2f}**{_md_delta(cmp, 'Qualitative Score')} |") + w(f"| Execution Time | {_ms_to_human(data.metrics.wall_clock_ms)}{_md_delta(cmp, 'Wall Clock (ms)')} |") + w(f"| Total Tokens | {_fmt_tokens(data.metrics.total_tokens.total_tokens)}{_md_delta(cmp, 'Total Tokens')} |") + w("") + + # ── Run Overview ─────────────────────────────────────────── + w("## Run Overview") + w("") + w("| Property | Value |") + w("|----------|-------|") + w(f"| Status | `{data.meta.status}` |") + w(f"| Executor Model | `{data.meta.executor_model}` |") + w(f"| Simulator Model | `{data.meta.simulator_model}` |") + w(f"| Region | `{data.meta.aws_region}` |") + w(f"| Wall Clock | {_ms_to_human(data.metrics.wall_clock_ms)} |") + w(f"| Handoffs | {data.meta.total_handoffs} ({' → '.join(data.meta.node_history)}) |") + if data.meta.started_at: + w(f"| Started | {data.meta.started_at} |") + if data.meta.completed_at: + w(f"| Completed | {data.meta.completed_at} |") + w("") + + # ── Token Usage ──────────────────────────────────────────── + w("## Token Usage") + w("") + w("### Unique 
Tokens by Agent") + w("") + w("| Agent | Input | Output | Total |") + w("|-------|------:|-------:|------:|") + w(f"| Executor | {_fmt_tokens(data.metrics.executor_tokens.input_tokens)} | {_fmt_tokens(data.metrics.executor_tokens.output_tokens)} | {_fmt_tokens(data.metrics.executor_tokens.total_tokens)} |") + w(f"| Simulator | {_fmt_tokens(data.metrics.simulator_tokens.input_tokens)} | {_fmt_tokens(data.metrics.simulator_tokens.output_tokens)} | {_fmt_tokens(data.metrics.simulator_tokens.total_tokens)} |") + w(f"| **Total Unique** | **{_fmt_tokens(data.metrics.total_tokens.input_tokens)}** | **{_fmt_tokens(data.metrics.total_tokens.output_tokens)}** | **{_fmt_tokens(data.metrics.total_tokens.total_tokens)}** |") + w("") + + # Show repeated context if present + if data.metrics.repeated_context_tokens.total_tokens > 0: + w("### Context Repetition") + w("") + w("Tokens re-sent across multiple conversation turns:") + w("") + w("| Category | Input | Output | Total |") + w("|----------|------:|-------:|------:|") + w(f"| Repeated Context | {_fmt_tokens(data.metrics.repeated_context_tokens.input_tokens)} | {_fmt_tokens(data.metrics.repeated_context_tokens.output_tokens)} | {_fmt_tokens(data.metrics.repeated_context_tokens.total_tokens)} |") + w(f"| **API Total** | **{_fmt_tokens(data.metrics.api_total_tokens.input_tokens)}** | **{_fmt_tokens(data.metrics.api_total_tokens.output_tokens)}** | **{_fmt_tokens(data.metrics.api_total_tokens.total_tokens)}** |") + w("") + w("") + + # ── Context Size ────────────────────────────────────────── + ctx_total = data.metrics.context_size_total + if ctx_total and ctx_total.sample_count > 0: + ctx_ex = data.metrics.context_size_executor + ctx_si = data.metrics.context_size_simulator + w("## Context Size (Input Tokens per Invocation)") + w("") + w("| Agent | Min | Max | Average | Median | Samples |") + w("|-------|----:|----:|--------:|-------:|--------:|") + if ctx_ex and ctx_ex.sample_count > 0: + w(f"| Executor | 
{_fmt_tokens(ctx_ex.min_tokens)} | {_fmt_tokens(ctx_ex.max_tokens)} | {_fmt_tokens(ctx_ex.avg_tokens)} | {_fmt_tokens(ctx_ex.median_tokens)} | {ctx_ex.sample_count} |") + if ctx_si and ctx_si.sample_count > 0: + w(f"| Simulator | {_fmt_tokens(ctx_si.min_tokens)} | {_fmt_tokens(ctx_si.max_tokens)} | {_fmt_tokens(ctx_si.avg_tokens)} | {_fmt_tokens(ctx_si.median_tokens)} | {ctx_si.sample_count} |") + w(f"| **Total** | **{_fmt_tokens(ctx_total.min_tokens)}** | **{_fmt_tokens(ctx_total.max_tokens)}** | **{_fmt_tokens(ctx_total.avg_tokens)}** | **{_fmt_tokens(ctx_total.median_tokens)}** | **{ctx_total.sample_count}** |") + w("") + + # ── Handoff Timeline ─────────────────────────────────────── + if data.metrics.handoffs: + w("## Handoff Timeline") + w("") + w("| # | Agent | Duration |") + w("|--:|-------|----------|") + for h in data.metrics.handoffs: + w(f"| {h.handoff} | {h.node_id} | {_ms_to_human(h.duration_ms)} |") + w("") + + # ── Generated Artifacts ──────────────────────────────────── + art = data.metrics.artifacts + if art.total_files > 0: + w("## Generated Artifacts") + w("") + w("| Category | Count |") + w("|----------|------:|") + w(f"| Source files | {art.source_files} |") + w(f"| Test files | {art.test_files} |") + w(f"| Config files | {art.config_files} |") + w(f"| Total files | {art.total_files} |") + w(f"| Lines of code | {art.total_lines_of_code:,} |") + w(f"| AIDLC docs (inception) | {art.inception_files} |") + w(f"| AIDLC docs (construction) | {art.construction_files} |") + w(f"| AIDLC docs total | {art.total_doc_files} |") + w("") + + # ── Unit Tests ───────────────────────────────────────────── + if data.tests: + t = data.tests + w("## Unit Tests") + w("") + w(f"**{_status_icon(test_ok)} {t.pass_pct:.1f}% passed** ({t.passed}/{t.total})") + if t.failed: + w(f" — {t.failed} failed") + if t.coverage_pct is not None: + w(f"") + w(f"**Coverage:** {t.coverage_pct:.1f}%") + w("") + + # ── Contract Tests ───────────────────────────────────────── + if 
data.contracts: + ct = data.contracts + w("## Contract Tests (API Specification)") + w("") + w(f"**{_status_icon(contract_ok)} {ct.passed}/{ct.total}** endpoints validated") + w("") + if ct.server_error: + w(f"> **Server error:** {ct.server_error}") + w("") + + groups: dict[str, list] = {} + for c in ct.cases: + parts = c.path.strip("/").split("/") + group = parts[2] if len(parts) >= 3 else parts[0] + groups.setdefault(group, []).append(c) + + for group_name, cases in groups.items(): + passed_in_group = sum(1 for c in cases if c.passed) + total_in_group = len(cases) + icon = _status_icon(passed_in_group == total_in_group) + w(f"### {group_name.title()} {icon} {passed_in_group}/{total_in_group}") + w("") + w("| Test | Method | Path | Status | Latency |") + w("|------|--------|------|:------:|--------:|") + for c in cases: + mark = _status_icon(c.passed) + status_str = str(c.actual_status) if c.actual_status else "---" + lat = f"{c.latency_ms:.0f}ms" if c.latency_ms else "---" + w(f"| {mark} {c.name} | {c.method} | `{c.path}` | {status_str} | {lat} |") + w("") + for c in cases: + if not c.passed and (c.failures or c.error): + detail = "; ".join(c.failures) if c.failures else c.error + w(f"> **{c.name}:** {detail}") + w("") + + # ── Code Quality ─────────────────────────────────────────── + if data.quality: + q = data.quality + q_ok = q.lint_errors == 0 and q.security_high == 0 + w("## Code Quality") + w("") + w(f"**{_status_icon(q_ok)} Lint: {q.lint_total} findings** ({q.lint_errors} errors, {q.lint_warnings} warnings)") + w("") + if q.lint_available and q.lint_findings: + w(f"**Linter:** {q.lint_tool} {q.lint_version}") + w("") + w("| File | Line | Code | Message | Severity |") + w("|------|-----:|------|---------|----------|") + for f in q.lint_findings: + sev_icon = "🔴" if f.severity == "error" else "🟡" + w(f"| `{f.file}` | {f.line} | `{f.code}` | {f.message} | {sev_icon} {f.severity} |") + w("") + + w("### Security") + w("") + sec_ok = q.security_high == 0 + 
w(f"**{_status_icon(sec_ok)} {q.security_total} finding(s)** ({q.security_high} high)") + w("") + if not q.security_available: + w(f"*Security scanner ({q.security_tool or 'bandit'}) was not available.*") + w("") + if q.semgrep_available: + w(f"*Semgrep: {q.semgrep_total} finding(s)*") + w("") + elif q.semgrep_tool: + w(f"*Semgrep was not available.*") + w("") + + w("### Code Duplication") + w("") + if q.duplication_available: + dup_ok = q.duplication_blocks == 0 + w(f"**{_status_icon(dup_ok)} {q.duplication_blocks} duplicate block(s)** ({q.duplication_lines} duplicated lines)") + else: + w(f"*Duplication scanner ({q.duplication_tool or 'pmd-cpd'}) was not available.*") + w("") + + # ── Qualitative Evaluation ───────────────────────────────── + if data.qualitative: + ql = data.qualitative + w("## Qualitative Evaluation (Semantic Similarity)") + w("") + score_icon = "🟢" if ql.overall_score >= 0.8 else "🟡" if ql.overall_score >= 0.6 else "🔴" + w(f"**Overall Score: {score_icon} {ql.overall_score:.4f}**") + w("") + + for phase in ql.phases: + w(f"### {phase.phase.title()} Phase") + w("") + w(f"| Dimension | Score |") + w(f"|-----------|------:|") + w(f"| Intent | {phase.avg_intent:.2f} |") + w(f"| Design | {phase.avg_design:.2f} |") + w(f"| Completeness | {phase.avg_completeness:.2f} |") + w(f"| **Overall** | **{phase.avg_overall:.2f}** |") + w("") + + w("| Document | Intent | Design | Complete | Overall |") + w("|----------|-------:|-------:|---------:|--------:|") + for d in phase.documents: + name = Path(d.path).name + w(f"| `{name}` | {d.intent:.2f} | {d.design:.2f} | {d.completeness:.2f} | {d.overall:.2f} |") + w("") + + for d in phase.documents: + if d.notes: + name = Path(d.path).name + short = d.notes[:200] + "..." if len(d.notes) > 200 else d.notes + w(f"
{name} — {d.overall:.2f}") + w(f"") + w(f"{d.notes}") + w(f"") + w(f"
") + w("") + + if ql.unmatched_candidate: + w("### Unmatched Candidate Documents") + w("") + for p in ql.unmatched_candidate: + w(f"- `{p}`") + w("") + + # ── Errors ───────────────────────────────────────────────── + errs = data.metrics.errors + if errs and any(v > 0 for v in errs.values()): + w("## Errors During Execution") + w("") + w("| Error Type | Count |") + w("|------------|------:|") + for k, v in errs.items(): + if v > 0: + w(f"| {k.replace('_', ' ').title()} | {v} |") + w("") + + # ── Baseline Comparison ────────────────────────────────────── + if data.comparison: + cmp = data.comparison + w("## Baseline Comparison") + w("") + golden_name = Path(cmp.golden_run).name if cmp.golden_run else "unknown" + w(f"> Compared against golden baseline: `{golden_name}`") + if cmp.golden_promoted_at: + w(f"> Promoted: {cmp.golden_promoted_at}") + w("") + + improved_icon = "\U0001f7e2" # green circle + regressed_icon = "\U0001f534" # red circle + unchanged_icon = "\u26aa" # white circle + + w(f"| | Count |") + w(f"|---|------:|") + w(f"| {improved_icon} Improved | {cmp.improved} |") + w(f"| {regressed_icon} Regressed | {cmp.regressed} |") + w(f"| {unchanged_icon} Unchanged | {cmp.unchanged} |") + w("") + + categories_seen: set[str] = set() + for d in cmp.deltas: + if d.category not in categories_seen: + categories_seen.add(d.category) + w(f"### {d.category}") + w("") + w("| Metric | Golden | Current | Delta | Change |") + w("|--------|-------:|--------:|------:|--------|") + + if d.direction == "improved": + icon = improved_icon + elif d.direction == "regressed": + icon = regressed_icon + else: + icon = unchanged_icon + + golden_str = _fmt_val(d.golden) + current_str = _fmt_val(d.current) + if d.delta is not None: + sign = "+" if d.delta > 0 else "" + delta_str = f"{sign}{d.delta:.2f}" if isinstance(d.delta, float) and not d.delta.is_integer() else f"{sign}{int(d.delta)}" + pct_str = f"({d.pct_change:+.1f}%)" if d.pct_change is not None and abs(d.pct_change) >= 0.1 else 
"" + change_str = f"{icon} {delta_str} {pct_str}".strip() + else: + change_str = f"{icon}" + + w(f"| {d.name} | {golden_str} | {current_str} | {change_str} | {d.direction} |") + + # Close table when next category starts + next_idx = cmp.deltas.index(d) + 1 + if next_idx < len(cmp.deltas) and cmp.deltas[next_idx].category != d.category: + w("") + + w("") + + # ── Footer ───────────────────────────────────────────────── + w("---") + w(f"*Report generated by aidlc-reporting v0.1.0*") + + return "\n".join(lines) + "\n" + + +def write_markdown(data: ReportData, output_path: Path) -> None: + md = render_markdown(data) + with open(output_path, "w", encoding="utf-8") as f: + f.write(md) diff --git a/scripts/aidlc-evaluator/packages/reporting/tests/test_baseline.py b/scripts/aidlc-evaluator/packages/reporting/tests/test_baseline.py new file mode 100644 index 00000000..3ed6d77a --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/tests/test_baseline.py @@ -0,0 +1,253 @@ +"""Tests for reporting.baseline — promote, compare, and golden.yaml roundtrip.""" + +from pathlib import Path + +import yaml + +from reporting.baseline import ( + BaselineMetrics, + ComparisonResult, + compare, + extract_baseline, + load_baseline, + promote, + write_baseline, +) +from reporting.collector import ( + Artifacts, + ContractResults, + QualitativeResults, + QualityReport, + ReportData, + RunMeta, + RunMetrics, + PhaseScore, + TokenUsage, +) + + +def _make_report_data() -> ReportData: + from reporting.collector import TestResults as TR + return ReportData( + meta=RunMeta( + run_folder="runs/test-run-001", + executor_model="claude-opus", + simulator_model="claude-sonnet", + total_handoffs=3, + ), + metrics=RunMetrics( + total_tokens=TokenUsage(1000000, 50000, 1050000), + wall_clock_ms=600000, + artifacts=Artifacts( + source_files=10, test_files=5, total_files=20, + total_lines_of_code=2000, total_doc_files=12, + ), + ), + tests=TR( + status="completed", install_ok=True, test_ok=True, + 
passed=100, failed=2, total=102, coverage_pct=88.5, + ), + contracts=ContractResults(total=50, passed=48, failed=2), + quality=QualityReport(lint_errors=3, lint_warnings=7, lint_total=10), + qualitative=QualitativeResults( + overall_score=0.85, + phases=[ + PhaseScore("inception", avg_overall=0.88), + PhaseScore("construction", avg_overall=0.82), + ], + ), + ) + + +class TestExtractBaseline: + def test_extracts_all_fields(self): + data = _make_report_data() + b = extract_baseline(data) + assert b.run_folder == "runs/test-run-001" + assert b.tests_passed == 100 + assert b.tests_failed == 2 + assert b.contract_passed == 48 + assert b.lint_errors == 3 + assert b.qualitative_score == 0.85 + assert b.inception_score == 0.88 + assert b.construction_score == 0.82 + assert b.lines_of_code == 2000 + assert b.total_tokens == 1050000 + + def test_handles_missing_sections(self): + data = ReportData(meta=RunMeta(run_folder="runs/empty")) + b = extract_baseline(data) + assert b.tests_passed == 0 + assert b.contract_passed == 0 + assert b.qualitative_score == 0.0 + + +class TestWriteAndLoad: + def test_roundtrip(self, tmp_path): + b = BaselineMetrics( + run_folder="runs/golden-run", + promoted_at="2026-02-18T12:00:00+00:00", + executor_model="claude-opus", + tests_passed=192, tests_total=192, + contract_passed=88, contract_total=88, + lint_errors=5, lint_warnings=13, lint_total=18, + qualitative_score=0.891, + inception_score=0.89, + construction_score=0.892, + lines_of_code=3522, + total_tokens=9835935, + ) + path = tmp_path / "golden.yaml" + write_baseline(b, path) + + loaded = load_baseline(path) + assert loaded.run_folder == "runs/golden-run" + assert loaded.tests_passed == 192 + assert loaded.contract_passed == 88 + assert loaded.lint_errors == 5 + assert loaded.qualitative_score == 0.891 + assert loaded.inception_score == 0.89 + assert loaded.lines_of_code == 3522 + assert loaded.total_tokens == 9835935 + + def test_yaml_is_readable(self, tmp_path): + b = 
BaselineMetrics(run_folder="runs/test", tests_passed=10, tests_total=10) + path = tmp_path / "golden.yaml" + write_baseline(b, path) + + with open(path) as f: + raw = yaml.safe_load(f) + assert raw["unit_tests"]["passed"] == 10 + assert "qualitative" in raw + + +class TestCompare: + def test_identical_runs(self): + a = BaselineMetrics( + tests_passed=100, tests_total=100, + contract_passed=50, contract_total=50, + lint_errors=0, qualitative_score=0.9, + ) + result = compare(a, a) + assert result.improved == 0 + assert result.regressed == 0 + assert result.unchanged == 31 # 29 + 2 new token metrics (repeated_context, api_total) + + def test_improved_tests(self): + golden = BaselineMetrics(tests_passed=90, tests_total=100, tests_pass_pct=90.0) + current = BaselineMetrics(tests_passed=95, tests_total=100, tests_pass_pct=95.0) + result = compare(current, golden) + improved = [d for d in result.deltas if d.name == "Tests Pass %"] + assert len(improved) == 1 + assert improved[0].direction == "improved" + assert improved[0].delta == 5.0 + + def test_regressed_quality(self): + golden = BaselineMetrics(qualitative_score=0.9) + current = BaselineMetrics(qualitative_score=0.7) + result = compare(current, golden) + qual = [d for d in result.deltas if d.name == "Qualitative Score"] + assert len(qual) == 1 + assert qual[0].direction == "regressed" + assert result.regressed >= 1 + + def test_fewer_lint_errors_is_improvement(self): + golden = BaselineMetrics(lint_errors=10) + current = BaselineMetrics(lint_errors=3) + result = compare(current, golden) + lint = [d for d in result.deltas if d.name == "Lint Errors"] + assert lint[0].direction == "improved" + + def test_more_lint_errors_is_regression(self): + golden = BaselineMetrics(lint_errors=3) + current = BaselineMetrics(lint_errors=10) + result = compare(current, golden) + lint = [d for d in result.deltas if d.name == "Lint Errors"] + assert lint[0].direction == "regressed" + + def test_fewer_tokens_is_improvement(self): + 
golden = BaselineMetrics(total_tokens=10000000) + current = BaselineMetrics(total_tokens=8000000) + result = compare(current, golden) + tok = [d for d in result.deltas if d.name == "Total Tokens"] + assert tok[0].direction == "improved" + + def test_mixed_results(self): + golden = BaselineMetrics( + tests_passed=100, tests_total=100, + lint_errors=5, qualitative_score=0.85, + ) + current = BaselineMetrics( + tests_passed=105, tests_total=105, + lint_errors=10, qualitative_score=0.90, + ) + result = compare(current, golden) + assert result.improved > 0 + assert result.regressed > 0 + + +class TestPromote: + def test_promote_creates_file(self, tmp_path): + run = tmp_path / "run-001" + run.mkdir() + (run / "run-meta.yaml").write_text(yaml.safe_dump({ + "run_folder": str(run), "status": "COMPLETED", + "config": {"executor_model": "opus"}, + })) + + golden_path = tmp_path / "golden.yaml" + baseline = promote(run, golden_path) + assert golden_path.exists() + assert baseline.executor_model == "opus" + + loaded = load_baseline(golden_path) + assert loaded.executor_model == "opus" + + +class TestReportIntegration: + def test_markdown_includes_comparison(self): + from reporting.render_md import render_markdown + data = _make_report_data() + golden = BaselineMetrics( + tests_passed=90, tests_total=100, + lint_errors=5, qualitative_score=0.80, + ) + current = extract_baseline(data) + data.comparison = compare(current, golden) + + md = render_markdown(data) + assert "Baseline Comparison" in md + assert "Improved" in md + assert "Regressed" in md + + def test_html_includes_comparison(self): + from reporting.render_html import render_html + data = _make_report_data() + golden = BaselineMetrics( + tests_passed=90, tests_total=100, + lint_errors=5, qualitative_score=0.80, + ) + current = extract_baseline(data) + data.comparison = compare(current, golden) + + html = render_html(data) + assert "Baseline Comparison" in html + assert "delta-improved" in html + assert "delta-regressed" 
in html + + def test_no_comparison_when_absent(self): + from reporting.render_md import render_markdown + data = _make_report_data() + md = render_markdown(data) + assert "Baseline Comparison" not in md + + +class TestRealBaseline: + def test_load_real_golden(self): + path = Path(__file__).resolve().parents[3] / "test_cases" / "sci-calc" / "golden.yaml" + if not path.exists(): + return + b = load_baseline(path) + assert b.tests_passed == 180 + assert b.contract_passed == 88 + assert b.qualitative_score == 0.8544 diff --git a/scripts/aidlc-evaluator/packages/reporting/tests/test_collector.py b/scripts/aidlc-evaluator/packages/reporting/tests/test_collector.py new file mode 100644 index 00000000..7f060895 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/tests/test_collector.py @@ -0,0 +1,169 @@ +"""Tests for reporting.collector — data model and YAML parsing.""" + +from pathlib import Path + +import yaml + +from reporting.collector import collect + + +def _write(path: Path, data: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + yaml.safe_dump(data, f, default_flow_style=False) + + +def _minimal_run(tmp_path: Path) -> Path: + """Create a minimal run folder with all YAML artifacts.""" + run = tmp_path / "run-001" + run.mkdir() + + _write(run / "run-meta.yaml", { + "run_folder": str(run), + "started_at": "2026-02-18T12:00:00Z", + "completed_at": "2026-02-18T13:00:00Z", + "status": "Status.COMPLETED", + "execution_time_ms": 3600000, + "total_handoffs": 3, + "node_history": ["executor", "simulator", "executor"], + "config": { + "executor_model": "claude-opus", + "simulator_model": "claude-sonnet", + "aws_region": "us-west-2", + }, + }) + + _write(run / "run-metrics.yaml", { + "tokens": { + "total": {"input_tokens": 1000000, "output_tokens": 50000, "total_tokens": 1050000}, + "per_agent": { + "executor": {"input_tokens": 800000, "output_tokens": 40000, "total_tokens": 840000}, + "simulator": {"input_tokens": 
200000, "output_tokens": 10000, "total_tokens": 210000}, + }, + }, + "timing": { + "total_wall_clock_ms": 3600000, + "handoffs": [ + {"handoff": 1, "node_id": "executor", "duration_ms": 2000000}, + {"handoff": 2, "node_id": "simulator", "duration_ms": 600000}, + {"handoff": 3, "node_id": "executor", "duration_ms": 1000000}, + ], + }, + "artifacts": { + "workspace": {"source_files": 10, "test_files": 5, "config_files": 2, "total_files": 17, "total_lines_of_code": 1500}, + "aidlc_docs": {"inception_files": 8, "construction_files": 5, "total_files": 13}, + }, + "errors": {"throttle_events": 0, "timeout_events": 0}, + }) + + _write(run / "test-results.yaml", { + "status": "completed", + "install": {"success": True}, + "test": { + "success": True, + "output": "Total coverage: 91.3%\n192 passed in 0.87s", + "parsed_results": {"passed": 192, "failed": 0, "errors": 0, "total": 192}, + }, + }) + + _write(run / "quality-report.yaml", { + "project_type": "python", + "lint": { + "tool": "ruff", "version": "0.15.1", "available": True, + "findings": [ + {"file": "app.py", "line": 3, "code": "I001", "message": "Unsorted imports", "severity": "warning"}, + {"file": "routes.py", "line": 65, "code": "E501", "message": "Line too long", "severity": "error"}, + ], + }, + "security": {"tool": "bandit", "available": False}, + "summary": {"lint_total": 2, "lint_errors": 1, "lint_warnings": 1}, + }) + + _write(run / "contract-test-results.yaml", { + "total": 10, "passed": 9, "failed": 1, "errors": 0, "server_started": True, + "cases": [ + {"name": "health", "path": "/health", "method": "GET", "passed": True, "expected_status": 200, "actual_status": 200, "latency_ms": 5.2}, + {"name": "add", "path": "/api/v1/arithmetic/add", "method": "POST", "passed": False, "expected_status": 200, "actual_status": 500, "failures": ["status mismatch"]}, + ], + }) + + _write(run / "qualitative-comparison.yaml", { + "overall_score": 0.89, + "phases": [{ + "phase": "inception", + "avg_intent": 0.95, + 
"avg_design": 0.9, + "avg_completeness": 0.85, + "avg_overall": 0.9, + "documents": [{ + "path": "inception/component-dependency.md", + "intent_similarity": 0.95, + "design_similarity": 0.9, + "completeness": 0.85, + "overall": 0.9, + "notes": "Good alignment overall.", + }], + }], + }) + + return run + + +def test_collect_all_artifacts(tmp_path): + run = _minimal_run(tmp_path) + data = collect(run) + + assert data.meta.status == "Status.COMPLETED" + assert data.meta.executor_model == "claude-opus" + assert data.meta.total_handoffs == 3 + + assert data.metrics.total_tokens.total_tokens == 1050000 + assert data.metrics.wall_clock_ms == 3600000 + assert len(data.metrics.handoffs) == 3 + assert data.metrics.artifacts.source_files == 10 + + assert data.tests is not None + assert data.tests.passed == 192 + assert data.tests.test_ok is True + assert data.tests.coverage_pct == 91.3 + + assert data.quality is not None + assert data.quality.lint_total == 2 + assert data.quality.lint_errors == 1 + + assert data.contracts is not None + assert data.contracts.passed == 9 + assert data.contracts.failed == 1 + + assert data.qualitative is not None + assert data.qualitative.overall_score == 0.89 + assert len(data.qualitative.phases) == 1 + assert data.qualitative.phases[0].documents[0].intent == 0.95 + + +def test_collect_missing_artifacts(tmp_path): + run = tmp_path / "empty-run" + run.mkdir() + data = collect(run) + + assert data.meta.status == "" + assert data.tests is None + assert data.quality is None + assert data.contracts is None + assert data.qualitative is None + + +def test_collect_real_run(): + """Test against the real golden run if it exists.""" + real_run = Path(__file__).resolve().parents[3] / "runs" / "20260218T125810-b84d042dff254a72b4ffec926fe5ea99" + if not real_run.exists(): + return + + data = collect(real_run) + assert data.meta.total_handoffs == 3 + assert data.tests is not None + assert data.tests.passed == 192 + assert data.contracts is not None + assert 
data.contracts.passed == 88 + assert data.qualitative is not None + assert data.qualitative.overall_score > 0.8 diff --git a/scripts/aidlc-evaluator/packages/reporting/tests/test_render.py b/scripts/aidlc-evaluator/packages/reporting/tests/test_render.py new file mode 100644 index 00000000..e08c1413 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/reporting/tests/test_render.py @@ -0,0 +1,171 @@ +"""Tests for both Markdown and HTML renderers.""" + +from pathlib import Path + +from reporting.collector import ( + Artifacts, + ContractCase, + ContractResults, + DocScore, + HandoffTiming, + PhaseScore, + QualitativeResults, + QualityReport, + LintFinding, + ReportData, + RunMeta, + RunMetrics, + TestResults, + TokenUsage, +) +from reporting.render_html import render_html +from reporting.render_md import render_markdown + + +def _sample_data() -> ReportData: + return ReportData( + meta=RunMeta( + run_folder="runs/20260218T125810-test", + started_at="2026-02-18T12:58:13Z", + completed_at="2026-02-18T13:22:44Z", + status="Status.COMPLETED", + execution_time_ms=1445460, + total_handoffs=3, + node_history=["executor", "simulator", "executor"], + executor_model="claude-opus-4-6-v1", + simulator_model="claude-sonnet-4-5", + aws_region="us-west-2", + ), + metrics=RunMetrics( + total_tokens=TokenUsage(9695968, 139967, 9835935), + executor_tokens=TokenUsage(5671179, 76651, 5747830), + simulator_tokens=TokenUsage(179972, 2412, 182384), + wall_clock_ms=1445460, + handoffs=[ + HandoffTiming(1, "executor", 975455), + HandoffTiming(2, "simulator", 67876), + HandoffTiming(3, "executor", 402145), + ], + artifacts=Artifacts(17, 18, 4, 72, 3522, 8, 5, 15), + ), + tests=TestResults( + status="completed", install_ok=True, test_ok=True, + passed=192, failed=0, errors=0, total=192, coverage_pct=91.3, + ), + quality=QualityReport( + project_type="python", lint_tool="ruff", lint_version="0.15.1", + lint_available=True, + lint_findings=[ + LintFinding("app.py", 3, "I001", "Unsorted 
imports", "warning"), + LintFinding("routes.py", 65, "E501", "Line too long", "error"), + ], + lint_total=2, lint_errors=1, lint_warnings=1, + ), + contracts=ContractResults( + total=88, passed=88, failed=0, errors=0, server_started=True, + cases=[ + ContractCase("health", "/health", "GET", True, 200, 200, latency_ms=4.5), + ContractCase("add positive", "/api/v1/arithmetic/add", "POST", True, 200, 200, latency_ms=8.1), + ], + ), + qualitative=QualitativeResults( + overall_score=0.891, + phases=[ + PhaseScore( + "inception", 0.9, 0.8875, 0.875, 0.89, + documents=[ + DocScore("inception/component-dependency.md", 1.0, 0.95, 0.9, 0.96, "Highly aligned."), + DocScore("inception/component-methods.md", 1.0, 0.95, 0.85, 0.95, "Same methods."), + ], + ), + PhaseScore( + "construction", 0.88, 0.87, 0.86, 0.87, + documents=[ + DocScore("construction/test-plan.md", 0.9, 0.85, 0.8, 0.85, "Good coverage."), + ], + ), + ], + ), + generated_at="2026-02-18T14:00:00Z", + ) + + +class TestMarkdown: + def test_contains_header(self): + md = render_markdown(_sample_data()) + assert "# AIDLC Evaluation Report" in md + + def test_contains_verdict_table(self): + md = render_markdown(_sample_data()) + assert "## Verdict" in md + assert "192/192" in md + assert "88/88" in md + + def test_contains_token_usage(self): + md = render_markdown(_sample_data()) + assert "## Token Usage" in md + assert "Executor" in md + + def test_contains_qualitative_score(self): + md = render_markdown(_sample_data()) + assert "0.891" in md + assert "Inception" in md + + def test_contains_lint_findings(self): + md = render_markdown(_sample_data()) + assert "`E501`" in md + assert "`I001`" in md + + def test_write_to_file(self, tmp_path): + from reporting.render_md import write_markdown + path = tmp_path / "report.md" + write_markdown(_sample_data(), path) + text = path.read_text(encoding="utf-8") + assert len(text) > 500 + assert "# AIDLC Evaluation Report" in text + + +class TestHTML: + def 
test_contains_doctype(self): + html = render_html(_sample_data()) + assert "" in html + + def test_contains_verdict_cards(self): + html = render_html(_sample_data()) + assert "192/192" in html + assert "88/88" in html + assert "badge-pass" in html + + def test_contains_score_ring(self): + html = render_html(_sample_data()) + assert "ring-container" in html + assert "89%" in html + + def test_contains_handoff_timeline(self): + html = render_html(_sample_data()) + assert "Handoff Timeline" in html + assert "executor" in html.lower() + + def test_contains_qualitative_bars(self): + html = render_html(_sample_data()) + assert "phase-bars" in html + assert "inception" in html.lower() + + def test_contains_lint_findings(self): + html = render_html(_sample_data()) + assert "E501" in html + assert "I001" in html + + def test_self_contained(self): + """HTML report should be self-contained (inline CSS, no external sheets).""" + html = render_html(_sample_data()) + assert " + + + +
+

AIDLC Rules Trend Report

+
6 releases (v0.1.0 through v0.1.5) · awslabs/aidlc-workflows · 2026-03-20T16:04:28.635438+00:00
+
+ + + +

A. Executive Summary

+
+
Qualitative Score
0.898
Golden: 0.854
+
Contract Tests
88/88
100.0% pass rate
+
Unit Tests
100.0%
175/175 passed
+
Lint Findings
0
Golden: 0
+
Execution Time
17.9m
Golden: 23.8m
+
Total Tokens
13.66M
Golden: 18.39M
+
+

High-level snapshot comparing the latest release against the golden baseline (the reference evaluation used as the quality target).

+ + + + + + + + + + + + + + + +
MetricWhat it measures
Unit test pass ratePercentage of generated unit tests that pass. Higher means more reliable code generation.
Contract testsAPI compliance checks against the OpenAPI spec (passed/total). 88/88 = full compliance.
Lint findingsStatic analysis warnings in generated code. Lower is better — 0 means clean code.
Qualitative scoreAI-graded documentation quality on a 0–1 scale (higher is better).
Execution timeWall-clock time for the full evaluation run. Lower means faster generation.
Total tokensTotal LLM tokens consumed (input + output). Lower means more cost-efficient.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MetricGoldenLatest (v0.1.5)vs Golden
Unit test pass rate100.0% (180/180)100.0% (175/175)=
Contract tests88/8888/88=
Lint findings00=
Qualitative score0.8540.898+0.044
Execution time23.8m17.9m-5.9m
Total tokens18.39M13.66M-4.74M
+ +

B. Functional Correctness

+

Measures whether the code generated by each rules version actually works correctly. This is the most fundamental quality gate — code that doesn’t pass its own tests is broken.

+

B.1 Unit Tests

+
+
+

Unit tests validate individual functions and components in isolation. The AIDLC rules instruct the AI to generate both source code and test suites.

+

Pass/Total = tests that passed out of total generated. Rate = pass percentage (100% = all tests passing). Failures = tests that ran but produced wrong results.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionPass/TotalRateFailures
v0.1.0250/250100.0%0
v0.1.1194/194100.0%0
v0.1.2180/180100.0%0
v0.1.3126/126100.0%0
v0.1.4156/156100.0%0
v0.1.5175/175100.0%0
+
+
+

B.2 Contract Tests (API Compliance)

+
+
+

Contract tests verify that the generated API implementation matches its OpenAPI specification. Each test sends a request to an endpoint and checks that the HTTP status code and response shape match the spec.

+

88 endpoints are tested per version. Pass/Total = endpoints that returned the expected status code. Rate = pass percentage (100% = full spec compliance).

+

Failures lists the specific endpoints that deviated from the spec.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionPass/TotalRateFailures
v0.1.088/88100.0%0
v0.1.188/88100.0%0
v0.1.288/88100.0%0
v0.1.385/8896.6%3
v0.1.488/88100.0%0
v0.1.588/88100.0%0
+
+
+
v0.1.3 failures:
    +
  • POST /api/v1/arithmetic/add — expected 422, got 200 (add missing field → 422)
  • +
  • POST /api/v1/arithmetic/divide — expected 400, got 200 (divide by zero → error)
  • +
  • POST /api/v1/arithmetic/modulo — expected 400, got 200 (modulo by zero → error)
  • +
+ +

C. Qualitative Evaluation

+

Measures the quality of generated documentation by comparing it against human-authored reference documents. An AI evaluator scores each document on completeness, accuracy, and clarity, producing a 0–1 score (1.0 = perfect match to reference quality).

+

C.1 Overall Score

+
+
+

The weighted average across all evaluated documents. This is the single best indicator of how well the rules produce documentation.

+

Scores above 0.90 are considered strong; below 0.70 signals significant gaps.

+

Golden baseline: 0.854

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionOverallvs Golden
v0.1.00.860+0.006
v0.1.10.888+0.033
v0.1.20.893+0.038
v0.1.30.866+0.012
v0.1.40.891+0.037
v0.1.50.898+0.044
+
+
+

C.2 Phase Breakdown

+
+
+

Documents are grouped by SDLC phase. Inception covers early-stage design artifacts (requirements, architecture plans, component designs) — these are generated first and set the foundation.

+

Construction covers build-time artifacts (build instructions, test instructions, build-and-test summaries) — these depend on inception outputs being correct.

+

A drop in inception quality often cascades into construction.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionInceptionConstruction
v0.1.00.8800.840
v0.1.10.8940.882
v0.1.20.9210.864
v0.1.30.8860.846
v0.1.40.8900.892
v0.1.50.8790.918
+
+
+

C.3 Per-Document Heatmap

+

Individual quality scores for each generated document across all versions. This reveals which specific documents are consistently strong, improving, or problematic. Documents scoring below 0.70 (red) are the top candidates for rules improvements.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Documentv0.1.0v0.1.1v0.1.2v0.1.3v0.1.4v0.1.5
application-design-plan.md—0.961.000.961.000.95
build-and-test-summary.md0.930.950.900.900.970.95
build-instructions.md0.750.750.780.880.770.87
component-dependency.md0.970.950.960.961.000.95
component-methods.md0.930.900.960.980.930.96
components.md1.000.981.000.970.980.98
execution-plan.md0.970.910.980.930.970.97
integration-test-instructions.md0.850.870.820.700.880.91
requirement-verification-questions.md0.380.540.540.380.360.28
requirements.md1.001.000.971.000.970.97
sci-calc-code-generation-plan.md0.970.980.920.980.980.98
services.md0.910.910.960.910.910.97
unit-test-instructions.md0.700.860.900.770.860.88
+

green ≥ 0.90 yellow 0.70–0.89 red < 0.70

+

C.4 Document Coverage

+

Tracks whether the generated output includes the same set of documents as the reference. Unmatched Ref = reference documents the AI failed to generate (missing output). Unmatched Candidate = extra documents the AI generated that don’t exist in the reference (unexpected output). Ideally both columns are 0, meaning the AI produced exactly the expected set of documents.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionUnmatched RefUnmatched Candidate
v0.1.011
v0.1.104
v0.1.201
v0.1.306
v0.1.406
v0.1.500
+ +

D. Efficiency & Cost Metrics

+

Tracks the computational resources consumed by each evaluation run. These metrics directly affect cost (tokens) and developer wait time (execution time). Lower values are generally better, as long as quality metrics remain stable.

+

D.1 Token Consumption

+
+
+

Total LLM tokens consumed during the run, broken down by agent. Total = all tokens across all agents (input + output).

+

Executor = the agent that generates code and documents. Simulator = the agent that simulates user interactions for testing.

+

Token count is the primary cost driver — each token represents a unit of LLM usage billed by the provider.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionTotalExecutorSimulator
v0.1.09.26M4.65M119.3K
v0.1.113.34M6.56M266.2K
v0.1.28.34M4.15M295.5K
v0.1.311.52M5.72M222.3K
v0.1.411.52M5.67M251.9K
v0.1.513.66M6.88M90.2K
+
+
+

D.2 Execution Time

+
+
+

Wall-clock duration of the full evaluation pipeline, broken down by handoff. Each handoff (H1, H2, H3) represents a sequential phase.

+

H1 is typically code generation (the longest phase), H2 is build/test execution, and H3 is result collection and reporting.

+

Wall Clock is the total end-to-end time.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionWall ClockHandoff Breakdown
v0.1.016.0mH1: 13.8m · H2: 0.9m · H3: 1.3m
v0.1.118.6mH1: 17.1m · H2: 1.0m · H3: 0.5m
v0.1.215.5mH1: 11.7m · H2: 1.4m · H3: 2.4m
v0.1.318.8mH1: 15.8m · H2: 1.3m · H3: 1.7m
v0.1.416.8mH1: 14.8m · H2: 1.3m · H3: 0.6m
v0.1.517.9mH1: 15.0m · H2: 0.7m · H3: 2.2m
+
+
+

D.3 Context Window Pressure

+
+
+

Measures how much of the LLM’s context window is being used across API calls. Max = the largest single context seen during the run (approaching the model’s limit risks truncation or degraded output).

+

Avg = the mean context size across all API calls. Median = the midpoint context size (less affected by outliers than avg).

+

High context pressure can indicate overly verbose prompts or accumulated conversation history.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionMaxAvgMedian
v0.1.097.4K44.9K43.7K
v0.1.1138.5K57.2K50.4K
v0.1.296.4K38.8K26.6K
v0.1.3118.6K49.7K42.4K
v0.1.4109.8K48.8K48.5K
v0.1.5121.7K56.6K55.2K
+
+
+ +

E. Code Quality

+

Static analysis of the generated codebase. These metrics reflect the cleanliness and maintainability of the AI-generated code, independent of whether it passes tests.

+ + + + + + + + + + + + + +
MetricWhat it measures
Lint FindingsWarnings from static analysis (style violations, unused variables, etc.). 0 = clean.
Security FindingsVulnerabilities detected by security scanners (SQL injection, XSS, etc.). N/A if no scanner was configured.
Source FilesNumber of non-test source files in the generated project.
LOCTotal lines of code across all source files. Large swings may indicate generated boilerplate or missing modules.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionLint FindingsSecurity FindingsSource FilesLOC
v0.1.00N/A977398.5K
v0.1.10N/A977398.1K
v0.1.20N/A976397.4K
v0.1.30N/A977397.7K
v0.1.40N/A976397.5K
v0.1.50N/A976397.6K
+ +

F. Stability & Reliability

+

Tracks whether the evaluation pipeline itself ran smoothly, independent of output quality.

+ + + + + + + + + + + + +
MetricWhat it measures
Error EventsRuntime errors logged during the run (exceptions, timeouts, API failures). 0 = clean run.
HandoffsNumber of sequential pipeline phases completed. Typically 3 (generate, build/test, report). A different count may indicate an early abort or retry.
Server StartupWhether the generated application server started successfully. A failure here means the generated code couldn’t even boot, preventing contract tests from running.
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionError EventsHandoffsServer Startup
v0.1.003PASS
v0.1.103PASS
v0.1.203PASS
v0.1.303PASS
v0.1.403PASS
v0.1.503PASS
+ +

G. Version-over-Version Deltas

+

Each row shows the change from one release to the next, making it easy to spot which specific version introduced an improvement or regression. Positive values (+) indicate an increase; negative (−) indicate a decrease. For Unit Tests and Contract, positive is better (more tests passing). For Qualitative, positive is better (higher quality score). For Tokens and Time, negative is better (more efficient).

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TransitionUnit TestsContractQualitativeTokensTime
v0.1.0 → v0.1.1-56+0+0.028+4.08M+155s
v0.1.1 → v0.1.2-14+0+0.005-5.00M-188s
v0.1.2 → v0.1.3-54-3-0.026+3.19M+200s
v0.1.3 → v0.1.4+30+3+0.025-9.2K-122s
v0.1.4 → v0.1.5+19+0+0.007+2.14M+71s
+ +

H. Pre-Release Data Points

+

Evaluation results from non-release sources — the main branch and open pull requests. These represent in-progress work that hasn’t been tagged as a release yet. Use this data to preview whether upcoming changes will improve or regress metrics before they ship.

+

No pre-release data available.

+ + + \ No newline at end of file diff --git a/scripts/aidlc-evaluator/packages/trend-reports/pyproject.toml b/scripts/aidlc-evaluator/packages/trend-reports/pyproject.toml new file mode 100644 index 00000000..0bbf8144 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/pyproject.toml @@ -0,0 +1,18 @@ +[project] +name = "aidlc-trend-reports" +version = "0.1.0" +description = "Trend reporting across AIDLC rules evaluation releases" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", +] + +[project.scripts] +trend-report = "trend_reports.__main__:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/trend_reports"] diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py new file mode 100644 index 00000000..b6316609 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__init__.py @@ -0,0 +1,50 @@ +"""AIDLC rules evaluation trend reporting tool. 
+ +Usage: + from trend_reports import collect_trend_data, render_trend_markdown + trend = collect_trend_data(bundle_paths, baseline_path, repo, work_dir) + markdown = render_trend_markdown(trend) + +CLI: + python -m trend_reports trend --baseline golden.yaml --format all +""" + +from trend_reports.collector import ( + collect_from_directory, + collect_trend_data, + compute_deltas, + sort_runs, +) +from trend_reports.gate import check_regressions +from trend_reports.models import ( + BaselineMetrics, + GateResult, + RunData, + RunType, + SemVer, + TrendData, + TrendReportError, + VersionDelta, +) +from trend_reports.render_html import render_trend_html +from trend_reports.render_md import render_trend_markdown +from trend_reports.render_yaml import render_trend_yaml + +__all__ = [ + "BaselineMetrics", + "GateResult", + "RunData", + "RunType", + "SemVer", + "TrendData", + "TrendReportError", + "VersionDelta", + "check_regressions", + "collect_from_directory", + "collect_trend_data", + "compute_deltas", + "render_trend_html", + "render_trend_markdown", + "render_trend_yaml", + "sort_runs", +] diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py new file mode 100644 index 00000000..257bbf81 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/__main__.py @@ -0,0 +1,247 @@ +"""CLI entry point: python -m trend_reports trend ...""" + +from __future__ import annotations + +import argparse +import logging +import sys +import tempfile +from pathlib import Path + +from .models import TrendReportError + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="trend-report", + description="AIDLC rules evaluation trend reporting tool", + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Increase verbosity (-v for INFO, -vv for DEBUG)", + ) + + sub = parser.add_subparsers(dest="command") 
+ + trend_parser = sub.add_parser("trend", help="Generate trend report across releases") + trend_parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Increase verbosity (-v for INFO, -vv for DEBUG)", + ) + trend_parser.add_argument( + "--baseline", + required=True, + help="Path to golden.yaml baseline file", + ) + trend_parser.add_argument( + "--format", + choices=["md", "html", "yaml", "both", "all"], + default="all", + help="Output format (default: all = md + html + yaml)", + ) + trend_parser.add_argument( + "--output-dir", + default=None, + help="Directory for output artifacts (default: ./runs)", + ) + trend_parser.add_argument( + "--repo", + default="awslabs/aidlc-workflows", + help="GitHub repository (default: awslabs/aidlc-workflows)", + ) + trend_parser.add_argument( + "--cache-prefix", + default="report-", + help="Cache key prefix for pre-release bundles (default: report-)", + ) + trend_parser.add_argument( + "--gate", + action="store_true", + help="Exit non-zero if regressions detected", + ) + trend_parser.add_argument( + "--tags", + nargs="*", + help="Specific release tags to include (default: all)", + ) + trend_parser.add_argument( + "--local-bundle", + nargs="*", + dest="local_bundles", + help="Local zip bundle path(s) to include as additional data points", + ) + trend_parser.add_argument( + "--local-run-dir", + nargs="*", + dest="local_run_dirs", + help="Local run directory path(s) to include as additional data points", + ) + + args = parser.parse_args() + + # Logging + level = logging.WARNING + if args.verbose >= 2: + level = logging.DEBUG + elif args.verbose >= 1: + level = logging.INFO + logging.basicConfig( + level=level, + format="%(levelname)s: %(message)s", + ) + + if args.command != "trend": + parser.print_help() + sys.exit(0) + + try: + exit_code = cmd_trend( + baseline=args.baseline, + fmt=args.format, + output_dir=args.output_dir or str(Path.cwd() / "runs"), + repo=args.repo, + cache_prefix=args.cache_prefix, + 
gate=args.gate, + tags=args.tags, + local_bundles=args.local_bundles, + local_run_dirs=args.local_run_dirs, + ) + sys.exit(exit_code) + except TrendReportError as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(2) + except KeyboardInterrupt: + sys.exit(130) + + +def cmd_trend( + baseline: str, + fmt: str, + output_dir: str, + repo: str, + cache_prefix: str, + gate: bool, + tags: list[str] | None, + local_bundles: list[str] | None = None, + local_run_dirs: list[str] | None = None, +) -> int: + """Main orchestration. Returns 0 on success, 1 on gate failure.""" + from .collector import collect_trend_data + from .fetcher import check_gh_available, fetch_prerelease_bundles, fetch_release_bundles + from .gate import check_regressions + from .render_html import render_trend_html + from .render_md import render_trend_markdown + from .render_yaml import render_trend_yaml + + logger = logging.getLogger(__name__) + + # 1. Validate prerequisites + check_gh_available() + + out = Path(output_dir) + out.mkdir(parents=True, exist_ok=True) + + # 2. Fetch data + with tempfile.TemporaryDirectory(prefix="trend-report-") as tmpdir: + work_dir = Path(tmpdir) + + # 2a. Release bundles (required) + logger.info("Fetching release bundles from %s …", repo) + bundle_paths = fetch_release_bundles(repo, tags, work_dir) + logger.info("Fetched %d release bundle(s)", len(bundle_paths)) + + # 2b. Local bundles (from --local-bundle flag) + if local_bundles: + for bundle_str in local_bundles: + bundle_path = Path(bundle_str) + if not bundle_path.exists(): + raise TrendReportError(f"Local bundle not found: {bundle_path}") + bundle_paths.append(bundle_path) + logger.info("Added local bundle: %s", bundle_path) + + # 2b-2. 
Local run directories (from --local-run-dir flag) + if local_run_dirs: + for dir_str in local_run_dirs: + dir_path = Path(dir_str) + if not dir_path.is_dir(): + raise TrendReportError(f"Local run directory not found: {dir_path}") + bundle_paths.append(dir_path) + logger.info("Added local run directory: %s", dir_path) + + # 2c. Remote pre-release bundles (from GitHub Actions Artifacts) + logger.info("Fetching pre-release bundles …") + prerelease_paths = fetch_prerelease_bundles(repo, cache_prefix, work_dir) + if prerelease_paths: + logger.info("Fetched %d pre-release bundle(s)", len(prerelease_paths)) + bundle_paths.extend(prerelease_paths) + else: + logger.info("No pre-release bundles found — continuing with releases only") + + # 3. Collect and assemble + logger.info("Parsing bundles …") + trend = collect_trend_data(bundle_paths, Path(baseline), repo, work_dir) + logger.info("Assembled trend data for %d runs", len(trend.runs)) + + # 4. Render into a timestamped subdirectory + from datetime import datetime, timezone + + timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") + run_dir = out / timestamp + run_dir.mkdir(parents=True, exist_ok=True) + + formats = _resolve_formats(fmt) + + if "md" in formats: + md_path = run_dir / "trend-report.md" + md_path.write_text(render_trend_markdown(trend), encoding="utf-8") + print(f"Wrote {md_path}") + + if "html" in formats: + html_path = run_dir / "trend-report.html" + html_path.write_text(render_trend_html(trend), encoding="utf-8") + print(f"Wrote {html_path}") + + if "yaml" in formats: + yaml_path = run_dir / "trend-data.yaml" + yaml_path.write_text(render_trend_yaml(trend), encoding="utf-8") + print(f"Wrote {yaml_path}") + + print(f"Output directory: {run_dir}") + + # 5. Gate + if gate: + result = check_regressions(trend) + if result.passed: + print( + f"Gate PASSED: {result.latest_label} vs {result.comparison_label} " + f"— no regressions detected." 
+ ) + return 0 + else: + print( + f"Gate FAILED: {result.latest_label} vs {result.comparison_label}", + file=sys.stderr, + ) + for reg in result.regressions: + print(f" - {reg}", file=sys.stderr) + return 1 + + return 0 + + +def _resolve_formats(fmt: str) -> set[str]: + if fmt == "both": + return {"md", "html"} + if fmt == "all": + return {"md", "html", "yaml"} + return {fmt} + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py new file mode 100644 index 00000000..2cfce987 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/collector.py @@ -0,0 +1,500 @@ +"""Zip/directory extraction, YAML parsing, run classification, and trend assembly.""" + +from __future__ import annotations + +import logging +import zipfile +from datetime import datetime, timezone +from pathlib import Path + +import yaml + +from .models import ( + AgentTokens, + BaselineMetrics, + CodeQualityMetrics, + CollectorError, + ContractTestFailure, + ContractTestResults, + DocumentScore, + HandoffMetrics, + QualitativeComparison, + RunConfig, + RunData, + RunMeta, + RunMetrics, + RunType, + SemVer, + TrendData, + UnitTestResults, + VersionDelta, +) + +logger = logging.getLogger(__name__) + +# The YAML files we expect inside every report bundle (zip or directory). 
+REQUIRED_YAML = { + "run-meta": "run-meta.yaml", + "run-metrics": "run-metrics.yaml", + "test-results": "test-results.yaml", + "contract-test-results": "contract-test-results.yaml", + "quality-report": "quality-report.yaml", + "qualitative-comparison": "qualitative-comparison.yaml", +} + + +# --------------------------------------------------------------------------- +# Zip handling +# --------------------------------------------------------------------------- + + +def extract_zip(zip_path: Path, dest_dir: Path) -> Path: + """Extract a report zip and return the directory containing the YAML files. + + The zips are flat (files at root level), so we extract into a + subdirectory named after the zip stem. + """ + try: + with zipfile.ZipFile(zip_path, "r") as zf: + subdir = dest_dir / zip_path.stem + subdir.mkdir(parents=True, exist_ok=True) + zf.extractall(subdir) + except zipfile.BadZipFile as exc: + raise CollectorError(f"Corrupt zip: {zip_path}") from exc + + return subdir + + +def find_yaml_files(run_dir: Path) -> dict[str, Path]: + """Locate the expected YAML files inside *run_dir*. + + Returns a dict keyed by short name (e.g. ``"run-meta"``) with + :class:`Path` values. Logs a warning for any missing file. 
+ """ + found: dict[str, Path] = {} + for key, filename in REQUIRED_YAML.items(): + path = run_dir / filename + if path.exists(): + found[key] = path + else: + logger.warning("Missing %s in %s", filename, run_dir) + return found + + +# --------------------------------------------------------------------------- +# YAML parsers — one per file type +# --------------------------------------------------------------------------- + + +def _load_yaml(path: Path) -> dict: + with open(path) as fh: + data = yaml.safe_load(fh) + if not isinstance(data, dict): + raise CollectorError(f"Expected YAML dict in {path}, got {type(data).__name__}") + return data + + +def parse_run_meta(yaml_path: Path) -> RunMeta: + raw = _load_yaml(yaml_path) + cfg = raw.get("config", {}) + return RunMeta( + run_id=raw.get("run_folder", ""), + config=RunConfig( + rules_ref=cfg.get("rules_ref", ""), + model=cfg.get("executor_model", ""), + target_project=raw.get("vision_file", "").split("/")[1] + if "/" in raw.get("vision_file", "") + else "", + ), + start_time=str(raw.get("started_at", "")), + end_time=str(raw.get("completed_at", "")), + status=str(raw.get("status", "")), + ) + + +def parse_run_metrics(yaml_path: Path) -> RunMetrics: + raw = _load_yaml(yaml_path) + + tokens = raw.get("tokens", {}) + total = tokens.get("total", {}) + per_agent = tokens.get("per_agent", {}) + + agent_tokens: list[AgentTokens] = [] + for name, vals in per_agent.items(): + agent_tokens.append( + AgentTokens( + agent_name=name, + input_tokens=vals.get("input_tokens", 0), + output_tokens=vals.get("output_tokens", 0), + total_tokens=vals.get("total_tokens", 0), + cache_read_tokens=vals.get("cache_read_tokens", 0), + cache_write_tokens=vals.get("cache_write_tokens", 0), + ) + ) + + timing = raw.get("timing", {}) + handoff_list = timing.get("handoffs", []) + handoffs: list[HandoffMetrics] = [] + for h in handoff_list: + handoffs.append( + HandoffMetrics( + handoff_number=h.get("handoff", 0), + agent=h.get("node_id", ""), + 
duration_seconds=h.get("duration_ms", 0) / 1000.0, + tokens=0, + ) + ) + + hp = raw.get("handoff_patterns", {}) + errors = raw.get("errors", {}) + error_count = sum( + [ + errors.get("throttle_events", 0), + errors.get("timeout_events", 0), + errors.get("failed_tool_calls", 0), + errors.get("model_error_events", 0), + errors.get("service_unavailable_events", 0), + errors.get("validation_error_events", 0), + ] + ) + + ctx = raw.get("context_size", {}).get("total", {}) + + return RunMetrics( + total_tokens=total.get("total_tokens", 0), + total_input_tokens=total.get("input_tokens", 0), + total_output_tokens=total.get("output_tokens", 0), + total_cache_read_tokens=total.get("cache_read_tokens", 0), + total_cache_write_tokens=total.get("cache_write_tokens", 0), + execution_time_seconds=timing.get("total_wall_clock_ms", 0) / 1000.0, + num_handoffs=hp.get("total_handoffs", len(handoff_list)), + max_context_tokens=ctx.get("max_tokens", 0), + avg_context_tokens=ctx.get("avg_tokens", 0.0), + median_context_tokens=ctx.get("median_tokens", 0.0), + agent_tokens=agent_tokens, + handoffs=handoffs, + server_startup_success=True, + error_count=error_count, + ) + + +def parse_test_results(yaml_path: Path) -> UnitTestResults: + raw = _load_yaml(yaml_path) + parsed = raw.get("test", {}).get("parsed_results", {}) + passed = parsed.get("passed", 0) or 0 + failed = parsed.get("failed", 0) or 0 + errors = parsed.get("errors", 0) or 0 + skipped = parsed.get("skipped", 0) or 0 + total = parsed.get("total", 0) or 0 + return UnitTestResults( + passed=passed, + failed=failed, + errors=errors, + skipped=skipped, + total=total, + ) + + +def parse_contract_tests(yaml_path: Path) -> ContractTestResults: + raw = _load_yaml(yaml_path) + total = raw.get("total", 0) + passed = raw.get("passed", 0) + failed = raw.get("failed", 0) + pass_rate = passed / total if total > 0 else 0.0 + + failures: list[ContractTestFailure] = [] + for case in raw.get("cases", []): + if not case.get("passed", True): + 
failures.append( + ContractTestFailure( + endpoint=case.get("path", ""), + method=case.get("method", ""), + expected_status=case.get("expected_status", 0), + actual_status=case.get("actual_status", 0), + description=case.get("name", ""), + ) + ) + + return ContractTestResults( + total=total, + passed=passed, + failed=failed, + pass_rate=pass_rate, + failures=failures, + ) + + +def parse_quality_report(yaml_path: Path) -> CodeQualityMetrics: + raw = _load_yaml(yaml_path) + lint = raw.get("lint", {}) + security = raw.get("security", {}) + summary = raw.get("summary", {}) + + return CodeQualityMetrics( + lint_findings=summary.get("lint_total", len(lint.get("findings", []))), + security_findings=len(security.get("findings", [])) + if security.get("available", False) + else -1, + security_scanner_available=security.get("available", False), + source_file_count=0, + test_file_count=0, + total_lines_of_code=0, + ) + + +def parse_qualitative(yaml_path: Path) -> QualitativeComparison: + raw = _load_yaml(yaml_path) + + overall = raw.get("overall_score", 0.0) + phases = raw.get("phases", []) + inception_score = 0.0 + construction_score = 0.0 + doc_scores: list[DocumentScore] = [] + + for phase in phases: + phase_name = phase.get("phase", "") + avg_overall = phase.get("avg_overall", 0.0) + if phase_name == "inception": + inception_score = avg_overall + elif phase_name == "construction": + construction_score = avg_overall + + for doc in phase.get("documents", []): + doc_scores.append( + DocumentScore( + document_name=Path(doc.get("path", "")).name, + overall_score=doc.get("overall", 0.0), + phase=phase_name, + completeness=doc.get("completeness", 0.0), + accuracy=doc.get("design_similarity", 0.0), + clarity=doc.get("intent_similarity", 0.0), + ) + ) + + return QualitativeComparison( + overall_score=overall, + inception_score=inception_score, + construction_score=construction_score, + document_scores=doc_scores, + unmatched_reference_docs=raw.get("unmatched_reference", []), + 
unmatched_candidate_docs=raw.get("unmatched_candidate", []), + ) + + +# --------------------------------------------------------------------------- +# Run classification +# --------------------------------------------------------------------------- + + +def classify_run(rules_ref: str) -> tuple[RunType, str, SemVer | None, int | None]: + """Determine run type, display label, semver, and PR number from rules_ref.""" + if rules_ref == "main": + return RunType.MAIN, "main", None, None + if rules_ref.startswith("pr-"): + num = int(rules_ref.split("-", 1)[1]) + return RunType.PR, f"PR #{num}", None, num + try: + sv = SemVer.parse(rules_ref) + return RunType.RELEASE, str(sv), sv, None + except ValueError: + # Unknown format — treat as release-like + return RunType.RELEASE, rules_ref, None, None + + +# --------------------------------------------------------------------------- +# Collection pipeline +# --------------------------------------------------------------------------- + + +def _collect_from_run_dir(run_dir: Path, source_label: str) -> RunData: + """Parse YAML files in *run_dir* into a RunData. + + *source_label* is used in error messages (e.g. the zip path or directory). 
+ """ + yaml_files = find_yaml_files(run_dir) + + if "run-meta" not in yaml_files: + raise CollectorError(f"run-meta.yaml missing from {source_label} — cannot classify run") + + meta = parse_run_meta(yaml_files["run-meta"]) + run_type, label, semver, pr_number = classify_run(meta.config.rules_ref) + + metrics = ( + parse_run_metrics(yaml_files["run-metrics"]) + if "run-metrics" in yaml_files + else RunMetrics() + ) + unit_tests = ( + parse_test_results(yaml_files["test-results"]) + if "test-results" in yaml_files + else UnitTestResults() + ) + contract_tests = ( + parse_contract_tests(yaml_files["contract-test-results"]) + if "contract-test-results" in yaml_files + else ContractTestResults() + ) + code_quality = ( + parse_quality_report(yaml_files["quality-report"]) + if "quality-report" in yaml_files + else CodeQualityMetrics() + ) + qualitative = ( + parse_qualitative(yaml_files["qualitative-comparison"]) + if "qualitative-comparison" in yaml_files + else QualitativeComparison() + ) + + # Backfill artifact counts from run-metrics if available + if "run-metrics" in yaml_files: + raw_metrics = _load_yaml(yaml_files["run-metrics"]) + workspace = raw_metrics.get("artifacts", {}).get("workspace", {}) + code_quality.source_file_count = workspace.get("source_files", 0) + code_quality.test_file_count = workspace.get("test_files", 0) + code_quality.total_lines_of_code = workspace.get("total_lines_of_code", 0) + + return RunData( + label=label, + run_type=run_type, + semver=semver, + pr_number=pr_number, + meta=meta, + metrics=metrics, + unit_tests=unit_tests, + contract_tests=contract_tests, + code_quality=code_quality, + qualitative=qualitative, + ) + + +def collect_from_zip(zip_path: Path, work_dir: Path) -> RunData: + """Extract a zip bundle and parse all YAML files into a RunData.""" + run_dir = extract_zip(zip_path, work_dir) + return _collect_from_run_dir(run_dir, source_label=str(zip_path)) + + +def collect_from_directory(dir_path: Path) -> RunData: + """Parse all 
YAML files from a plain directory into a RunData. + + Unlike :func:`collect_from_zip`, no extraction step is needed. + The directory must contain the expected YAML files directly. + """ + if not dir_path.is_dir(): + raise CollectorError(f"Not a directory: {dir_path}") + return _collect_from_run_dir(dir_path, source_label=str(dir_path)) + + +def load_baseline(golden_path: Path) -> BaselineMetrics: + """Parse a golden.yaml baseline file into BaselineMetrics.""" + if not golden_path.exists(): + logger.warning("Golden baseline file %s not found — using empty baseline", golden_path) + return BaselineMetrics() + + raw = _load_yaml(golden_path) + + execution = raw.get("execution", {}) + unit_tests = raw.get("unit_tests", {}) + contract_tests = raw.get("contract_tests", {}) + code_quality = raw.get("code_quality", {}) + qualitative = raw.get("qualitative", {}) + + doc_scores: dict[str, float] = {} + for name, score in qualitative.get("document_scores", {}).items(): + if isinstance(score, (int, float)): + doc_scores[name] = float(score) + + return BaselineMetrics( + unit_tests_passed=unit_tests.get("passed", 0), + unit_tests_total=unit_tests.get("total", 0), + contract_tests_passed=contract_tests.get("passed", 0), + contract_tests_total=contract_tests.get("total", 0), + lint_findings=code_quality.get("lint_total", 0), + qualitative_overall=qualitative.get("overall_score", 0.0), + execution_time_seconds=execution.get("wall_clock_ms", 0) / 1000.0, + total_tokens=execution.get("total_tokens", 0), + document_scores=doc_scores, + ) + + +# --------------------------------------------------------------------------- +# Sorting and deltas +# --------------------------------------------------------------------------- + + +def sort_runs(runs: list[RunData]) -> list[RunData]: + """Sort runs: releases by semver ascending, then main, then PRs.""" + type_order = {RunType.RELEASE: 0, RunType.MAIN: 1, RunType.PR: 2} + + def _key(run: RunData) -> tuple: + sv = ( + (run.semver.major, 
run.semver.minor, run.semver.patch) + if run.semver + else (999, 999, 999) + ) + pr = run.pr_number or 0 + return (type_order[run.run_type], sv, pr) + + return sorted(runs, key=_key) + + +def compute_deltas(runs: list[RunData]) -> list[VersionDelta]: + """Compute version-over-version deltas for consecutive runs.""" + deltas: list[VersionDelta] = [] + for prev, curr in zip(runs, runs[1:]): + deltas.append( + VersionDelta( + from_label=prev.label, + to_label=curr.label, + unit_tests_delta=curr.unit_tests.passed - prev.unit_tests.passed, + contract_tests_delta=curr.contract_tests.passed - prev.contract_tests.passed, + qualitative_delta=curr.qualitative.overall_score - prev.qualitative.overall_score, + token_delta=curr.metrics.total_tokens - prev.metrics.total_tokens, + time_delta_seconds=curr.metrics.execution_time_seconds + - prev.metrics.execution_time_seconds, + ) + ) + return deltas + + +# --------------------------------------------------------------------------- +# Top-level collection +# --------------------------------------------------------------------------- + + +def collect_trend_data( + bundle_paths: list[Path], + baseline_path: Path, + repo: str, + work_dir: Path | None = None, +) -> TrendData: + """Parse all bundles (zip files or directories) and assemble a TrendData.""" + import tempfile + + if work_dir is None: + work_dir = Path(tempfile.mkdtemp(prefix="trend-collect-")) + + baseline = load_baseline(baseline_path) + + runs: list[RunData] = [] + for bp in bundle_paths: + logger.info("Collecting data from %s …", bp.name) + try: + if bp.is_dir(): + run = collect_from_directory(bp) + else: + run = collect_from_zip(bp, work_dir) + runs.append(run) + except CollectorError as exc: + logger.warning("Skipping %s: %s", bp.name, exc) + + if not runs: + raise CollectorError("No runs could be parsed from the provided bundles.") + + runs = sort_runs(runs) + + return TrendData( + runs=runs, + baseline=baseline, + repo=repo, + 
generated_at=datetime.now(timezone.utc).isoformat(), + ) diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/fetcher.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/fetcher.py new file mode 100644 index 00000000..8f0ec356 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/fetcher.py @@ -0,0 +1,315 @@ +"""Data retrieval via the gh CLI.""" + +from __future__ import annotations + +import json +import logging +import subprocess +from pathlib import Path + +from .models import FetchError + +logger = logging.getLogger(__name__) + + +def check_gh_available() -> None: + """Verify the gh CLI is installed and authenticated.""" + try: + result = subprocess.run( + ["gh", "version"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise FetchError(f"gh CLI returned an error: {result.stderr.strip()}") + except FileNotFoundError: + raise FetchError("gh CLI not found. Install from https://cli.github.com/") + + # Check authentication + result = subprocess.run( + ["gh", "auth", "status"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise FetchError("gh CLI not authenticated. Run 'gh auth login' first.") + + +def fetch_release_list(repo: str) -> list[dict]: + """Fetch the list of releases from a GitHub repo. + + Returns a list of dicts with ``tagName`` and ``publishedAt`` keys, + sorted by ``publishedAt`` ascending. 
+ """ + result = subprocess.run( + [ + "gh", + "release", + "list", + "--repo", + repo, + "--json", + "tagName,publishedAt", + "--limit", + "50", + ], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise FetchError(f"Failed to list releases for {repo}: {result.stderr.strip()}") + + releases = json.loads(result.stdout) + releases.sort(key=lambda r: r.get("publishedAt", "")) + return releases + + +def fetch_release_bundle(repo: str, tag: str, dest_dir: Path) -> Path | None: + """Download the report zip for a single release tag. + + Returns the path to the downloaded zip, or ``None`` if the release + has no matching ``report*.zip`` asset. + """ + tag_dir = dest_dir / tag + tag_dir.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [ + "gh", + "release", + "download", + tag, + "--repo", + repo, + "--pattern", + "report*.zip", + "--dir", + str(tag_dir), + ], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + stderr = result.stderr.strip() + if "no assets match" in stderr.lower() or "no asset" in stderr.lower(): + logger.warning("Release %s has no report zip asset — skipping", tag) + return None + raise FetchError(f"Failed to download report for {tag}: {stderr}") + + # Find the downloaded zip + zips = list(tag_dir.glob("report*.zip")) + if not zips: + logger.warning("Release %s: download succeeded but no zip found — skipping", tag) + return None + + return zips[0] + + +def fetch_workflow_runs( + repo: str, + branch: str | None = None, + event: str | None = None, + limit: int = 10, +) -> list[dict]: + """List recent successful workflow runs from a GitHub repo. + + Returns a list of dicts with workflow run metadata, sorted most recent + first. Only runs with ``conclusion == "success"`` are included. 
+ """ + cmd = [ + "gh", + "run", + "list", + "--repo", + repo, + "--status", + "completed", + "--json", + "databaseId,headBranch,conclusion,event,createdAt", + "--limit", + str(limit), + ] + if branch is not None: + cmd.extend(["--branch", branch]) + if event is not None: + cmd.extend(["--event", event]) + + result = subprocess.run(cmd, capture_output=True, text=True, check=False) + if result.returncode != 0: + raise FetchError(f"Failed to list workflow runs for {repo}: {result.stderr.strip()}") + + runs = json.loads(result.stdout) + return [r for r in runs if r.get("conclusion") == "success"] + + +def fetch_artifact_bundle( + repo: str, + run_id: int, + artifact_name: str, + dest_dir: Path, +) -> Path | None: + """Download a single artifact from a workflow run. + + Returns the path to the downloaded zip file, or ``None`` if no matching + artifact exists in the run. + """ + artifact_dir = dest_dir / artifact_name + artifact_dir.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [ + "gh", + "run", + "download", + str(run_id), + "--repo", + repo, + "--name", + artifact_name, + "--dir", + str(artifact_dir), + ], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + stderr = result.stderr.strip().lower() + if "no artifact" in stderr or "no valid" in stderr: + logger.warning("Run %s has no artifact %r — skipping", run_id, artifact_name) + return None + raise FetchError( + f"Failed to download artifact {artifact_name!r} from run {run_id}: " + f"{result.stderr.strip()}" + ) + + # gh run download extracts artifact contents — look for the zip inside + zips = list(artifact_dir.glob("*.zip")) + if not zips: + logger.warning( + "Run %s artifact %r: download succeeded but no zip found — skipping", + run_id, + artifact_name, + ) + return None + + return zips[0] + + +def fetch_prerelease_bundles( + repo: str, + cache_prefix: str = "report-", + work_dir: Path | None = None, +) -> list[Path]: + """Fetch pre-release artifact bundles 
(main branch and PRs). + + Uses GitHub Actions Artifacts to find evaluation bundles for the ``main`` + branch and open pull requests. Returns a (possibly empty) list of zip + file paths. Never raises on missing artifacts — pre-release data is + optional. + """ + import tempfile + + if work_dir is None: + work_dir = Path(tempfile.mkdtemp(prefix="trend-prerelease-")) + + zip_paths: list[Path] = [] + + # --- Phase A: main branch artifact --- + try: + main_runs = fetch_workflow_runs(repo, branch="main", limit=5) + if main_runs: + artifact_name = f"{cache_prefix}main" + for run in main_runs: + run_id = run["databaseId"] + logger.info("Checking run %s for %s artifact …", run_id, artifact_name) + zip_path = fetch_artifact_bundle(repo, run_id, artifact_name, work_dir) + if zip_path is not None: + zip_paths.append(zip_path) + break # Only need the latest main artifact + else: + logger.info("No successful main-branch workflow runs found") + except FetchError as exc: + logger.warning("Could not fetch main-branch artifact: %s", exc) + + # --- Phase B: PR artifacts --- + try: + pr_runs = fetch_workflow_runs(repo, event="pull_request", limit=20) + seen_branches: set[str] = set() + for run in pr_runs: + branch = run.get("headBranch", "") + if branch in seen_branches: + continue # Only latest run per branch + seen_branches.add(branch) + + run_id = run["databaseId"] + artifact_dir = work_dir / f"pr-run-{run_id}" + artifact_dir.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [ + "gh", + "run", + "download", + str(run_id), + "--repo", + repo, + "--pattern", + f"{cache_prefix}pr*", + "--dir", + str(artifact_dir), + ], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + continue # No PR artifacts in this run + + for zp in artifact_dir.rglob("*.zip"): + zip_paths.append(zp) + except FetchError as exc: + logger.warning("Could not fetch PR artifacts: %s", exc) + + return zip_paths + + +def fetch_release_bundles( + repo: str, + tags: 
list[str] | None = None, + work_dir: Path | None = None, +) -> list[Path]: + """Fetch report zips for all (or specified) releases. + + If *tags* is ``None``, all releases are fetched. Returns a list of + zip file paths (releases without a report asset are silently skipped). + """ + import tempfile + + if work_dir is None: + work_dir = Path(tempfile.mkdtemp(prefix="trend-report-")) + + releases = fetch_release_list(repo) + + if tags is not None: + tag_set = set(tags) + releases = [r for r in releases if r["tagName"] in tag_set] + + zip_paths: list[Path] = [] + for release in releases: + tag = release["tagName"] + logger.info("Fetching report for %s …", tag) + zip_path = fetch_release_bundle(repo, tag, work_dir) + if zip_path is not None: + zip_paths.append(zip_path) + + if not zip_paths: + raise FetchError( + f"No report bundles found for {repo}. Ensure releases have report*.zip assets." + ) + + return zip_paths diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/gate.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/gate.py new file mode 100644 index 00000000..c0f1b3ee --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/gate.py @@ -0,0 +1,79 @@ +"""CI regression gate logic.""" + +from __future__ import annotations + +from .models import GateResult, RunData, RunType, TrendData + + +def check_regressions(trend: TrendData) -> GateResult: + """Compare the latest data point against the previous release. 
+ + Regression criteria: + - Contract test pass rate decreased + - Unit test failures appeared (> 0 when previous had 0) + - Qualitative overall score decreased by more than 0.02 + """ + latest, previous = find_latest_and_previous(trend) + if latest is None or previous is None: + return GateResult( + passed=True, + regressions=[], + latest_label=latest.label if latest else "", + comparison_label=previous.label if previous else "", + ) + + regressions: list[str] = [] + + # Contract test regression + if latest.contract_tests.pass_rate < previous.contract_tests.pass_rate: + regressions.append( + f"Contract test pass rate decreased: " + f"{previous.contract_tests.pass_rate:.1%} → {latest.contract_tests.pass_rate:.1%}" + ) + + # Unit test failures appeared + if latest.unit_tests.failed > 0 and previous.unit_tests.failed == 0: + regressions.append(f"Unit test failures appeared: {latest.unit_tests.failed} failures") + + # Qualitative score regression (tolerance: 0.02) + score_delta = latest.qualitative.overall_score - previous.qualitative.overall_score + if score_delta < -0.02: + regressions.append( + f"Qualitative score regressed: " + f"{previous.qualitative.overall_score:.3f} → {latest.qualitative.overall_score:.3f} " + f"(delta: {score_delta:+.3f})" + ) + + return GateResult( + passed=len(regressions) == 0, + regressions=regressions, + latest_label=latest.label, + comparison_label=previous.label, + ) + + +def find_latest_and_previous( + trend: TrendData, +) -> tuple[RunData | None, RunData | None]: + """Identify the latest data point and the previous release to compare against. + + If the latest is a release, compare to the second-to-last release. + If the latest is main/PR, compare to the most recent release. 
+ """ + if len(trend.runs) < 2: + return (trend.runs[0] if trend.runs else None, None) + + latest = trend.runs[-1] + + if latest.run_type == RunType.RELEASE: + # Find the previous release + for run in reversed(trend.runs[:-1]): + if run.run_type == RunType.RELEASE: + return latest, run + else: + # Latest is main or PR — compare to the most recent release + for run in reversed(trend.runs): + if run.run_type == RunType.RELEASE: + return latest, run + + return latest, trend.runs[-2] diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/models.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/models.py new file mode 100644 index 00000000..0a5cd043 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/models.py @@ -0,0 +1,260 @@ +"""Data models for trend reporting.""" + +from __future__ import annotations + +import re +from dataclasses import dataclass, field +from enum import Enum + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + + +class TrendReportError(Exception): + """Base exception for all trend report errors.""" + + +class FetchError(TrendReportError): + """Raised when a gh CLI fetch operation fails.""" + + +class CollectorError(TrendReportError): + """Raised when data collection or parsing fails.""" + + +# --------------------------------------------------------------------------- +# Enums and value types +# --------------------------------------------------------------------------- + + +class RunType(Enum): + RELEASE = "release" + MAIN = "main" + PR = "pr" + + +@dataclass(frozen=True, order=True) +class SemVer: + """Semantic version, comparable via tuple ordering.""" + + major: int + minor: int + patch: int + + @classmethod + def parse(cls, tag: str) -> SemVer: + """Parse 'v0.1.3' or '0.1.3' into SemVer.""" + m = re.match(r"v?(\d+)\.(\d+)\.(\d+)", tag) + if not m: 
+ raise ValueError(f"Cannot parse semver from '{tag}'") + return cls(int(m.group(1)), int(m.group(2)), int(m.group(3))) + + def __str__(self) -> str: + return f"v{self.major}.{self.minor}.{self.patch}" + + +# --------------------------------------------------------------------------- +# Per-YAML-file models +# --------------------------------------------------------------------------- + + +@dataclass +class RunConfig: + """Subset of run-meta.yaml -> config.""" + + rules_ref: str + model: str = "" + target_project: str = "" + + +@dataclass +class RunMeta: + """Parsed from run-meta.yaml.""" + + run_id: str + config: RunConfig + start_time: str = "" + end_time: str = "" + status: str = "" + + +@dataclass +class AgentTokens: + """Token breakdown for a single agent.""" + + agent_name: str + input_tokens: int = 0 + output_tokens: int = 0 + total_tokens: int = 0 + cache_read_tokens: int = 0 + cache_write_tokens: int = 0 + + +@dataclass +class HandoffMetrics: + """Metrics for a single handoff segment.""" + + handoff_number: int + agent: str = "" + duration_seconds: float = 0.0 + tokens: int = 0 + + +@dataclass +class RunMetrics: + """Parsed from run-metrics.yaml.""" + + total_tokens: int = 0 + total_input_tokens: int = 0 + total_output_tokens: int = 0 + total_cache_read_tokens: int = 0 + total_cache_write_tokens: int = 0 + execution_time_seconds: float = 0.0 + num_handoffs: int = 0 + max_context_tokens: int = 0 + avg_context_tokens: float = 0.0 + median_context_tokens: float = 0.0 + agent_tokens: list[AgentTokens] = field(default_factory=list) + handoffs: list[HandoffMetrics] = field(default_factory=list) + server_startup_success: bool = True + error_count: int = 0 + + +@dataclass +class UnitTestResults: + """Parsed from test-results.yaml.""" + + passed: int = 0 + failed: int = 0 + errors: int = 0 + skipped: int = 0 + total: int = 0 + + +@dataclass +class ContractTestFailure: + """A single contract test failure.""" + + endpoint: str = "" + method: str = "" + 
expected_status: int = 0 + actual_status: int = 0 + description: str = "" + + +@dataclass +class ContractTestResults: + """Parsed from contract-test-results.yaml.""" + + total: int = 0 + passed: int = 0 + failed: int = 0 + pass_rate: float = 0.0 + failures: list[ContractTestFailure] = field(default_factory=list) + + +@dataclass +class CodeQualityMetrics: + """Parsed from quality-report.yaml.""" + + lint_findings: int = 0 + security_findings: int = -1 + security_scanner_available: bool = False + source_file_count: int = 0 + test_file_count: int = 0 + total_lines_of_code: int = 0 + artifact_counts: dict[str, int] = field(default_factory=dict) + + +@dataclass +class DocumentScore: + """Score for a single document in qualitative comparison.""" + + document_name: str + overall_score: float = 0.0 + phase: str = "" + completeness: float = 0.0 + accuracy: float = 0.0 + clarity: float = 0.0 + + +@dataclass +class QualitativeComparison: + """Parsed from qualitative-comparison.yaml.""" + + overall_score: float = 0.0 + inception_score: float = 0.0 + construction_score: float = 0.0 + document_scores: list[DocumentScore] = field(default_factory=list) + unmatched_reference_docs: list[str] = field(default_factory=list) + unmatched_candidate_docs: list[str] = field(default_factory=list) + + +# --------------------------------------------------------------------------- +# Composite models +# --------------------------------------------------------------------------- + + +@dataclass +class RunData: + """All data for a single evaluation run (one zip bundle).""" + + label: str + run_type: RunType + semver: SemVer | None + pr_number: int | None + meta: RunMeta + metrics: RunMetrics + unit_tests: UnitTestResults + contract_tests: ContractTestResults + code_quality: CodeQualityMetrics + qualitative: QualitativeComparison + + +@dataclass +class BaselineMetrics: + """Golden baseline reference values.""" + + unit_tests_passed: int = 0 + unit_tests_total: int = 0 + contract_tests_passed: int 
= 0 + contract_tests_total: int = 0 + lint_findings: int = 0 + qualitative_overall: float = 0.0 + execution_time_seconds: float = 0.0 + total_tokens: int = 0 + document_scores: dict[str, float] = field(default_factory=dict) + + +@dataclass +class TrendData: + """Complete assembled dataset for trend rendering.""" + + runs: list[RunData] + baseline: BaselineMetrics + repo: str = "" + generated_at: str = "" + + +@dataclass +class VersionDelta: + """Computed delta between two consecutive runs.""" + + from_label: str + to_label: str + unit_tests_delta: int = 0 + contract_tests_delta: int = 0 + qualitative_delta: float = 0.0 + token_delta: int = 0 + time_delta_seconds: float = 0.0 + + +@dataclass +class GateResult: + """Result of regression gate check.""" + + passed: bool + regressions: list[str] = field(default_factory=list) + latest_label: str = "" + comparison_label: str = "" diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_html.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_html.py new file mode 100644 index 00000000..fe9ced63 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_html.py @@ -0,0 +1,955 @@ +"""Self-contained HTML trend report renderer (tables only, no JavaScript).""" + +from __future__ import annotations + +from html import escape + +from .collector import compute_deltas +from .models import RunType, TrendData +from .sparkline import ( + format_delta, + format_number, + format_pct, + format_seconds_as_minutes, +) + + +def render_trend_html(trend: TrendData) -> str: + """Render the full trend report as a self-contained HTML string.""" + parts = [ + _html_header("AIDLC Rules Trend Report"), + _render_html_hero(trend), + _render_nav(), + _render_html_section_a(trend), + _render_html_section_b(trend), + _render_html_section_c(trend), + _render_html_section_d(trend), + _render_html_section_e(trend), + _render_html_section_f(trend), + 
_render_html_section_g(trend), + _render_html_section_h(trend), + _html_footer(), + ] + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# HTML chrome +# --------------------------------------------------------------------------- + +_CSS = """\ +:root { + /* AWS Cloudscape-aligned palette */ + --aws-squid-ink: #000716; + --aws-orange: #ec7211; + --aws-blue-600: #0972d3; + + /* Status colors */ + --green-bg: #f2fcf3; --green-text: #037f0c; --green-border: #29ad32; + --yellow-bg: #fff8e1; --yellow-text: #8d6605; --yellow-border: #d4a017; + --red-bg: #fff3f0; --red-text: #d91515; --red-border: #eb5f5f; + --blue-bg: #f0f6ff; --blue-text: #0972d3; + + /* Neutral grays */ + --gray-50: #fafafa; --gray-100: #f2f3f3; --gray-200: #e9ebed; + --gray-300: #d1d5db; --gray-500: #5f6b7a; --gray-700: #414d5c; + --gray-900: #000716; + + --radius: 8px; +} +* { box-sizing: border-box; } +body { + font-family: 'Amazon Ember', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', sans-serif; + max-width: 1200px; margin: 0 auto; padding: 24px; + color: var(--gray-900); background: #fff; line-height: 1.6; +} +h1 { font-size: 28px; margin: 0 0 4px 0; } +h2 { + font-size: 20px; margin: 40px 0 12px 0; padding-bottom: 8px; + border-bottom: 2px solid var(--gray-200); color: var(--gray-900); +} +h3 { font-size: 16px; margin: 24px 0 8px 0; color: var(--gray-700); } + +/* Hero header */ +.hero { + margin-bottom: 32px; padding: 20px 24px; + background: var(--aws-squid-ink); color: #fff; border-radius: var(--radius); +} +.hero h1 { font-size: 28px; color: #fff; } +.hero .meta { color: #a8b4c4; font-size: 14px; margin-top: 4px; } + +/* Navigation */ +.nav { + display: flex; flex-wrap: wrap; gap: 6px; margin-bottom: 28px; + padding: 12px 16px; background: var(--aws-squid-ink); border-radius: var(--radius); + border: none; +} +.nav a { + font-size: 13px; color: #d5dbdb; text-decoration: none; + padding: 4px 10px; 
border-radius: 4px; transition: background 0.15s; +} +.nav a:hover { background: rgba(255, 255, 255, 0.12); color: #fff; } + +/* Summary cards */ +.cards { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin: 16px 0 24px 0; } +.card { + padding: 14px 16px; border-radius: var(--radius); + border: 1px solid var(--gray-200); background: #fff; +} +.card .label { font-size: 12px; color: var(--gray-500); text-transform: uppercase; letter-spacing: 0.5px; } +.card .value { font-size: 24px; font-weight: 700; margin: 4px 0 2px 0; } +.card .detail { font-size: 12px; color: var(--gray-500); } +.card.good { border-left: 4px solid var(--green-border); } +.card.warn { border-left: 4px solid var(--yellow-border); } +.card.bad { border-left: 4px solid #d91515; } + +/* Tables */ +table { + border-collapse: collapse; width: 100%; margin: 12px 0 24px 0; + font-size: 14px; border-radius: var(--radius); overflow: hidden; + border: 1px solid var(--gray-200); +} +th { + background: var(--gray-100); font-weight: 600; text-align: left; + padding: 10px 14px; border-bottom: 2px solid var(--gray-200); + font-size: 13px; color: var(--gray-700); text-transform: uppercase; + letter-spacing: 0.3px; +} +td { padding: 9px 14px; border-bottom: 1px solid var(--gray-100); } +tr:hover td { background: var(--gray-50); } +td:first-child { font-weight: 500; } + +/* Score cells */ +.s-green { background: var(--green-bg); color: var(--green-text); font-weight: 600; } +.s-yellow { background: var(--yellow-bg); color: var(--yellow-text); font-weight: 500; } +.s-red { background: var(--red-bg); color: var(--red-text); font-weight: 600; } +.d-pos { color: var(--green-text); font-weight: 500; } +.d-neg { color: var(--red-text); font-weight: 500; } +.na { color: var(--gray-500); font-style: italic; } + +/* Badges */ +.badge { + display: inline-block; padding: 2px 8px; border-radius: 10px; + font-size: 12px; font-weight: 600; letter-spacing: 0.3px; +} +.badge-pass { 
background: var(--green-bg); color: var(--green-text); } +.badge-fail { background: var(--red-bg); color: var(--red-text); } +.badge-warn { background: var(--yellow-bg); color: var(--yellow-text); } + +/* Mini bar chart (CSS only) */ +.bar-cell { position: relative; } +.bar { + display: inline-block; height: 16px; border-radius: 2px; + background: linear-gradient(90deg, #ec7211, #ff9900); + vertical-align: middle; margin-right: 6px; + min-width: 2px; +} + +/* Two-column split layout */ +.split { + display: grid; grid-template-columns: 1fr 2fr; + gap: 24px; align-items: start; margin: 4px 0 24px 0; +} +.split-desc { + font-size: 14px; color: var(--gray-500); line-height: 1.7; + padding-top: 4px; +} +.split-desc p { margin: 0 0 8px 0; } +.split table { margin-top: 0; } + +/* Blockquote callouts */ +.callout { + padding: 12px 16px; margin: 12px 0; + border-left: 4px solid var(--yellow-border); background: var(--yellow-bg); + border-radius: 0 var(--radius) var(--radius) 0; font-size: 14px; +} +.callout.info { border-left-color: var(--aws-blue-600); background: var(--blue-bg); } + +/* Section description */ +.section-desc { color: var(--gray-500); font-size: 14px; margin: 0 0 12px 0; } + +/* Responsive */ +@media (max-width: 768px) { + body { padding: 12px; } + .cards { grid-template-columns: repeat(2, 1fr); } + .split { grid-template-columns: 1fr; } + table { font-size: 13px; } + th, td { padding: 6px 8px; } +} +""" + + +def _html_header(title: str) -> str: + return f""" + + + + +{escape(title)} + + + +""" + + +def _html_footer() -> str: + return "\n" + + +def _render_html_hero(trend: TrendData) -> str: + n = len(trend.runs) + first = trend.runs[0].label if trend.runs else "—" + last = trend.runs[-1].label if trend.runs else "—" + return ( + '
\n' + f"

AIDLC Rules Trend Report

\n" + f'
{n} releases ({first} through {last}) · ' + f"{escape(trend.repo)} · {escape(trend.generated_at)}
\n" + "
\n" + ) + + +def _render_nav() -> str: + links = [ + ("A. Summary", "a-executive-summary"), + ("B. Correctness", "b-functional-correctness"), + ("C. Qualitative", "c-qualitative-evaluation"), + ("D. Efficiency", "d-efficiency-cost-metrics"), + ("E. Quality", "e-code-quality"), + ("F. Stability", "f-stability-reliability"), + ("G. Deltas", "g-version-over-version-deltas"), + ("H. Pre-Release", "h-pre-release-data-points"), + ] + items = " ".join(f'{label}' for label, anchor in links) + return f'\n' + + +# --------------------------------------------------------------------------- +# Section A — Executive Summary +# --------------------------------------------------------------------------- + + +def _render_html_section_a(trend: TrendData) -> str: + runs = trend.runs + bl = trend.baseline + latest = runs[-1] if runs else None + if not latest: + return '

A. Executive Summary

\n

No data available.

\n' + + # Summary cards + qual_status = ( + "good" + if latest.qualitative.overall_score >= 0.90 + else ("warn" if latest.qualitative.overall_score >= 0.80 else "bad") + ) + contract_status = ( + "good" + if latest.contract_tests.pass_rate >= 1.0 + else ("warn" if latest.contract_tests.pass_rate >= 0.95 else "bad") + ) + unit_pass_rate = ( + latest.unit_tests.passed / latest.unit_tests.total + if latest.unit_tests.total > 0 + else 0.0 + ) + bl_unit_pass_rate = ( + bl.unit_tests_passed / bl.unit_tests_total + if bl.unit_tests_total > 0 + else 0.0 + ) + test_status = "good" if unit_pass_rate >= 1.0 else ("warn" if unit_pass_rate >= 0.95 else "bad") + lint_status = "good" if latest.code_quality.lint_findings == 0 else "warn" + + cards = ( + '
\n' + f'
' + f'
Qualitative Score
' + f'
{latest.qualitative.overall_score:.3f}
' + f'
Golden: {bl.qualitative_overall:.3f}
\n' + f'
' + f'
Contract Tests
' + f'
{latest.contract_tests.passed}/{latest.contract_tests.total}
' + f'
{format_pct(latest.contract_tests.pass_rate)} pass rate
\n' + f'
' + f'
Unit Tests
' + f'
{format_pct(unit_pass_rate)}
' + f'
{latest.unit_tests.passed}/{latest.unit_tests.total} passed
\n' + f'
' + f'
Lint Findings
' + f'
{latest.code_quality.lint_findings}
' + f'
Golden: {bl.lint_findings}
\n' + f'
' + f'
Execution Time
' + f'
{format_seconds_as_minutes(latest.metrics.execution_time_seconds)}
' + f'
Golden: {format_seconds_as_minutes(bl.execution_time_seconds) if bl.execution_time_seconds else "—"}
\n' + f'
' + f'
Total Tokens
' + f'
{format_number(latest.metrics.total_tokens)}
' + f'
Golden: {format_number(bl.total_tokens) if bl.total_tokens else "—"}
\n' + "
\n" + ) + + # Detail table + rows = [ + ( + "Unit test pass rate", + f"{format_pct(bl.unit_tests_passed / bl.unit_tests_total)} ({bl.unit_tests_passed}/{bl.unit_tests_total})" + if bl.unit_tests_total + else _bl(bl.unit_tests_passed), + f"{format_pct(unit_pass_rate)} ({latest.unit_tests.passed}/{latest.unit_tests.total})" + if latest.unit_tests.total + else "0", + "=" + if bl.unit_tests_total and unit_pass_rate == bl_unit_pass_rate + else ( + f"{(unit_pass_rate - bl_unit_pass_rate) * 100:+.1f}%" + if bl.unit_tests_total + else "—" + ), + ), + ( + "Contract tests", + f"{bl.contract_tests_passed}/{bl.contract_tests_total}" + if bl.contract_tests_total + else "—", + f"{latest.contract_tests.passed}/{latest.contract_tests.total}", + _fmt_int_delta(latest.contract_tests.passed, bl.contract_tests_passed), + ), + ( + "Lint findings", + str(bl.lint_findings), + str(latest.code_quality.lint_findings), + _fmt_int_delta(latest.code_quality.lint_findings, bl.lint_findings), + ), + ( + "Qualitative score", + f"{bl.qualitative_overall:.3f}" if bl.qualitative_overall else "—", + f"{latest.qualitative.overall_score:.3f}", + f"{latest.qualitative.overall_score - bl.qualitative_overall:+.3f}" + if bl.qualitative_overall + else "—", + ), + ( + "Execution time", + format_seconds_as_minutes(bl.execution_time_seconds) + if bl.execution_time_seconds + else "—", + format_seconds_as_minutes(latest.metrics.execution_time_seconds), + _fmt_time_delta(latest.metrics.execution_time_seconds, bl.execution_time_seconds), + ), + ( + "Total tokens", + format_number(bl.total_tokens) if bl.total_tokens else "—", + format_number(latest.metrics.total_tokens), + _fmt_token_delta_html(latest.metrics.total_tokens, bl.total_tokens), + ), + ] + + # Metrics where lower values are better — a negative delta is good (green) + lower_is_better = {"lint findings", "execution time", "total tokens"} + + table_rows = [] + table_styles = [] + for label, golden, latest_val, vs in rows: + table_rows.append([label, 
golden, latest_val, vs]) + # Color the delta column based on metric direction + delta_cls = "" + if vs not in ("=", "—") and (vs.startswith("+") or vs.startswith("-") or vs.startswith("−")): + is_negative = vs.startswith("-") or vs.startswith("−") + if label.lower() in lower_is_better: + delta_cls = "d-pos" if is_negative else "d-neg" + else: + delta_cls = "d-neg" if is_negative else "d-pos" + table_styles.append(["", "", "", delta_cls]) + + metric_guide = ( + '

High-level snapshot comparing the latest release against the ' + "golden baseline (the reference evaluation used as the quality target).

\n" + "\n\n\n \n \n\n\n\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "\n
MetricWhat it measures
Unit test pass ratePercentage of generated unit tests that pass. Higher means more reliable code generation.
Contract testsAPI compliance checks against the OpenAPI spec (passed/total). 88/88 = full compliance.
Lint findingsStatic analysis warnings in generated code. Lower is better — 0 means clean code.
Qualitative scoreAI-graded documentation quality on a 0–1 scale (higher is better).
Execution timeWall-clock time for the full evaluation run. Lower means faster generation.
Total tokensTotal LLM tokens consumed (input + output). Lower means more cost-efficient.
\n" + ) + + html = ( + '

A. Executive Summary

\n' + + cards + + metric_guide + + _html_table( + ["Metric", "Golden", f"Latest ({escape(latest.label)})", "vs Golden"], + table_rows, + table_styles, + ) + ) + return html + + +# --------------------------------------------------------------------------- +# Section B +# --------------------------------------------------------------------------- + + +def _render_html_section_b(trend: TrendData) -> str: + parts = ['

B. Functional Correctness

\n'] + parts.append( + '

Measures whether the code generated by each rules version actually works correctly. ' + "This is the most fundamental quality gate — code that doesn’t pass its own tests is broken.

\n" + ) + + # B.1 Unit tests with bar chart + parts.append("

B.1 Unit Tests

\n") + parts.append('
\n
\n') + parts.append( + "

Unit tests validate individual functions and components in isolation. " + "The AIDLC rules instruct the AI to generate both source code and test suites.

\n" + "

Pass/Total = tests that passed out of total generated. " + "Rate = pass percentage (100% = all tests passing). " + "Failures = tests that ran but produced wrong results.

\n" + ) + parts.append("
\n
\n") + + rows = [] + styles = [] + for r in trend.runs: + rate = r.unit_tests.passed / r.unit_tests.total if r.unit_tests.total > 0 else 0.0 + cls = _score_class(rate) + fail_cls = "d-neg" if r.unit_tests.failed > 0 else "" + rows.append( + [ + r.label, + f"{r.unit_tests.passed}/{r.unit_tests.total}", + format_pct(rate), + str(r.unit_tests.failed), + ] + ) + styles.append(["", "", cls, fail_cls]) + parts.append(_html_table(["Version", "Pass/Total", "Rate", "Failures"], rows, styles)) + parts.append("
\n
\n") + + # B.2 Contract tests + parts.append("

B.2 Contract Tests (API Compliance)

\n") + parts.append('
\n
\n') + parts.append( + "

Contract tests verify that the generated API implementation matches its " + "OpenAPI specification. Each test sends a request to an endpoint and checks that " + "the HTTP status code and response shape match the spec.

\n" + "

88 endpoints are tested per version. " + "Pass/Total = endpoints that returned the expected status code. " + "Rate = pass percentage (100% = full spec compliance).

\n" + "

Failures lists the specific endpoints that deviated from the spec.

\n" + ) + parts.append("
\n
\n") + + rows = [] + styles = [] + for r in trend.runs: + rate = r.contract_tests.pass_rate + cls = _score_class(rate) + fail_cls = "d-neg" if r.contract_tests.failed > 0 else "" + rows.append( + [ + r.label, + f"{r.contract_tests.passed}/{r.contract_tests.total}", + format_pct(rate), + str(r.contract_tests.failed), + ] + ) + styles.append(["", "", cls, fail_cls]) + parts.append(_html_table(["Version", "Pass/Total", "Rate", "Failures"], rows, styles)) + parts.append("
\n
\n") + + for r in trend.runs: + if r.contract_tests.failures: + parts.append(f'
{escape(r.label)} failures:
    \n') + for f in r.contract_tests.failures: + parts.append( + f"
  • {escape(f.method)} {escape(f.endpoint)} — " + f"expected {f.expected_status}, got {f.actual_status} " + f"({escape(f.description)})
  • \n" + ) + parts.append("
\n") + + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Section C +# --------------------------------------------------------------------------- + + +def _render_html_section_c(trend: TrendData) -> str: + parts = ['

C. Qualitative Evaluation

\n'] + parts.append( + '

Measures the quality of generated documentation by comparing it against ' + "human-authored reference documents. An AI evaluator scores each document on completeness, accuracy, " + "and clarity, producing a 0–1 score (1.0 = perfect match to reference quality).

\n" + ) + + # C.1 Overall + parts.append("

C.1 Overall Score

\n") + parts.append('
\n
\n') + parts.append( + "

The weighted average across all evaluated documents. " + "This is the single best indicator of how well the rules produce documentation.

\n" + "

Scores above 0.90 are considered strong; below 0.70 signals significant gaps.

\n" + ) + bl_score = trend.baseline.qualitative_overall + if bl_score: + parts.append( + f"

Golden baseline: {bl_score:.3f}

\n" + ) + parts.append("
\n
\n") + + rows = [] + styles = [] + for r in trend.runs: + s = r.qualitative.overall_score + delta = s - bl_score if bl_score else 0 + rows.append([r.label, f"{s:.3f}", f"{delta:+.3f}" if bl_score else "—"]) + styles.append(["", _score_class(s), _delta_class(delta)]) + parts.append(_html_table(["Version", "Overall", "vs Golden"], rows, styles)) + parts.append("
\n
\n") + + # C.2 Phase breakdown + parts.append("

C.2 Phase Breakdown

\n") + parts.append('
\n
\n') + parts.append( + "

Documents are grouped by SDLC phase. " + "Inception covers early-stage design artifacts (requirements, architecture plans, " + "component designs) — these are generated first and set the foundation.

\n" + "

Construction covers build-time artifacts (build instructions, test instructions, " + "build-and-test summaries) — these depend on inception outputs being correct.

\n" + "

A drop in inception quality often cascades into construction.

\n" + ) + parts.append("
\n
\n") + rows = [] + styles = [] + for r in trend.runs: + inc = r.qualitative.inception_score + con = r.qualitative.construction_score + rows.append([r.label, f"{inc:.3f}", f"{con:.3f}"]) + styles.append(["", _score_class(inc), _score_class(con)]) + parts.append(_html_table(["Version", "Inception", "Construction"], rows, styles)) + parts.append("
\n
\n") + + # C.3 Per-document heatmap + parts.append("

C.3 Per-Document Heatmap

\n") + parts.append( + '

Individual quality scores for each generated document across all versions. ' + "This reveals which specific documents are consistently strong, improving, or problematic. " + "Documents scoring below 0.70 (red) are the top candidates for rules improvements.

\n" + ) + all_docs, labels, matrix = _build_heatmap(trend) + header = ["Document"] + labels + rows = [] + styles = [] + for i, doc in enumerate(all_docs): + row = [f"{escape(doc)}"] + row_styles = [""] + for score in matrix[i]: + if score < 0: + row.append('') + row_styles.append("") + else: + row.append(f"{score:.2f}") + row_styles.append(_score_class(score)) + rows.append(row) + styles.append(row_styles) + parts.append(_html_table(header, rows, styles)) + parts.append( + '

' + 'green ≥ 0.90 ' + 'yellow 0.70–0.89 ' + 'red < 0.70

\n' + ) + + # C.4 Coverage + parts.append("

C.4 Document Coverage

\n") + parts.append( + '

Tracks whether the generated output includes the same set of documents as the reference. ' + "Unmatched Ref = reference documents the AI failed to generate (missing output). " + "Unmatched Candidate = extra documents the AI generated that don’t exist in the reference " + "(unexpected output). Ideally both columns are 0, meaning the AI produced exactly the expected set of documents.

\n" + ) + rows = [] + styles = [] + for r in trend.runs: + ref_n = len(r.qualitative.unmatched_reference_docs) + cand_n = len(r.qualitative.unmatched_candidate_docs) + rows.append([r.label, str(ref_n), str(cand_n)]) + styles.append( + [ + "", + "d-neg" if ref_n > 0 else "", + "d-neg" if cand_n > 0 else "", + ] + ) + parts.append(_html_table(["Version", "Unmatched Ref", "Unmatched Candidate"], rows, styles)) + + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Section D +# --------------------------------------------------------------------------- + + +def _render_html_section_d(trend: TrendData) -> str: + parts = ['

D. Efficiency & Cost Metrics

\n'] + parts.append( + '

Tracks the computational resources consumed by each evaluation run. ' + "These metrics directly affect cost (tokens) and developer wait time (execution time). " + "Lower values are generally better, as long as quality metrics remain stable.

\n" + ) + + # D.1 Token consumption with bars + parts.append("

D.1 Token Consumption

\n") + parts.append('
\n
\n') + parts.append( + "

Total LLM tokens consumed during the run, broken down by agent. " + "Total = all tokens across all agents (input + output).

\n" + "

Executor = the agent that generates code and documents. " + "Simulator = the agent that simulates user interactions for testing.

\n" + "

Token count is the primary cost driver — each token represents a unit of " + "LLM usage billed by the provider.

\n" + ) + parts.append("
\n
\n") + max_tok = max((r.metrics.total_tokens for r in trend.runs), default=1) + rows = [] + styles = [] + for r in trend.runs: + pct = r.metrics.total_tokens / max_tok * 100 if max_tok else 0 + agent_map = {a.agent_name: format_number(a.total_tokens) for a in r.metrics.agent_tokens} + bar_html = f'' + rows.append( + [ + r.label, + bar_html, + format_number(r.metrics.total_tokens), + agent_map.get("executor", "—"), + agent_map.get("simulator", "—"), + ] + ) + styles.append(["", "bar-cell", "", "", ""]) + parts.append(_html_table(["Version", "", "Total", "Executor", "Simulator"], rows, styles)) + parts.append("
\n
\n") + + # D.2 Execution time with bars + parts.append("

D.2 Execution Time

\n") + parts.append('
\n
\n') + parts.append( + "

Wall-clock duration of the full evaluation pipeline, broken down by handoff. " + "Each handoff (H1, H2, H3) represents a sequential phase.

\n" + "

H1 is typically code generation (the longest phase), H2 is build/test execution, " + "and H3 is result collection and reporting.

\n" + "

Wall Clock is the total end-to-end time.

\n" + ) + parts.append("
\n
\n") + max_time = max((r.metrics.execution_time_seconds for r in trend.runs), default=1) + rows = [] + styles = [] + for r in trend.runs: + pct = r.metrics.execution_time_seconds / max_time * 100 if max_time else 0 + bar_html = f'' + handoff_strs = [ + f"H{h.handoff_number}: {format_seconds_as_minutes(h.duration_seconds)}" + for h in r.metrics.handoffs + ] + rows.append([r.label, bar_html, format_seconds_as_minutes(r.metrics.execution_time_seconds), " · ".join(handoff_strs) if handoff_strs else "—"]) + styles.append(["", "bar-cell", "", ""]) + parts.append(_html_table(["Version", "", "Wall Clock", "Handoff Breakdown"], rows, styles)) + parts.append("
\n
\n") + + # D.3 Context window + parts.append("

D.3 Context Window Pressure

\n") + parts.append('
\n
\n') + parts.append( + "

Measures how much of the LLM’s context window is being used across API calls. " + "Max = the largest single context seen during the run (approaching the " + "model’s limit risks truncation or degraded output).

\n" + "

Avg = the mean context size across all API calls. " + "Median = the midpoint context size (less affected by outliers than avg).

\n" + "

High context pressure can indicate overly verbose prompts or accumulated conversation history.

\n" + ) + parts.append("
\n
\n") + rows = [ + [ + r.label, + format_number(r.metrics.max_context_tokens), + format_number(r.metrics.avg_context_tokens), + format_number(r.metrics.median_context_tokens), + ] + for r in trend.runs + ] + parts.append(_html_table(["Version", "Max", "Avg", "Median"], rows)) + parts.append("
\n
\n") + + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Section E +# --------------------------------------------------------------------------- + + +def _render_html_section_e(trend: TrendData) -> str: + parts = ['

E. Code Quality

\n'] + parts.append( + '

Static analysis of the generated codebase. These metrics reflect the cleanliness and ' + "maintainability of the AI-generated code, independent of whether it passes tests.

\n" + "\n\n\n \n \n\n\n\n" + "\n" + "\n" + "\n" + "\n" + "\n
MetricWhat it measures
Lint FindingsWarnings from static analysis (style violations, unused variables, etc.). 0 = clean.
Security FindingsVulnerabilities detected by security scanners (SQL injection, XSS, etc.). N/A if no scanner was configured.
Source FilesNumber of non-test source files in the generated project.
LOCTotal lines of code across all source files. Large swings may indicate generated boilerplate or missing modules.
\n" + ) + rows = [ + [ + r.label, + str(r.code_quality.lint_findings), + str(r.code_quality.security_findings) + if r.code_quality.security_scanner_available + else 'N/A', + str(r.code_quality.source_file_count), + format_number(r.code_quality.total_lines_of_code), + ] + for r in trend.runs + ] + parts.append( + _html_table(["Version", "Lint Findings", "Security Findings", "Source Files", "LOC"], rows) + ) + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Section F +# --------------------------------------------------------------------------- + + +def _render_html_section_f(trend: TrendData) -> str: + parts = ['

F. Stability & Reliability

\n'] + parts.append( + '

Tracks whether the evaluation pipeline itself ran smoothly, independent of output quality.

\n' + "\n\n\n \n \n\n\n\n" + "\n" + "\n" + "\n" + "\n
MetricWhat it measures
Error EventsRuntime errors logged during the run (exceptions, timeouts, API failures). 0 = clean run.
HandoffsNumber of sequential pipeline phases completed. Typically 3 (generate, build/test, report). A different count may indicate an early abort or retry.
Server StartupWhether the generated application server started successfully. A failure here means the generated code couldn’t even boot, preventing contract tests from running.
\n" + ) + rows = [] + styles = [] + for r in trend.runs: + ok = r.metrics.server_startup_success + err_cls = "d-neg" if r.metrics.error_count > 0 else "" + ok_html = ( + 'PASS' + if ok + else 'FAIL' + ) + rows.append([r.label, str(r.metrics.error_count), str(r.metrics.num_handoffs), ok_html]) + styles.append(["", err_cls, "", ""]) + parts.append( + _html_table(["Version", "Error Events", "Handoffs", "Server Startup"], rows, styles) + ) + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Section G +# --------------------------------------------------------------------------- + + +def _render_html_section_g(trend: TrendData) -> str: + parts = ['

G. Version-over-Version Deltas

\n'] + deltas = compute_deltas(trend.runs) + if not deltas: + parts.append("

Not enough data points.

\n") + return "".join(parts) + + parts.append( + '

Each row shows the change from one release to the next, making it easy to spot ' + "which specific version introduced an improvement or regression. " + "Positive values (+) indicate an increase; negative (−) indicate a decrease. " + "For Unit Tests and Contract, positive is better (more tests passing). " + "For Qualitative, positive is better (higher quality score). " + "For Tokens and Time, negative is better (more efficient).

\n" + ) + + rows = [] + styles = [] + for d in deltas: + tok_str = _fmt_signed_number(d.token_delta) + time_str = f"{format_delta(d.time_delta_seconds, precision=0)}s" + rows.append( + [ + f"{d.from_label} → {d.to_label}", + format_delta(d.unit_tests_delta), + format_delta(d.contract_tests_delta), + format_delta(d.qualitative_delta, precision=3), + tok_str, + time_str, + ] + ) + styles.append( + [ + "", + _delta_class(d.unit_tests_delta), + _delta_class(d.contract_tests_delta), + _delta_class(d.qualitative_delta), + _delta_class(-d.token_delta), + _delta_class(-d.time_delta_seconds), + ] + ) + parts.append( + _html_table( + ["Transition", "Unit Tests", "Contract", "Qualitative", "Tokens", "Time"], + rows, + styles, + ) + ) + return "".join(parts) + + +# --------------------------------------------------------------------------- +# Section H +# --------------------------------------------------------------------------- + + +def _render_html_section_h(trend: TrendData) -> str: + pre_release = [r for r in trend.runs if r.run_type in (RunType.MAIN, RunType.PR)] + + html = ( + '

H. Pre-Release Data Points

\n' + '

Evaluation results from non-release sources — the main branch ' + "and open pull requests. These represent in-progress work that hasn’t been tagged as a release yet. " + "Use this data to preview whether upcoming changes will improve or regress metrics before they ship.

\n" + ) + if not pre_release: + return html + '

No pre-release data available.

\n' + + rows = [ + [ + r.label, + f"{format_pct(r.unit_tests.passed / r.unit_tests.total)} ({r.unit_tests.passed}/{r.unit_tests.total})" + if r.unit_tests.total > 0 + else "0", + f"{r.contract_tests.passed}/{r.contract_tests.total}", + f"{r.qualitative.overall_score:.3f}", + format_number(r.metrics.total_tokens), + ] + for r in pre_release + ] + return html + _html_table(["Source", "Unit Tests", "Contract", "Qualitative", "Tokens"], rows) + + +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- + + +def _score_class(score: float) -> str: + if score >= 0.90: + return "s-green" + if score >= 0.70: + return "s-yellow" + return "s-red" + + +def _delta_class(delta: float) -> str: + if delta > 0: + return "d-pos" + if delta < 0: + return "d-neg" + return "" + + +def _html_table( + headers: list[str], + rows: list[list[str]], + cell_styles: list[list[str]] | None = None, +) -> str: + lines = ["\n\n"] + for h in headers: + lines.append(f" ") + lines.append("\n\n") + for i, row in enumerate(rows): + lines.append("") + for j, cell in enumerate(row): + cls = "" + if cell_styles and i < len(cell_styles) and j < len(cell_styles[i]): + cls_name = cell_styles[i][j] + if cls_name: + cls = f' class="{cls_name}"' + lines.append(f" {cell}") + lines.append("") + lines.append("\n
{h}
\n") + return "\n".join(lines) + + +def _build_heatmap( + trend: TrendData, +) -> tuple[list[str], list[str], list[list[float]]]: + all_docs = sorted( + {ds.document_name for run in trend.runs for ds in run.qualitative.document_scores} + ) + labels = [r.label for r in trend.runs] + matrix: list[list[float]] = [] + for doc in all_docs: + row: list[float] = [] + for run in trend.runs: + score = next( + ( + ds.overall_score + for ds in run.qualitative.document_scores + if ds.document_name == doc + ), + -1.0, + ) + row.append(score) + matrix.append(row) + return all_docs, labels, matrix + + +def _bl(val: int | float) -> str: + """Format a baseline value, returning '—' only when truly zero/missing.""" + if val is None: + return "—" + return str(val) + + +def _fmt_int_delta(current: int, baseline: int) -> str: + if baseline is None: + return "—" + delta = current - baseline + if delta == 0: + return "=" + return f"{delta:+d}" + + +def _fmt_time_delta(current_s: float, baseline_s: float) -> str: + if not baseline_s: + return "—" + delta_m = (current_s - baseline_s) / 60 + return f"{delta_m:+.1f}m" + + +def _fmt_token_delta_html(current: int, baseline: int) -> str: + if not baseline: + return "—" + delta = current - baseline + return _fmt_signed_number(delta) + + +def _fmt_signed_number(n: int) -> str: + sign = "+" if n >= 0 else "" + abs_n = abs(n) + if abs_n >= 1_000_000: + return f"{sign}{n / 1_000_000:.2f}M" + if abs_n >= 1_000: + return f"{sign}{n / 1_000:.1f}K" + return f"{sign}{n}" diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_md.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_md.py new file mode 100644 index 00000000..2561f060 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_md.py @@ -0,0 +1,649 @@ +"""Markdown trend report renderer.""" + +from __future__ import annotations + +from .collector import compute_deltas +from .models import TrendData +from 
.sparkline import ( + format_delta, + format_number, + format_pct, + format_seconds_as_minutes, + sparkline, + trend_arrow, +) + + +def render_trend_markdown(trend: TrendData) -> str: + """Render the full trend report as Markdown.""" + sections = [ + _render_header(trend), + _render_toc(), + _render_section_a(trend), + _render_section_b(trend), + _render_section_c(trend), + _render_section_d(trend), + _render_section_e(trend), + _render_section_f(trend), + _render_section_g(trend), + _render_section_h(trend), + ] + return "\n".join(sections) + "\n" + + +# --------------------------------------------------------------------------- +# Header & TOC +# --------------------------------------------------------------------------- + + +def _render_header(trend: TrendData) -> str: + n = len(trend.runs) + first = trend.runs[0].label if trend.runs else "—" + last = trend.runs[-1].label if trend.runs else "—" + return ( + f"# AIDLC Rules Trend Report\n\n" + f"> **{n} releases** compared ({first} through {last}) \n" + f"> **Repository:** `{trend.repo}` \n" + f"> **Generated:** {trend.generated_at}\n" + ) + + +def _render_toc() -> str: + return ( + "## Contents\n\n" + "- [A. Executive Summary](#a-executive-summary)\n" + "- [B. Functional Correctness](#b-functional-correctness)\n" + "- [C. Qualitative Evaluation](#c-qualitative-evaluation)\n" + "- [D. Efficiency & Cost](#d-efficiency--cost-metrics)\n" + "- [E. Code Quality](#e-code-quality)\n" + "- [F. Stability & Reliability](#f-stability--reliability)\n" + "- [G. Version-over-Version Deltas](#g-version-over-version-deltas)\n" + "- [H. 
Pre-Release Data Points](#h-pre-release-data-points)\n" + ) + + +# --------------------------------------------------------------------------- +# Section A — Executive Summary +# --------------------------------------------------------------------------- + + +def _render_section_a(trend: TrendData) -> str: + runs = trend.runs + bl = trend.baseline + latest = runs[-1] if runs else None + if latest is None: + return "---\n\n## A. Executive Summary\n\nNo data available.\n" + + def _spark(extractor): + vals = [extractor(r) for r in runs] + return f"`{sparkline(vals)}` {trend_arrow(vals)}" + + def _bl_str(val, fmt=str): + return fmt(val) if val else "—" + + rows = [ + [ + "Unit tests passed", + _bl_str(bl.unit_tests_passed), + str(latest.unit_tests.passed), + _fmt_vs(latest.unit_tests.passed, bl.unit_tests_passed), + _spark(lambda r: r.unit_tests.passed), + ], + [ + "Contract tests", + f"{bl.contract_tests_passed}/{bl.contract_tests_total}" + if bl.contract_tests_total + else "—", + f"{latest.contract_tests.passed}/{latest.contract_tests.total}", + _fmt_vs(latest.contract_tests.passed, bl.contract_tests_passed), + _spark(lambda r: r.contract_tests.passed), + ], + [ + "Lint findings", + str(bl.lint_findings), + str(latest.code_quality.lint_findings), + _fmt_vs(latest.code_quality.lint_findings, bl.lint_findings, lower_is_better=True), + _spark(lambda r: r.code_quality.lint_findings), + ], + [ + "Qualitative score", + f"{bl.qualitative_overall:.3f}" if bl.qualitative_overall else "—", + f"{latest.qualitative.overall_score:.3f}", + f"{latest.qualitative.overall_score - bl.qualitative_overall:+.3f}" + if bl.qualitative_overall + else "—", + _spark(lambda r: r.qualitative.overall_score), + ], + [ + "Execution time", + format_seconds_as_minutes(bl.execution_time_seconds) + if bl.execution_time_seconds + else "—", + format_seconds_as_minutes(latest.metrics.execution_time_seconds), + _fmt_time_vs(latest.metrics.execution_time_seconds, bl.execution_time_seconds), + _spark(lambda 
r: r.metrics.execution_time_seconds), + ], + [ + "Total tokens", + format_number(bl.total_tokens) if bl.total_tokens else "—", + format_number(latest.metrics.total_tokens), + _fmt_token_vs(latest.metrics.total_tokens, bl.total_tokens), + _spark(lambda r: r.metrics.total_tokens), + ], + ] + + return ( + "---\n\n" + f"## A. Executive Summary\n\n" + f"Latest release: **{latest.label}**\n\n" + "High-level snapshot comparing the latest release against the golden baseline " + "(the reference evaluation used as the quality target).\n\n" + "| Metric | What it measures |\n" + "| --- | --- |\n" + "| **Unit tests passed** | Number of generated unit tests that pass. Higher means the rules produce broader, more complete test suites. |\n" + "| **Contract tests** | API compliance checks against the OpenAPI spec (passed/total). 88/88 = full compliance. |\n" + "| **Lint findings** | Static analysis warnings in generated code. Lower is better — 0 means clean code. |\n" + "| **Qualitative score** | AI-graded quality of generated documentation on a 0–1 scale (higher is better). |\n" + "| **Execution time** | Wall-clock time for the full evaluation run. Lower means faster generation. |\n" + "| **Total tokens** | Total LLM tokens consumed (input + output). Lower means more cost-efficient. |\n\n" + + _md_table( + ["Metric", "Golden", f"Latest ({latest.label})", "vs Golden", "Trend"], + rows, + ) + ) + + +# --------------------------------------------------------------------------- +# Section B — Functional Correctness +# --------------------------------------------------------------------------- + + +def _render_section_b(trend: TrendData) -> str: + parts = ["---\n\n## B. Functional Correctness\n"] + + parts.append( + "Measures whether the code generated by each rules version actually works correctly. 
" + "This is the most fundamental quality gate — code that doesn't pass its own tests is broken.\n" + ) + + # B.1 Unit tests + parts.append("### B.1 Unit Tests\n") + parts.append( + "Unit tests validate individual functions and components in isolation. " + "The AIDLC rules instruct the AI to generate both source code and test suites. " + "**Passed** = tests that ran and succeeded. " + "**Failed** = tests that ran but produced wrong results. " + "**Total** = passed + failed + errors + skipped. " + "All versions currently show 0 failures — the variance is in how many " + "tests the rules produce, which reflects test suite breadth and coverage.\n\n" + ) + rows = [ + [r.label, str(r.unit_tests.passed), str(r.unit_tests.failed), str(r.unit_tests.total)] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Passed", "Failed", "Total"], rows)) + + # B.2 Contract tests + parts.append("\n### B.2 Contract Tests (API Compliance)\n") + parts.append( + "Contract tests verify that the generated API implementation matches its OpenAPI specification. " + "Each test sends a request to an endpoint and checks that the HTTP status code and response " + "shape match the spec. 88 endpoints are tested per version. " + "**Pass/Total** = endpoints that returned the expected status code. " + "**Rate** = pass percentage (100% = full spec compliance). 
" + "**Failures** lists the specific endpoints that deviated from the spec.\n\n" + ) + rows = [ + [ + r.label, + f"{r.contract_tests.passed}/{r.contract_tests.total}", + format_pct(r.contract_tests.pass_rate), + str(r.contract_tests.failed), + ] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Pass/Total", "Rate", "Failures"], rows)) + + for r in trend.runs: + if r.contract_tests.failures: + parts.append(f"\n> **{r.label} failures:**\n") + for f in r.contract_tests.failures: + parts.append( + f"> - `{f.method} {f.endpoint}` — expected {f.expected_status}, " + f"got {f.actual_status} ({f.description})\n" + ) + + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Section C — Qualitative Evaluation +# --------------------------------------------------------------------------- + + +def _render_section_c(trend: TrendData) -> str: + parts = ["---\n\n## C. Qualitative Evaluation\n"] + + parts.append( + "Measures the quality of generated documentation by comparing it against " + "human-authored reference documents. An AI evaluator scores each document on " + "completeness, accuracy, and clarity, producing a 0–1 score (1.0 = perfect match " + "to reference quality).\n" + ) + + # C.1 Overall + parts.append("### C.1 Overall Score\n") + parts.append( + "The weighted average across all evaluated documents. " + "This is the single best indicator of how well the rules produce documentation. 
" + "Scores above 0.90 are considered strong; below 0.70 signals significant gaps.\n\n" + ) + bl_score = trend.baseline.qualitative_overall + if bl_score: + parts.append(f"Golden baseline: **{bl_score:.3f}**\n\n") + rows = [ + [ + r.label, + f"{r.qualitative.overall_score:.3f}", + f"{r.qualitative.overall_score - bl_score:+.3f}" if bl_score else "—", + ] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Overall", "vs Golden"], rows)) + + # C.2 Phase breakdown + parts.append("\n### C.2 Phase Breakdown\n") + parts.append( + "Documents are grouped by SDLC phase. " + "**Inception** covers early-stage design artifacts (requirements, architecture plans, " + "component designs) — these are generated first and set the foundation. " + "**Construction** covers build-time artifacts (build instructions, test instructions, " + "build-and-test summaries) — these depend on inception outputs being correct. " + "A drop in inception quality often cascades into construction.\n\n" + ) + rows = [ + [r.label, f"{r.qualitative.inception_score:.3f}", f"{r.qualitative.construction_score:.3f}"] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Inception", "Construction"], rows)) + + # C.3 Per-document heatmap + parts.append("\n### C.3 Per-Document Heatmap\n") + parts.append( + "Individual quality scores for each generated document across all versions. " + "This reveals which specific documents are consistently strong, improving, or " + "problematic. 
Documents scoring below 0.70 (bold/red) are the top candidates for " + "rules improvements.\n\n" + ) + all_docs, labels, matrix = _build_heatmap_matrix(trend) + header = ["Document"] + labels + rows = [] + for i, doc in enumerate(all_docs): + row = [f"`{doc}`"] + for score in matrix[i]: + if score < 0: + row.append("—") + elif score >= 0.90: + row.append(f"{score:.2f}") + elif score >= 0.70: + row.append(f"*{score:.2f}*") + else: + row.append(f"**{score:.2f}**") + rows.append(row) + parts.append(_md_table(header, rows)) + parts.append( + "\n> **Legend:** plain = green (>= 0.90) · *italic* = yellow (0.70–0.89) · **bold** = red (< 0.70)\n" + ) + + # C.4 Document coverage + parts.append("\n### C.4 Document Coverage\n") + parts.append( + "Tracks whether the generated output includes the same set of documents as the reference. " + "**Unmatched Ref** = reference documents the AI failed to generate (missing output). " + "**Unmatched Candidate** = extra documents the AI generated that don't exist in the reference " + "(unexpected output). Ideally both columns are 0, meaning the AI produced exactly the expected " + "set of documents.\n\n" + ) + rows = [ + [ + r.label, + str(len(r.qualitative.unmatched_reference_docs)), + str(len(r.qualitative.unmatched_candidate_docs)), + ] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Unmatched Ref", "Unmatched Candidate"], rows)) + + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Section D — Efficiency & Cost +# --------------------------------------------------------------------------- + + +def _render_section_d(trend: TrendData) -> str: + parts = ["---\n\n## D. Efficiency & Cost Metrics\n"] + parts.append( + "Tracks the computational resources consumed by each evaluation run. " + "These metrics directly affect cost (tokens) and developer wait time (execution time). 
" + "Lower values are generally better, as long as quality metrics remain stable.\n" + ) + + # D.1 Token consumption + parts.append("### D.1 Token Consumption\n") + parts.append( + "Total LLM tokens consumed during the run, broken down by agent. " + "**Total** = all tokens across all agents (input + output). " + "**Executor** = the agent that generates code and documents. " + "**Simulator** = the agent that simulates user interactions for testing. " + "Token count is the primary cost driver — each token represents a unit of LLM usage billed by the provider.\n\n" + ) + rows = [] + for r in trend.runs: + agent_cols = {a.agent_name: format_number(a.total_tokens) for a in r.metrics.agent_tokens} + rows.append( + [ + r.label, + format_number(r.metrics.total_tokens), + agent_cols.get("executor", "—"), + agent_cols.get("simulator", "—"), + ] + ) + parts.append(_md_table(["Version", "Total", "Executor", "Simulator"], rows)) + + # D.2 Execution time + parts.append("\n### D.2 Execution Time\n") + parts.append( + "Wall-clock duration of the full evaluation pipeline, broken down by handoff. " + "Each **handoff** (H1, H2, H3) represents a sequential phase of the pipeline: " + "H1 is typically code generation (the longest phase), H2 is build/test execution, " + "and H3 is result collection and reporting. " + "**Wall Clock** is the total end-to-end time.\n\n" + ) + rows = [] + for r in trend.runs: + handoff_strs = [ + f"H{h.handoff_number}: {format_seconds_as_minutes(h.duration_seconds)}" + for h in r.metrics.handoffs + ] + rows.append( + [ + r.label, + format_seconds_as_minutes(r.metrics.execution_time_seconds), + " · ".join(handoff_strs) if handoff_strs else "—", + ] + ) + parts.append(_md_table(["Version", "Wall Clock", "Handoff Breakdown"], rows)) + + # D.3 Context window + parts.append("\n### D.3 Context Window Pressure\n") + parts.append( + "Measures how much of the LLM's context window is being used across API calls. 
" + "**Max** = the largest single context seen during the run (approaching the model's limit " + "risks truncation or degraded output). " + "**Avg** = the mean context size across all API calls. " + "**Median** = the midpoint context size (less affected by outliers than avg). " + "High context pressure can indicate overly verbose prompts or accumulated conversation history.\n\n" + ) + rows = [ + [ + r.label, + format_number(r.metrics.max_context_tokens), + format_number(r.metrics.avg_context_tokens), + format_number(r.metrics.median_context_tokens), + ] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Max", "Avg", "Median"], rows)) + + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Section E — Code Quality +# --------------------------------------------------------------------------- + + +def _render_section_e(trend: TrendData) -> str: + parts = ["---\n\n## E. Code Quality\n"] + parts.append( + "Static analysis of the generated codebase. These metrics reflect the cleanliness and " + "maintainability of the AI-generated code, independent of whether it passes tests.\n\n" + "| Metric | What it measures |\n" + "| --- | --- |\n" + "| **Lint Findings** | Warnings from static analysis (style violations, unused variables, etc.). 0 = clean. |\n" + "| **Security Findings** | Vulnerabilities detected by security scanners (SQL injection, XSS, etc.). N/A if no scanner was configured. |\n" + "| **Source Files** | Number of non-test source files in the generated project. |\n" + "| **LOC** | Total lines of code across all source files. Large swings may indicate generated boilerplate or missing modules. 
|\n\n" + ) + rows = [ + [ + r.label, + str(r.code_quality.lint_findings), + str(r.code_quality.security_findings) + if r.code_quality.security_scanner_available + else "N/A", + str(r.code_quality.source_file_count), + format_number(r.code_quality.total_lines_of_code), + ] + for r in trend.runs + ] + parts.append( + _md_table( + ["Version", "Lint Findings", "Security Findings", "Source Files", "LOC"], + rows, + ) + ) + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Section F — Stability & Reliability +# --------------------------------------------------------------------------- + + +def _render_section_f(trend: TrendData) -> str: + parts = ["---\n\n## F. Stability & Reliability\n"] + parts.append( + "Tracks whether the evaluation pipeline itself ran smoothly, independent of output quality.\n\n" + "| Metric | What it measures |\n" + "| --- | --- |\n" + "| **Error Events** | Runtime errors logged during the run (exceptions, timeouts, API failures). 0 = clean run. |\n" + "| **Handoffs** | Number of sequential pipeline phases completed. Typically 3 (generate, build/test, report). A different count may indicate an early abort or retry. |\n" + "| **Server Startup** | Whether the generated application server started successfully. A failure here means the generated code couldn't even boot, preventing contract tests from running. |\n\n" + ) + rows = [ + [ + r.label, + str(r.metrics.error_count), + str(r.metrics.num_handoffs), + "Yes" if r.metrics.server_startup_success else "**No**", + ] + for r in trend.runs + ] + parts.append(_md_table(["Version", "Error Events", "Handoffs", "Server Startup"], rows)) + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Section G — Version-over-Version Deltas +# --------------------------------------------------------------------------- + + +def _render_section_g(trend: TrendData) -> str: + parts = ["---\n\n## G. 
Version-over-Version Deltas\n"] + deltas = compute_deltas(trend.runs) + if not deltas: + parts.append("Not enough data points to compute deltas.\n") + return "\n".join(parts) + + parts.append( + "Each row shows the change from one release to the next, making it easy to spot " + "which specific version introduced an improvement or regression. " + "Positive values (+) indicate an increase; negative (-) indicate a decrease. " + "For **Unit Tests** and **Contract**, positive is better (more tests passing). " + "For **Qualitative**, positive is better (higher quality score). " + "For **Tokens** and **Time**, negative is better (more efficient).\n\n" + ) + + rows = [ + [ + f"{d.from_label} -> {d.to_label}", + format_delta(d.unit_tests_delta), + format_delta(d.contract_tests_delta), + format_delta(d.qualitative_delta, precision=3), + format_delta(d.token_delta) + if abs(d.token_delta) < 1000 + else _fmt_token_delta(d.token_delta), + f"{format_delta(d.time_delta_seconds, precision=0)}s", + ] + for d in deltas + ] + parts.append( + _md_table( + ["Transition", "Unit Tests", "Contract", "Qualitative", "Tokens", "Time"], + rows, + ) + ) + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Section H — Pre-Release Data Points +# --------------------------------------------------------------------------- + + +def _render_section_h(trend: TrendData) -> str: + from .models import RunType + + pre_release = [r for r in trend.runs if r.run_type in (RunType.MAIN, RunType.PR)] + + parts = ["---\n\n## H. Pre-Release Data Points\n"] + parts.append( + "Evaluation results from non-release sources — the `main` branch and open pull requests. " + "These represent in-progress work that hasn't been tagged as a release yet. " + "Use this data to preview whether upcoming changes will improve or regress metrics " + "before they ship.\n" + ) + + if not pre_release: + parts.append( + "\nNo pre-release data available. 
Data from `main` and " + "pull request evaluations will appear here when available.\n" + ) + return "\n".join(parts) + + rows = [ + [ + r.label, + str(r.unit_tests.passed), + f"{r.contract_tests.passed}/{r.contract_tests.total}", + f"{r.qualitative.overall_score:.3f}", + format_number(r.metrics.total_tokens), + ] + for r in pre_release + ] + parts.append( + _md_table( + ["Source", "Unit Tests", "Contract", "Qualitative", "Tokens"], + rows, + ) + ) + return "\n".join(parts) + + +# --------------------------------------------------------------------------- +# Utilities +# --------------------------------------------------------------------------- + + +def _md_table(headers: list[str], rows: list[list[str]]) -> str: + """Render a Markdown table with right-aligned numeric columns.""" + if not rows: + return "" + + # Compute column widths for alignment + widths = [len(h) for h in headers] + for row in rows: + for i, cell in enumerate(row): + if i < len(widths): + widths[i] = max(widths[i], len(cell)) + + # Build header + header_line = "| " + " | ".join(h.ljust(widths[i]) for i, h in enumerate(headers)) + " |" + sep_line = "| " + " | ".join("-" * widths[i] for i in range(len(headers))) + " |" + + lines = [header_line, sep_line] + for row in rows: + cells = [] + for i, cell in enumerate(row): + w = widths[i] if i < len(widths) else len(cell) + cells.append(cell.ljust(w)) + lines.append("| " + " | ".join(cells) + " |") + + return "\n".join(lines) + "\n" + + +def _build_heatmap_matrix( + trend: TrendData, +) -> tuple[list[str], list[str], list[list[float]]]: + """Build a document x version score matrix for the heatmap.""" + all_docs = sorted( + {ds.document_name for run in trend.runs for ds in run.qualitative.document_scores} + ) + labels = [r.label for r in trend.runs] + + matrix: list[list[float]] = [] + for doc in all_docs: + row: list[float] = [] + for run in trend.runs: + score = next( + ( + ds.overall_score + for ds in run.qualitative.document_scores + if 
ds.document_name == doc + ), + -1.0, + ) + row.append(score) + matrix.append(row) + + return all_docs, labels, matrix + + +def _fmt_vs(current: int, baseline: int, lower_is_better: bool = False) -> str: + """Format a current vs baseline comparison.""" + if not baseline: + return "—" + delta = current - baseline + if delta == 0: + return "=" + display_delta = -delta if lower_is_better else delta + return format_delta(display_delta) + + +def _fmt_time_vs(current_s: float, baseline_s: float) -> str: + if not baseline_s: + return "—" + delta_s = current_s - baseline_s + delta_m = delta_s / 60 + return f"{delta_m:+.1f}m" + + +def _fmt_token_vs(current: int, baseline: int) -> str: + if not baseline: + return "—" + delta = current - baseline + return _fmt_token_delta(delta) + + +def _fmt_token_delta(delta: int) -> str: + """Format a token delta with sign and human-readable units.""" + sign = "+" if delta >= 0 else "" + abs_d = abs(delta) + if abs_d >= 1_000_000: + return f"{sign}{delta / 1_000_000:.2f}M" + if abs_d >= 1_000: + return f"{sign}{delta / 1_000:.1f}K" + return f"{sign}{delta}" diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_yaml.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_yaml.py new file mode 100644 index 00000000..adba48a7 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/render_yaml.py @@ -0,0 +1,33 @@ +"""YAML data export for trend data (machine-readable output for CI gates).""" + +from __future__ import annotations + +import dataclasses + +import yaml + +from .models import RunType, SemVer, TrendData + + +def render_trend_yaml(trend: TrendData) -> str: + """Serialize TrendData to a YAML string.""" + data = _serialize(trend) + return yaml.dump(data, default_flow_style=False, sort_keys=False, allow_unicode=True) + + +def _serialize(obj: object) -> object: + """Recursively convert dataclasses, enums, and custom types to plain dicts.""" + if 
isinstance(obj, SemVer): + return str(obj) + if isinstance(obj, RunType): + return obj.value + if dataclasses.is_dataclass(obj) and not isinstance(obj, type): + return { + f.name: _serialize(getattr(obj, f.name)) + for f in dataclasses.fields(obj) + } + if isinstance(obj, list): + return [_serialize(item) for item in obj] + if isinstance(obj, dict): + return {k: _serialize(v) for k, v in obj.items()} + return obj diff --git a/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/sparkline.py b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/sparkline.py new file mode 100644 index 00000000..b0b3817e --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/src/trend_reports/sparkline.py @@ -0,0 +1,103 @@ +"""ASCII sparkline and formatting utilities.""" + +from __future__ import annotations + +SPARK_CHARS = "▁▂▃▄▅▆▇█" + + +def sparkline(values: list[float | int]) -> str: + """Generate an ASCII sparkline from numeric values. + + >>> sparkline([1, 5, 3, 7, 2]) + '▁▆▃█▂' + """ + if not values: + return "" + lo = min(values) + hi = max(values) + if hi == lo: + mid = len(SPARK_CHARS) // 2 + return SPARK_CHARS[mid] * len(values) + span = hi - lo + return "".join( + SPARK_CHARS[min(int((v - lo) / span * (len(SPARK_CHARS) - 1)), len(SPARK_CHARS) - 1)] + for v in values + ) + + +def trend_arrow(values: list[float | int]) -> str: + """Return a directional indicator based on first-to-last change. + + Returns one of: ↑ (up >5%), ↗ (up 1-5%), → (flat <1%), + ↘ (down 1-5%), ↓ (down >5%). + """ + if len(values) < 2: + return "→" + first, last = values[0], values[-1] + if first == 0: + return "↑" if last > 0 else "→" + pct = (last - first) / abs(first) + if pct > 0.05: + return "↑" + if pct > 0.01: + return "↗" + if pct < -0.05: + return "↓" + if pct < -0.01: + return "↘" + return "→" + + +def format_number(n: float | int) -> str: + """Human-readable number formatting. 
+ + >>> format_number(9_260_000) + '9.26M' + >>> format_number(1446.0) + '1446.0' + >>> format_number(0.891) + '0.891' + """ + if isinstance(n, float) and n != int(n) and abs(n) < 1000: + return f"{n:.3f}" + abs_n = abs(n) + if abs_n >= 1_000_000: + return f"{n / 1_000_000:.2f}M" + if abs_n >= 1_000: + return f"{n / 1_000:.1f}K" + if isinstance(n, int): + return str(n) + return f"{n:.1f}" + + +def format_seconds_as_minutes(seconds: float) -> str: + """Format seconds as a minutes string. + + >>> format_seconds_as_minutes(1074.0) + '17.9m' + """ + return f"{seconds / 60:.1f}m" + + +def format_delta(delta: float | int, precision: int = 1) -> str: + """Format a delta value with sign prefix. + + >>> format_delta(56) + '+56' + >>> format_delta(-3) + '-3' + >>> format_delta(0.028, precision=3) + '+0.028' + """ + if isinstance(delta, int): + return f"{delta:+d}" + return f"{delta:+.{precision}f}" + + +def format_pct(value: float) -> str: + """Format a 0-1 ratio as a percentage string. + + >>> format_pct(0.965) + '96.5%' + """ + return f"{value * 100:.1f}%" diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/conftest.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/conftest.py new file mode 100644 index 00000000..651b65dd --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/conftest.py @@ -0,0 +1,78 @@ +"""Shared test fixtures and factory functions for trend-reports tests.""" + +from __future__ import annotations + +from trend_reports.models import ( + BaselineMetrics, + CodeQualityMetrics, + ContractTestResults, + DocumentScore, + QualitativeComparison, + RunConfig, + RunData, + RunMeta, + RunMetrics, + RunType, + SemVer, + TrendData, + UnitTestResults, +) + + +def make_run( + label: str = "v0.1.0", + run_type: RunType = RunType.RELEASE, + semver: SemVer | None = None, + pr_number: int | None = None, + passed: int = 100, + failed: int = 0, + qualitative_score: float = 0.9, + total_tokens: int = 1_000_000, + time_seconds: float = 
600.0, + contract_passed: int = 88, + contract_total: int = 88, + document_scores: list[DocumentScore] | None = None, + inception_score: float = 0.0, + construction_score: float = 0.0, +) -> RunData: + """Create a RunData instance for testing.""" + if semver is None and run_type == RunType.RELEASE: + try: + semver = SemVer.parse(label) + except ValueError: + pass + return RunData( + label=label, + run_type=run_type, + semver=semver, + pr_number=pr_number, + meta=RunMeta(run_id="test", config=RunConfig(rules_ref=label)), + metrics=RunMetrics( + total_tokens=total_tokens, + execution_time_seconds=time_seconds, + ), + unit_tests=UnitTestResults(passed=passed, failed=failed, total=passed + failed), + contract_tests=ContractTestResults( + total=contract_total, + passed=contract_passed, + failed=contract_total - contract_passed, + pass_rate=contract_passed / contract_total if contract_total else 0.0, + ), + code_quality=CodeQualityMetrics(), + qualitative=QualitativeComparison( + overall_score=qualitative_score, + inception_score=inception_score, + construction_score=construction_score, + document_scores=document_scores or [], + ), + ) + + +def make_trend(*runs: RunData, baseline: BaselineMetrics | None = None) -> TrendData: + """Create a TrendData instance for testing.""" + return TrendData( + runs=list(runs), + baseline=baseline or BaselineMetrics(), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_collector.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_collector.py new file mode 100644 index 00000000..03385a09 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_collector.py @@ -0,0 +1,581 @@ +"""Tests for zip extraction, YAML parsing, run classification, and trend assembly. + +Tests use tmp_path and real YAML files to avoid excessive mocking. 
+""" + +from __future__ import annotations + +import zipfile +from pathlib import Path + +import pytest +import yaml +from conftest import make_run +from trend_reports.collector import ( + classify_run, + collect_from_directory, + collect_from_zip, + collect_trend_data, + compute_deltas, + extract_zip, + find_yaml_files, + load_baseline, + parse_contract_tests, + parse_qualitative, + parse_quality_report, + parse_run_meta, + parse_run_metrics, + parse_test_results, + sort_runs, +) +from trend_reports.models import ( + CollectorError, + RunType, + SemVer, +) + + +def _write_yaml(path: Path, data: dict) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + yaml.safe_dump(data, f, default_flow_style=False) + + +def _make_report_zip(tmp_path: Path, yaml_files: dict[str, dict]) -> Path: + """Create a report zip with YAML file contents.""" + zip_path = tmp_path / "report.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + for filename, data in yaml_files.items(): + zf.writestr(filename, yaml.safe_dump(data)) + return zip_path + + +# --------------------------------------------------------------------------- +# Zip handling +# --------------------------------------------------------------------------- + + +class TestExtractZip: + def test_normal_extraction(self, tmp_path): + zip_path = tmp_path / "test.zip" + with zipfile.ZipFile(zip_path, "w") as zf: + zf.writestr("hello.txt", "world") + + result = extract_zip(zip_path, tmp_path) + assert result.exists() + assert (result / "hello.txt").read_text() == "world" + + def test_corrupt_zip_raises(self, tmp_path): + bad_zip = tmp_path / "bad.zip" + bad_zip.write_bytes(b"not a zip") + with pytest.raises(CollectorError, match="Corrupt zip"): + extract_zip(bad_zip, tmp_path) + + +class TestFindYamlFiles: + def test_all_present(self, tmp_path): + for name in [ + "run-meta.yaml", + "run-metrics.yaml", + "test-results.yaml", + "contract-test-results.yaml", + "quality-report.yaml", + 
"qualitative-comparison.yaml", + ]: + (tmp_path / name).write_text("key: value") + result = find_yaml_files(tmp_path) + assert len(result) == 6 + + def test_none_present(self, tmp_path): + result = find_yaml_files(tmp_path) + assert len(result) == 0 + + def test_partial(self, tmp_path): + (tmp_path / "run-meta.yaml").write_text("key: value") + result = find_yaml_files(tmp_path) + assert len(result) == 1 + assert "run-meta" in result + + +# --------------------------------------------------------------------------- +# YAML parsers +# --------------------------------------------------------------------------- + + +class TestParseRunMeta: + def test_normal(self, tmp_path): + path = tmp_path / "run-meta.yaml" + _write_yaml( + path, + { + "run_folder": "run-001", + "config": {"rules_ref": "v0.1.5", "executor_model": "claude-3"}, + "vision_file": "test_cases/sci-calc/vision.md", + "started_at": "2026-01-01T00:00:00Z", + "completed_at": "2026-01-01T01:00:00Z", + "status": "completed", + }, + ) + meta = parse_run_meta(path) + assert meta.run_id == "run-001" + assert meta.config.rules_ref == "v0.1.5" + assert meta.config.model == "claude-3" + assert meta.config.target_project == "sci-calc" + + def test_missing_config(self, tmp_path): + path = tmp_path / "run-meta.yaml" + _write_yaml(path, {"run_folder": "run-002"}) + meta = parse_run_meta(path) + assert meta.config.rules_ref == "" + assert meta.config.model == "" + + +class TestParseRunMetrics: + def test_normal(self, tmp_path): + path = tmp_path / "run-metrics.yaml" + _write_yaml( + path, + { + "tokens": { + "total": {"total_tokens": 9000000, "input_tokens": 5000000}, + "per_agent": { + "executor": {"total_tokens": 8000000, "input_tokens": 4000000}, + }, + }, + "timing": { + "total_wall_clock_ms": 600000, + "handoffs": [ + {"handoff": 1, "node_id": "executor", "duration_ms": 300000}, + ], + }, + "errors": {"throttle_events": 0, "timeout_events": 0}, + "context_size": {"total": {"max_tokens": 100000}}, + }, + ) + metrics = 
parse_run_metrics(path) + assert metrics.total_tokens == 9000000 + assert metrics.execution_time_seconds == 600.0 + assert len(metrics.agent_tokens) == 1 + assert len(metrics.handoffs) == 1 + assert metrics.max_context_tokens == 100000 + + def test_empty(self, tmp_path): + path = tmp_path / "run-metrics.yaml" + _write_yaml(path, {}) + metrics = parse_run_metrics(path) + assert metrics.total_tokens == 0 + assert metrics.execution_time_seconds == 0.0 + + +class TestParseTestResults: + def test_normal(self, tmp_path): + path = tmp_path / "test-results.yaml" + _write_yaml( + path, + { + "test": {"parsed_results": {"passed": 175, "failed": 0, "total": 175}}, + }, + ) + result = parse_test_results(path) + assert result.passed == 175 + assert result.failed == 0 + assert result.total == 175 + + def test_none_values(self, tmp_path): + path = tmp_path / "test-results.yaml" + _write_yaml( + path, + { + "test": {"parsed_results": {"passed": None, "failed": None}}, + }, + ) + result = parse_test_results(path) + assert result.passed == 0 + assert result.failed == 0 + + +class TestParseContractTests: + def test_normal(self, tmp_path): + path = tmp_path / "contract-test-results.yaml" + _write_yaml( + path, + { + "total": 88, + "passed": 85, + "failed": 3, + "cases": [ + {"path": "/api/calc", "method": "GET", "passed": True}, + { + "path": "/api/err", + "method": "POST", + "passed": False, + "expected_status": 400, + "actual_status": 200, + }, + ], + }, + ) + result = parse_contract_tests(path) + assert result.total == 88 + assert result.passed == 85 + assert len(result.failures) == 1 + assert result.failures[0].endpoint == "/api/err" + + def test_zero_total(self, tmp_path): + path = tmp_path / "contract-test-results.yaml" + _write_yaml(path, {"total": 0, "passed": 0, "failed": 0}) + result = parse_contract_tests(path) + assert result.pass_rate == 0.0 + + +class TestParseQualityReport: + def test_with_security(self, tmp_path): + path = tmp_path / "quality-report.yaml" + 
_write_yaml( + path, + { + "lint": {"findings": [{"file": "a.py"}]}, + "security": {"available": True, "findings": [{"issue": "x"}]}, + "summary": {"lint_total": 1}, + }, + ) + result = parse_quality_report(path) + assert result.lint_findings == 1 + assert result.security_findings == 1 + assert result.security_scanner_available is True + + def test_without_security(self, tmp_path): + path = tmp_path / "quality-report.yaml" + _write_yaml(path, {"lint": {}, "summary": {}}) + result = parse_quality_report(path) + assert result.security_findings == -1 + assert result.security_scanner_available is False + + +class TestParseQualitative: + def test_normal(self, tmp_path): + path = tmp_path / "qualitative-comparison.yaml" + _write_yaml( + path, + { + "overall_score": 0.898, + "phases": [ + { + "phase": "inception", + "avg_overall": 0.87, + "documents": [ + {"path": "docs/requirements.md", "overall": 0.95}, + ], + }, + { + "phase": "construction", + "avg_overall": 0.92, + "documents": [ + {"path": "docs/build-instructions.md", "overall": 0.90}, + ], + }, + ], + }, + ) + result = parse_qualitative(path) + assert result.overall_score == 0.898 + assert result.inception_score == 0.87 + assert result.construction_score == 0.92 + assert len(result.document_scores) == 2 + + def test_empty_phases(self, tmp_path): + path = tmp_path / "qualitative-comparison.yaml" + _write_yaml(path, {"overall_score": 0.5, "phases": []}) + result = parse_qualitative(path) + assert result.inception_score == 0.0 + assert result.construction_score == 0.0 + assert result.document_scores == [] + + +# --------------------------------------------------------------------------- +# Run classification +# --------------------------------------------------------------------------- + + +class TestClassifyRun: + def test_release(self): + run_type, label, semver, pr = classify_run("v0.1.5") + assert run_type == RunType.RELEASE + assert label == "v0.1.5" + assert semver == SemVer(0, 1, 5) + assert pr is None + + def 
test_main(self): + run_type, label, semver, pr = classify_run("main") + assert run_type == RunType.MAIN + assert label == "main" + assert semver is None + + def test_pr(self): + run_type, label, semver, pr = classify_run("pr-42") + assert run_type == RunType.PR + assert label == "PR #42" + assert pr == 42 + + def test_unknown_format(self): + run_type, label, semver, pr = classify_run("some-branch") + assert run_type == RunType.RELEASE + assert label == "some-branch" + assert semver is None + + +# --------------------------------------------------------------------------- +# Sorting and deltas +# --------------------------------------------------------------------------- + + +class TestSortRuns: + def test_releases_sorted_by_semver(self): + runs = [ + make_run("v0.1.2"), + make_run("v0.1.0"), + make_run("v0.1.1"), + ] + sorted_runs = sort_runs(runs) + assert [r.label for r in sorted_runs] == ["v0.1.0", "v0.1.1", "v0.1.2"] + + def test_main_after_releases(self): + runs = [ + make_run("main", run_type=RunType.MAIN, semver=None), + make_run("v0.1.0"), + ] + sorted_runs = sort_runs(runs) + assert sorted_runs[0].label == "v0.1.0" + assert sorted_runs[1].label == "main" + + def test_pr_after_main(self): + runs = [ + make_run("PR #42", run_type=RunType.PR, semver=None, pr_number=42), + make_run("main", run_type=RunType.MAIN, semver=None), + make_run("v0.1.0"), + ] + sorted_runs = sort_runs(runs) + assert [r.label for r in sorted_runs] == ["v0.1.0", "main", "PR #42"] + + def test_empty_list(self): + assert sort_runs([]) == [] + + +class TestComputeDeltas: + def test_two_runs(self): + runs = [ + make_run("v0.1.0", passed=100, qualitative_score=0.85, total_tokens=1000000), + make_run("v0.1.1", passed=120, qualitative_score=0.90, total_tokens=1200000), + ] + deltas = compute_deltas(runs) + assert len(deltas) == 1 + assert deltas[0].from_label == "v0.1.0" + assert deltas[0].to_label == "v0.1.1" + assert deltas[0].unit_tests_delta == 20 + assert abs(deltas[0].qualitative_delta - 
0.05) < 0.001 + assert deltas[0].token_delta == 200000 + + def test_empty_list(self): + assert compute_deltas([]) == [] + + def test_single_run(self): + assert compute_deltas([make_run("v0.1.0")]) == [] + + +# --------------------------------------------------------------------------- +# Baseline loading +# --------------------------------------------------------------------------- + + +class TestLoadBaseline: + def test_file_exists(self, tmp_path): + path = tmp_path / "golden.yaml" + _write_yaml( + path, + { + "execution": {"wall_clock_ms": 1200000, "total_tokens": 9000000}, + "unit_tests": {"passed": 192, "total": 192}, + "contract_tests": {"passed": 88, "total": 88}, + "code_quality": {"lint_total": 18}, + "qualitative": { + "overall_score": 0.891, + "document_scores": {"requirements.md": 0.97, "components.md": 0.98}, + }, + }, + ) + bl = load_baseline(path) + assert bl.unit_tests_passed == 192 + assert bl.qualitative_overall == 0.891 + assert bl.execution_time_seconds == 1200.0 + assert bl.document_scores["requirements.md"] == 0.97 + + def test_file_missing(self, tmp_path): + bl = load_baseline(tmp_path / "nonexistent.yaml") + assert bl.unit_tests_passed == 0 + assert bl.qualitative_overall == 0.0 + + +# --------------------------------------------------------------------------- +# collect_from_zip +# --------------------------------------------------------------------------- + + +class TestCollectFromZip: + def test_full_zip(self, tmp_path): + zip_path = _make_report_zip( + tmp_path, + { + "run-meta.yaml": { + "run_folder": "run-001", + "config": {"rules_ref": "v0.1.5"}, + }, + "run-metrics.yaml": { + "tokens": {"total": {"total_tokens": 9000000}}, + "timing": {"total_wall_clock_ms": 600000}, + }, + "test-results.yaml": { + "test": {"parsed_results": {"passed": 175, "failed": 0, "total": 175}}, + }, + "contract-test-results.yaml": {"total": 88, "passed": 88, "failed": 0}, + "quality-report.yaml": {"lint": {}, "summary": {"lint_total": 0}}, + 
"qualitative-comparison.yaml": {"overall_score": 0.898, "phases": []}, + }, + ) + run = collect_from_zip(zip_path, tmp_path / "work") + assert run.label == "v0.1.5" + assert run.run_type == RunType.RELEASE + assert run.unit_tests.passed == 175 + assert run.qualitative.overall_score == 0.898 + + def test_missing_run_meta_raises(self, tmp_path): + zip_path = _make_report_zip( + tmp_path, + { + "test-results.yaml": {"test": {"parsed_results": {}}}, + }, + ) + with pytest.raises(CollectorError, match="run-meta.yaml missing"): + collect_from_zip(zip_path, tmp_path / "work") + + def test_missing_optional_files_use_defaults(self, tmp_path): + zip_path = _make_report_zip( + tmp_path, + { + "run-meta.yaml": { + "run_folder": "run-002", + "config": {"rules_ref": "v0.1.0"}, + }, + }, + ) + run = collect_from_zip(zip_path, tmp_path / "work") + assert run.unit_tests.passed == 0 + assert run.contract_tests.total == 0 + assert run.qualitative.overall_score == 0.0 + + +# --------------------------------------------------------------------------- +# collect_from_directory +# --------------------------------------------------------------------------- + + +class TestCollectFromDirectory: + def test_full_directory(self, tmp_path): + run_dir = tmp_path / "run-001" + run_dir.mkdir() + _write_yaml(run_dir / "run-meta.yaml", { + "run_folder": "run-001", + "config": {"rules_ref": "v0.1.5"}, + }) + _write_yaml(run_dir / "run-metrics.yaml", { + "tokens": {"total": {"total_tokens": 9000000}}, + "timing": {"total_wall_clock_ms": 600000}, + }) + _write_yaml(run_dir / "test-results.yaml", { + "test": {"parsed_results": {"passed": 175, "failed": 0, "total": 175}}, + }) + _write_yaml(run_dir / "contract-test-results.yaml", { + "total": 88, "passed": 88, "failed": 0, + }) + _write_yaml(run_dir / "quality-report.yaml", { + "lint": {}, "summary": {"lint_total": 0}, + }) + _write_yaml(run_dir / "qualitative-comparison.yaml", { + "overall_score": 0.898, "phases": [], + }) + + run = 
collect_from_directory(run_dir) + assert run.label == "v0.1.5" + assert run.run_type == RunType.RELEASE + assert run.unit_tests.passed == 175 + assert run.qualitative.overall_score == 0.898 + + def test_missing_run_meta_raises(self, tmp_path): + run_dir = tmp_path / "run-bad" + run_dir.mkdir() + _write_yaml(run_dir / "test-results.yaml", {"test": {"parsed_results": {}}}) + with pytest.raises(CollectorError, match="run-meta.yaml missing"): + collect_from_directory(run_dir) + + def test_not_a_directory_raises(self, tmp_path): + file_path = tmp_path / "not-a-dir.txt" + file_path.write_text("hello") + with pytest.raises(CollectorError, match="Not a directory"): + collect_from_directory(file_path) + + def test_nonexistent_path_raises(self, tmp_path): + with pytest.raises(CollectorError, match="Not a directory"): + collect_from_directory(tmp_path / "nonexistent") + + def test_missing_optional_files_use_defaults(self, tmp_path): + run_dir = tmp_path / "run-minimal" + run_dir.mkdir() + _write_yaml(run_dir / "run-meta.yaml", { + "run_folder": "run-002", + "config": {"rules_ref": "v0.1.0"}, + }) + run = collect_from_directory(run_dir) + assert run.unit_tests.passed == 0 + assert run.contract_tests.total == 0 + assert run.qualitative.overall_score == 0.0 + + +# --------------------------------------------------------------------------- +# collect_trend_data — directory dispatch +# --------------------------------------------------------------------------- + + +class TestCollectTrendDataDirectoryDispatch: + def test_mix_of_zips_and_directories(self, tmp_path): + # Create a directory bundle + run_dir = tmp_path / "dir-bundle" + run_dir.mkdir() + _write_yaml(run_dir / "run-meta.yaml", { + "run_folder": "run-dir", "config": {"rules_ref": "pr-42"}, + }) + _write_yaml(run_dir / "run-metrics.yaml", {"tokens": {"total": {}}, "timing": {}}) + _write_yaml(run_dir / "test-results.yaml", {"test": {"parsed_results": {}}}) + _write_yaml(run_dir / "contract-test-results.yaml", { + "total": 
0, "passed": 0, "failed": 0, + }) + _write_yaml(run_dir / "quality-report.yaml", {"lint": {}, "summary": {}}) + _write_yaml(run_dir / "qualitative-comparison.yaml", { + "overall_score": 0.5, "phases": [], + }) + + # Create a zip bundle + zip_path = _make_report_zip(tmp_path, { + "run-meta.yaml": {"run_folder": "run-zip", "config": {"rules_ref": "v0.1.0"}}, + "run-metrics.yaml": {"tokens": {"total": {}}, "timing": {}}, + "test-results.yaml": {"test": {"parsed_results": {}}}, + "contract-test-results.yaml": {"total": 0, "passed": 0, "failed": 0}, + "quality-report.yaml": {"lint": {}, "summary": {}}, + "qualitative-comparison.yaml": {"overall_score": 0.6, "phases": []}, + }) + + baseline_path = tmp_path / "golden.yaml" + _write_yaml(baseline_path, {}) + + trend = collect_trend_data( + [zip_path, run_dir], baseline_path, "test/repo", tmp_path / "work", + ) + assert len(trend.runs) == 2 diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_fetcher.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_fetcher.py new file mode 100644 index 00000000..34850936 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_fetcher.py @@ -0,0 +1,292 @@ +"""Tests for data retrieval via the gh CLI. + +All tests mock subprocess.run to avoid requiring the gh CLI or network access. 
+""" + +from __future__ import annotations + +import json +from unittest.mock import MagicMock, patch + +import pytest +from trend_reports.fetcher import ( + check_gh_available, + fetch_artifact_bundle, + fetch_prerelease_bundles, + fetch_release_bundle, + fetch_release_bundles, + fetch_release_list, + fetch_workflow_runs, +) +from trend_reports.models import FetchError + + +def _mock_run(stdout="", stderr="", returncode=0): + result = MagicMock() + result.stdout = stdout + result.stderr = stderr + result.returncode = returncode + return result + + +class TestCheckGhAvailable: + def test_gh_not_installed(self): + with patch( + "trend_reports.fetcher.subprocess.run", + side_effect=FileNotFoundError, + ): + with pytest.raises(FetchError, match="gh CLI not found"): + check_gh_available() + + def test_gh_version_error(self): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="some error"), + ): + with pytest.raises(FetchError, match="gh CLI returned an error"): + check_gh_available() + + def test_gh_not_authenticated(self): + with patch( + "trend_reports.fetcher.subprocess.run", + side_effect=[ + _mock_run(returncode=0), # gh version succeeds + _mock_run(returncode=1, stderr="not logged in"), # auth fails + ], + ): + with pytest.raises(FetchError, match="not authenticated"): + check_gh_available() + + def test_success(self): + with patch( + "trend_reports.fetcher.subprocess.run", + side_effect=[ + _mock_run(returncode=0), # gh version + _mock_run(returncode=0), # gh auth status + ], + ): + check_gh_available() # Should not raise + + +class TestFetchReleaseList: + def test_success(self): + releases = [ + {"tagName": "v0.1.1", "publishedAt": "2026-02-01"}, + {"tagName": "v0.1.0", "publishedAt": "2026-01-01"}, + ] + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(stdout=json.dumps(releases)), + ): + result = fetch_release_list("owner/repo") + # Should be sorted by publishedAt ascending + 
assert result[0]["tagName"] == "v0.1.0" + assert result[1]["tagName"] == "v0.1.1" + + def test_error_raises(self): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="API error"), + ): + with pytest.raises(FetchError, match="Failed to list releases"): + fetch_release_list("owner/repo") + + def test_empty_list(self): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(stdout="[]"), + ): + result = fetch_release_list("owner/repo") + assert result == [] + + +class TestFetchReleaseBundle: + def test_success(self, tmp_path): + tag_dir = tmp_path / "v0.1.0" + tag_dir.mkdir() + (tag_dir / "report-v0.1.0.zip").write_bytes(b"fake") + + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=0), + ): + result = fetch_release_bundle("owner/repo", "v0.1.0", tmp_path) + assert result is not None + assert result.name == "report-v0.1.0.zip" + + def test_no_assets_match(self, tmp_path): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="no assets match the pattern"), + ): + result = fetch_release_bundle("owner/repo", "v0.1.0", tmp_path) + assert result is None + + def test_no_zip_on_disk(self, tmp_path): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=0), + ): + result = fetch_release_bundle("owner/repo", "v0.1.0", tmp_path) + assert result is None + + def test_other_error_raises(self, tmp_path): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="network timeout"), + ): + with pytest.raises(FetchError, match="Failed to download report"): + fetch_release_bundle("owner/repo", "v0.1.0", tmp_path) + + +class TestFetchWorkflowRuns: + def test_success_filters_non_success(self): + runs = [ + {"databaseId": 1, "conclusion": "success", "headBranch": "main"}, + {"databaseId": 2, "conclusion": "failure", "headBranch": "main"}, + 
{"databaseId": 3, "conclusion": "success", "headBranch": "main"}, + ] + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(stdout=json.dumps(runs)), + ): + result = fetch_workflow_runs("owner/repo") + assert len(result) == 2 + assert all(r["conclusion"] == "success" for r in result) + + def test_with_branch_filter(self): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(stdout="[]"), + ) as mock: + fetch_workflow_runs("owner/repo", branch="main") + cmd = mock.call_args[0][0] + assert "--branch" in cmd + assert "main" in cmd + + def test_with_event_filter(self): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(stdout="[]"), + ) as mock: + fetch_workflow_runs("owner/repo", event="pull_request") + cmd = mock.call_args[0][0] + assert "--event" in cmd + assert "pull_request" in cmd + + def test_error_raises(self): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="API error"), + ): + with pytest.raises(FetchError, match="Failed to list workflow runs"): + fetch_workflow_runs("owner/repo") + + +class TestFetchArtifactBundle: + def test_success(self, tmp_path): + artifact_dir = tmp_path / "report-main" + artifact_dir.mkdir(parents=True) + (artifact_dir / "report-main.zip").write_bytes(b"fake") + + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=0), + ): + result = fetch_artifact_bundle("owner/repo", 123, "report-main", tmp_path) + assert result is not None + assert result.name == "report-main.zip" + + def test_no_artifact(self, tmp_path): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="no artifact found"), + ): + result = fetch_artifact_bundle("owner/repo", 123, "report-main", tmp_path) + assert result is None + + def test_no_zip_in_download(self, tmp_path): + with patch( + "trend_reports.fetcher.subprocess.run", + 
return_value=_mock_run(returncode=0), + ): + result = fetch_artifact_bundle("owner/repo", 123, "report-main", tmp_path) + assert result is None + + def test_other_error_raises(self, tmp_path): + with patch( + "trend_reports.fetcher.subprocess.run", + return_value=_mock_run(returncode=1, stderr="server error"), + ): + with pytest.raises(FetchError, match="Failed to download artifact"): + fetch_artifact_bundle("owner/repo", 123, "report-main", tmp_path) + + +class TestFetchPrereleaseBundles: + def test_no_runs_returns_empty(self, tmp_path): + with patch( + "trend_reports.fetcher.fetch_workflow_runs", + return_value=[], + ): + result = fetch_prerelease_bundles("owner/repo", work_dir=tmp_path) + assert result == [] + + def test_fetch_error_returns_empty(self, tmp_path): + with patch( + "trend_reports.fetcher.fetch_workflow_runs", + side_effect=FetchError("fail"), + ): + result = fetch_prerelease_bundles("owner/repo", work_dir=tmp_path) + assert result == [] + + def test_main_artifact_found(self, tmp_path): + main_zip = tmp_path / "report-main" / "report-main.zip" + main_zip.parent.mkdir(parents=True) + main_zip.write_bytes(b"fake") + + with ( + patch( + "trend_reports.fetcher.fetch_workflow_runs", + side_effect=[ + [{"databaseId": 1, "headBranch": "main"}], # main runs + [], # PR runs + ], + ), + patch( + "trend_reports.fetcher.fetch_artifact_bundle", + return_value=main_zip, + ), + ): + result = fetch_prerelease_bundles("owner/repo", work_dir=tmp_path) + assert len(result) == 1 + assert result[0] == main_zip + + +class TestFetchReleaseBundles: + def test_no_bundles_raises(self, tmp_path): + with ( + patch( + "trend_reports.fetcher.fetch_release_list", + return_value=[{"tagName": "v0.1.0", "publishedAt": "2026-01-01"}], + ), + patch("trend_reports.fetcher.fetch_release_bundle", return_value=None), + ): + with pytest.raises(FetchError, match="No report bundles found"): + fetch_release_bundles("owner/repo", work_dir=tmp_path) + + def test_specific_tags_filter(self, 
tmp_path): + fake_zip = tmp_path / "report.zip" + fake_zip.write_bytes(b"fake") + + with ( + patch( + "trend_reports.fetcher.fetch_release_list", + return_value=[ + {"tagName": "v0.1.0", "publishedAt": "2026-01-01"}, + {"tagName": "v0.1.1", "publishedAt": "2026-02-01"}, + ], + ), + patch("trend_reports.fetcher.fetch_release_bundle", return_value=fake_zip), + ): + result = fetch_release_bundles("owner/repo", tags=["v0.1.1"], work_dir=tmp_path) + assert len(result) == 1 diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_gate.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_gate.py new file mode 100644 index 00000000..fc3ba706 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_gate.py @@ -0,0 +1,95 @@ +"""Tests for CI regression gate logic.""" + +from __future__ import annotations + +from conftest import make_run, make_trend +from trend_reports.gate import check_regressions, find_latest_and_previous +from trend_reports.models import RunType + + +class TestCheckRegressions: + def test_no_regressions_passes(self): + r1 = make_run("v0.1.0", qualitative_score=0.85) + r2 = make_run("v0.1.1", qualitative_score=0.90) + result = check_regressions(make_trend(r1, r2)) + assert result.passed is True + assert result.regressions == [] + + def test_contract_test_regression(self): + r1 = make_run("v0.1.0", contract_passed=88, contract_total=88) + r2 = make_run("v0.1.1", contract_passed=85, contract_total=88) + result = check_regressions(make_trend(r1, r2)) + assert result.passed is False + assert any("contract" in r.lower() for r in result.regressions) + + def test_unit_test_failures_regression(self): + r1 = make_run("v0.1.0", passed=100, failed=0) + r2 = make_run("v0.1.1", passed=95, failed=5) + result = check_regressions(make_trend(r1, r2)) + assert result.passed is False + assert any("unit" in r.lower() or "test" in r.lower() for r in result.regressions) + + def test_qualitative_regression(self): + r1 = 
make_run("v0.1.0", qualitative_score=0.90) + r2 = make_run("v0.1.1", qualitative_score=0.85) + result = check_regressions(make_trend(r1, r2)) + assert result.passed is False + assert any("qualitative" in r.lower() for r in result.regressions) + + def test_small_qualitative_drop_not_regression(self): + r1 = make_run("v0.1.0", qualitative_score=0.90) + r2 = make_run("v0.1.1", qualitative_score=0.885) + result = check_regressions(make_trend(r1, r2)) + assert result.passed is True + + def test_fewer_than_two_runs_passes(self): + r1 = make_run("v0.1.0") + result = check_regressions(make_trend(r1)) + assert result.passed is True + + def test_empty_runs_passes(self): + result = check_regressions(make_trend()) + assert result.passed is True + + def test_labels_set(self): + r1 = make_run("v0.1.0") + r2 = make_run("v0.1.1") + result = check_regressions(make_trend(r1, r2)) + assert result.latest_label == "v0.1.1" + assert result.comparison_label == "v0.1.0" + + +class TestFindLatestAndPrevious: + def test_empty_runs(self): + trend = make_trend() + latest, prev = find_latest_and_previous(trend) + assert latest is None + assert prev is None + + def test_single_run(self): + r1 = make_run("v0.1.0") + latest, prev = find_latest_and_previous(make_trend(r1)) + assert latest is r1 + assert prev is None + + def test_two_releases(self): + r1 = make_run("v0.1.0") + r2 = make_run("v0.1.1") + latest, prev = find_latest_and_previous(make_trend(r1, r2)) + assert latest is r2 + assert prev is r1 + + def test_latest_is_main(self): + r1 = make_run("v0.1.0") + r2 = make_run("v0.1.1") + r_main = make_run("main", run_type=RunType.MAIN, semver=None) + latest, prev = find_latest_and_previous(make_trend(r1, r2, r_main)) + assert latest is r_main + assert prev is r2 + + def test_latest_is_pr(self): + r1 = make_run("v0.1.0") + r_pr = make_run("PR #42", run_type=RunType.PR, semver=None, pr_number=42) + latest, prev = find_latest_and_previous(make_trend(r1, r_pr)) + assert latest is r_pr + assert prev 
is r1 diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_main.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_main.py new file mode 100644 index 00000000..ed4d0139 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_main.py @@ -0,0 +1,96 @@ +"""Tests for CLI entry point and format resolution.""" + +from __future__ import annotations + +from unittest.mock import patch + +import pytest +from trend_reports.__main__ import _resolve_formats +from trend_reports.models import TrendReportError + + +class TestResolveFormats: + def test_both(self): + assert _resolve_formats("both") == {"md", "html"} + + def test_all(self): + assert _resolve_formats("all") == {"md", "html", "yaml"} + + def test_md(self): + assert _resolve_formats("md") == {"md"} + + def test_html(self): + assert _resolve_formats("html") == {"html"} + + def test_yaml(self): + assert _resolve_formats("yaml") == {"yaml"} + + +class TestCmdTrendLocalBundle: + def test_missing_local_bundle_raises(self, tmp_path): + """Local bundle path that does not exist should raise TrendReportError.""" + from trend_reports.__main__ import cmd_trend + + fake_zip = tmp_path / "nonexistent.zip" + dummy_zip = tmp_path / "dummy.zip" + dummy_zip.write_bytes(b"fake") + + with ( + patch( + "trend_reports.fetcher.check_gh_available", + ), + patch( + "trend_reports.fetcher.fetch_release_list", + return_value=[{"tagName": "v0.1.0", "publishedAt": "2026-01-01"}], + ), + patch( + "trend_reports.fetcher.fetch_release_bundle", + return_value=dummy_zip, + ), + ): + with pytest.raises(TrendReportError, match="Local bundle not found"): + cmd_trend( + baseline=str(tmp_path / "golden.yaml"), + fmt="md", + output_dir=str(tmp_path / "out"), + repo="test/repo", + cache_prefix="report-", + gate=False, + tags=["v0.1.0"], + local_bundles=[str(fake_zip)], + ) + + +class TestCmdTrendLocalRunDir: + def test_missing_local_run_dir_raises(self, tmp_path): + """Local run dir path that does not exist 
should raise TrendReportError.""" + from trend_reports.__main__ import cmd_trend + + fake_dir = tmp_path / "nonexistent" + dummy_zip = tmp_path / "dummy.zip" + dummy_zip.write_bytes(b"fake") + + with ( + patch( + "trend_reports.fetcher.check_gh_available", + ), + patch( + "trend_reports.fetcher.fetch_release_list", + return_value=[{"tagName": "v0.1.0", "publishedAt": "2026-01-01"}], + ), + patch( + "trend_reports.fetcher.fetch_release_bundle", + return_value=dummy_zip, + ), + ): + with pytest.raises(TrendReportError, match="Local run directory not found"): + cmd_trend( + baseline=str(tmp_path / "golden.yaml"), + fmt="md", + output_dir=str(tmp_path / "out"), + repo="test/repo", + cache_prefix="report-", + gate=False, + tags=["v0.1.0"], + local_run_dirs=[str(fake_dir)], + ) diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_models.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_models.py new file mode 100644 index 00000000..2734ceaf --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_models.py @@ -0,0 +1,84 @@ +"""Tests for data models, enums, exceptions, and SemVer parsing.""" + +from __future__ import annotations + +import pytest +from trend_reports.models import ( + BaselineMetrics, + CollectorError, + FetchError, + GateResult, + RunType, + SemVer, + TrendReportError, +) + + +class TestSemVer: + def test_parse_with_v_prefix(self): + sv = SemVer.parse("v1.2.3") + assert sv == SemVer(1, 2, 3) + + def test_parse_without_v_prefix(self): + sv = SemVer.parse("0.1.5") + assert sv == SemVer(0, 1, 5) + + def test_parse_large_numbers(self): + sv = SemVer.parse("v999.888.777") + assert sv == SemVer(999, 888, 777) + + def test_parse_invalid_empty(self): + with pytest.raises(ValueError, match="Cannot parse semver"): + SemVer.parse("") + + def test_parse_invalid_text(self): + with pytest.raises(ValueError, match="Cannot parse semver"): + SemVer.parse("abc") + + def test_parse_invalid_two_parts(self): + with 
pytest.raises(ValueError, match="Cannot parse semver"): + SemVer.parse("1.2") + + def test_str(self): + assert str(SemVer(0, 1, 5)) == "v0.1.5" + + def test_ordering(self): + assert SemVer(0, 1, 0) < SemVer(0, 2, 0) + assert SemVer(0, 1, 5) < SemVer(0, 1, 6) + assert SemVer(0, 1, 9) < SemVer(1, 0, 0) + + def test_equality(self): + assert SemVer(1, 2, 3) == SemVer(1, 2, 3) + + def test_frozen(self): + sv = SemVer(1, 2, 3) + with pytest.raises(AttributeError): + sv.major = 5 # type: ignore[misc] + + +class TestRunType: + def test_values(self): + assert RunType.RELEASE.value == "release" + assert RunType.MAIN.value == "main" + assert RunType.PR.value == "pr" + + +class TestExceptions: + def test_fetch_error_is_trend_report_error(self): + assert issubclass(FetchError, TrendReportError) + + def test_collector_error_is_trend_report_error(self): + assert issubclass(CollectorError, TrendReportError) + + +class TestDataclassDefaults: + def test_baseline_metrics_defaults(self): + bl = BaselineMetrics() + assert bl.unit_tests_passed == 0 + assert bl.qualitative_overall == 0.0 + assert bl.document_scores == {} + + def test_gate_result_defaults(self): + gr = GateResult(passed=True) + assert gr.regressions == [] + assert gr.latest_label == "" diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_html.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_html.py new file mode 100644 index 00000000..d7655642 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_html.py @@ -0,0 +1,76 @@ +"""Tests for HTML trend report rendering. + +Smoke tests verify the output is valid HTML with expected sections. 
+""" + +from __future__ import annotations + +from conftest import make_run +from trend_reports.models import ( + BaselineMetrics, + TrendData, +) +from trend_reports.render_html import render_trend_html + + +def _make_trend(*labels: str) -> TrendData: + runs = [ + make_run(label, qualitative_score=0.85 + i * 0.02) + for i, label in enumerate(labels) + ] + return TrendData( + runs=runs, + baseline=BaselineMetrics( + unit_tests_passed=192, + qualitative_overall=0.891, + total_tokens=9840000, + execution_time_seconds=1446.0, + ), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) + + +class TestRenderTrendHtml: + def test_output_is_html(self): + trend = _make_trend("v0.1.0", "v0.1.1") + result = render_trend_html(trend) + assert "" in result + + def test_contains_section_anchors(self): + trend = _make_trend("v0.1.0", "v0.1.1", "v0.1.2") + result = render_trend_html(trend) + for section_id in [ + "a-executive-summary", + "b-functional-correctness", + "c-qualitative-evaluation", + "d-efficiency-cost-metrics", + "e-code-quality", + "f-stability-reliability", + "g-version-over-version-deltas", + "h-pre-release-data-points", + ]: + assert section_id in result, f"Missing anchor {section_id}" + + def test_contains_version_labels(self): + trend = _make_trend("v0.1.0", "v0.1.1") + result = render_trend_html(trend) + assert "v0.1.0" in result + assert "v0.1.1" in result + + def test_empty_runs_no_crash(self): + trend = TrendData( + runs=[], + baseline=BaselineMetrics(), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) + result = render_trend_html(trend) + assert "" in result diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_md.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_md.py new file mode 100644 index 00000000..05ed7436 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_md.py @@ -0,0 +1,75 @@ +"""Tests for Markdown trend report rendering. 
+ +Smoke tests verify sections are present and the renderer does not crash on +various inputs. Does not validate exact Markdown formatting. +""" + +from __future__ import annotations + +from conftest import make_run +from trend_reports.models import ( + BaselineMetrics, + TrendData, +) +from trend_reports.render_md import render_trend_markdown + + +def _make_trend(*labels: str) -> TrendData: + runs = [ + make_run(label, qualitative_score=0.85 + i * 0.02) + for i, label in enumerate(labels) + ] + return TrendData( + runs=runs, + baseline=BaselineMetrics( + unit_tests_passed=192, + qualitative_overall=0.891, + total_tokens=9840000, + execution_time_seconds=1446.0, + ), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) + + +class TestRenderTrendMarkdown: + def test_output_is_string(self): + trend = _make_trend("v0.1.0", "v0.1.1") + result = render_trend_markdown(trend) + assert isinstance(result, str) + + def test_contains_all_sections(self): + trend = _make_trend("v0.1.0", "v0.1.1", "v0.1.2") + result = render_trend_markdown(trend) + for section in [ + "## A. Executive Summary", + "## B. Functional Correctness", + "## C. Qualitative Evaluation", + "## D. Efficiency & Cost Metrics", + "## E. Code Quality", + "## F. Stability", + "## G. Version-over-Version Deltas", + "## H. 
Pre-Release", + ]: + assert section in result, f"Missing {section}" + + def test_contains_version_labels(self): + trend = _make_trend("v0.1.0", "v0.1.1") + result = render_trend_markdown(trend) + assert "v0.1.0" in result + assert "v0.1.1" in result + + def test_empty_runs_no_crash(self): + trend = TrendData( + runs=[], + baseline=BaselineMetrics(), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) + result = render_trend_markdown(trend) + assert isinstance(result, str) + + def test_single_run(self): + trend = _make_trend("v0.1.0") + result = render_trend_markdown(trend) + assert "v0.1.0" in result diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_yaml.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_yaml.py new file mode 100644 index 00000000..0cd01653 --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_render_yaml.py @@ -0,0 +1,74 @@ +"""Tests for YAML data export and serialization roundtrip.""" + +from __future__ import annotations + +import yaml +from trend_reports.models import ( + BaselineMetrics, + CodeQualityMetrics, + ContractTestResults, + QualitativeComparison, + RunConfig, + RunData, + RunMeta, + RunMetrics, + RunType, + SemVer, + TrendData, + UnitTestResults, +) +from trend_reports.render_yaml import render_trend_yaml + + +def _make_trend() -> TrendData: + run = RunData( + label="v0.1.0", + run_type=RunType.RELEASE, + semver=SemVer(0, 1, 0), + pr_number=None, + meta=RunMeta(run_id="run-001", config=RunConfig(rules_ref="v0.1.0")), + metrics=RunMetrics(total_tokens=9000000), + unit_tests=UnitTestResults(passed=175, total=175), + contract_tests=ContractTestResults(total=88, passed=88), + code_quality=CodeQualityMetrics(), + qualitative=QualitativeComparison(overall_score=0.898), + ) + return TrendData( + runs=[run], + baseline=BaselineMetrics(unit_tests_passed=192, qualitative_overall=0.891), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) + + +class 
TestRenderTrendYaml: + def test_roundtrip(self): + trend = _make_trend() + yaml_str = render_trend_yaml(trend) + parsed = yaml.safe_load(yaml_str) + assert parsed["repo"] == "test/repo" + assert len(parsed["runs"]) == 1 + assert parsed["runs"][0]["label"] == "v0.1.0" + assert parsed["runs"][0]["unit_tests"]["passed"] == 175 + + def test_run_type_serialized_as_value(self): + trend = _make_trend() + yaml_str = render_trend_yaml(trend) + parsed = yaml.safe_load(yaml_str) + assert parsed["runs"][0]["run_type"] == "release" + + def test_empty_runs(self): + trend = TrendData( + runs=[], + baseline=BaselineMetrics(), + repo="test/repo", + generated_at="2026-01-01T00:00:00Z", + ) + yaml_str = render_trend_yaml(trend) + parsed = yaml.safe_load(yaml_str) + assert parsed["runs"] == [] + + def test_output_is_string(self): + trend = _make_trend() + result = render_trend_yaml(trend) + assert isinstance(result, str) diff --git a/scripts/aidlc-evaluator/packages/trend-reports/tests/test_sparkline.py b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_sparkline.py new file mode 100644 index 00000000..901a049e --- /dev/null +++ b/scripts/aidlc-evaluator/packages/trend-reports/tests/test_sparkline.py @@ -0,0 +1,135 @@ +"""Tests for ASCII sparkline and formatting utilities.""" + +from __future__ import annotations + +from trend_reports.sparkline import ( + format_delta, + format_number, + format_pct, + format_seconds_as_minutes, + sparkline, + trend_arrow, +) + + +class TestSparkline: + def test_empty_list(self): + assert sparkline([]) == "" + + def test_single_value(self): + result = sparkline([5]) + assert len(result) == 1 + + def test_all_identical(self): + result = sparkline([3, 3, 3, 3]) + assert len(result) == 4 + assert len(set(result)) == 1 + + def test_ascending(self): + result = sparkline([1, 2, 3, 4, 5]) + assert len(result) == 5 + assert result[0] < result[-1] + + def test_two_values_min_max(self): + result = sparkline([0, 100]) + assert len(result) == 2 + + def 
test_negative_values(self): + result = sparkline([-10, 0, 10]) + assert len(result) == 3 + + +class TestTrendArrow: + def test_empty_list(self): + assert trend_arrow([]) == "→" + + def test_single_value(self): + assert trend_arrow([5]) == "→" + + def test_strong_increase(self): + assert trend_arrow([100, 110]) == "↑" + + def test_strong_decrease(self): + assert trend_arrow([100, 90]) == "↓" + + def test_flat(self): + assert trend_arrow([100, 100.5]) == "→" + + def test_zero_first_positive_last(self): + assert trend_arrow([0, 10]) == "↑" + + def test_zero_both(self): + assert trend_arrow([0, 0]) == "→" + + def test_mild_increase(self): + assert trend_arrow([100, 103]) == "↗" + + def test_mild_decrease(self): + assert trend_arrow([100, 97]) == "↘" + + +class TestFormatNumber: + def test_integer_small(self): + assert format_number(42) == "42" + + def test_integer_thousands(self): + result = format_number(1500) + assert "K" in result + + def test_integer_millions(self): + result = format_number(9260000) + assert "M" in result + + def test_float_small(self): + assert format_number(0.891) == "0.891" + + def test_float_millions(self): + result = format_number(9.26e6) + assert "M" in result + + def test_zero_int(self): + assert format_number(0) == "0" + + +class TestFormatSecondsAsMinutes: + def test_zero(self): + assert format_seconds_as_minutes(0) == "0.0m" + + def test_one_minute(self): + assert format_seconds_as_minutes(60) == "1.0m" + + def test_fractional(self): + result = format_seconds_as_minutes(90) + assert result == "1.5m" + + +class TestFormatDelta: + def test_positive_int(self): + assert format_delta(5) == "+5" + + def test_negative_int(self): + assert format_delta(-3) == "-3" + + def test_zero_int(self): + assert format_delta(0) == "+0" + + def test_positive_float(self): + assert format_delta(0.5) == "+0.5" + + def test_custom_precision(self): + assert format_delta(0.028, precision=3) == "+0.028" + + +class TestFormatPct: + def test_zero(self): + assert 
format_pct(0.0) == "0.0%" + + def test_full(self): + assert format_pct(1.0) == "100.0%" + + def test_partial(self): + assert format_pct(0.5) == "50.0%" + + def test_over_one(self): + result = format_pct(1.5) + assert "150" in result diff --git a/scripts/aidlc-evaluator/pyproject.toml b/scripts/aidlc-evaluator/pyproject.toml new file mode 100644 index 00000000..77a963c6 --- /dev/null +++ b/scripts/aidlc-evaluator/pyproject.toml @@ -0,0 +1,69 @@ +[project] +name = "aidlc-evaluation-framework" +version = "0.1.0" +description = "Evaluation and reporting framework for AI-DLC workflows" +requires-python = ">=3.13" +dependencies = [ + "pyyaml>=6.0", + "boto3>=1.42.47", + "aidlc-runner", + "aidlc-qualitative", + "aidlc-quantitative", + "aidlc-contracttest", + "aidlc-nonfunctional", + "aidlc-reporting", + "aidlc-shared", + "aidlc-trend-reports", +] + +[tool.uv.workspace] +members = [ + "packages/execution", + "packages/qualitative", + "packages/quantitative", + "packages/contracttest", + "packages/nonfunctional", + "packages/reporting", + "packages/shared", + "packages/ide-harness", + "packages/cli-harness", + "packages/trend-reports", +] + +[tool.uv.sources] +aidlc-runner = { workspace = true } +aidlc-qualitative = { workspace = true } +aidlc-quantitative = { workspace = true } +aidlc-contracttest = { workspace = true } +aidlc-nonfunctional = { workspace = true } +aidlc-reporting = { workspace = true } +aidlc-shared = { workspace = true } +aidlc-trend-reports = { workspace = true } + +[dependency-groups] +dev = [ + "pytest>=8.0.0", + "pytest-asyncio>=0.23.0", + "ruff>=0.3.0", + "bandit>=1.7.0", + "semgrep>=1.0.0", +] + +[tool.ruff] +line-length = 100 +target-version = "py313" + +[tool.ruff.lint] +select = ["E", "F", "I", "N", "W"] +ignore = [] + +[tool.ruff.lint.per-file-ignores] +"packages/trend-reports/src/trend_reports/render_html.py" = ["E501"] +"packages/trend-reports/src/trend_reports/render_md.py" = ["E501"] + +[tool.pytest.ini_options] +testpaths = ["tests"] 
+python_files = ["test_*.py"] +python_classes = ["Test*"] +python_functions = ["test_*"] +collect_ignore_glob = ["**/src/**"] diff --git a/scripts/aidlc-evaluator/run.py b/scripts/aidlc-evaluator/run.py new file mode 100644 index 00000000..3190e7ca --- /dev/null +++ b/scripts/aidlc-evaluator/run.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +"""Master run script for AIDLC evaluation framework. + +This is the main entry point for running AIDLC evaluations in various modes. +It dispatches to specialized runner scripts in the scripts/ directory. + +Available modes: + - full Full evaluation (execute workflow + score outputs) + - cli Evaluation through a CLI AI assistant (kiro-cli, claude-code, etc.) + - ide Evaluation through an IDE AI assistant (cursor, cline, kiro) + - batch Batch evaluation across multiple models + - compare Generate cross-model comparison report + - ext-test Test extension hooks with different opt-in configurations + - ext-report Regenerate extension test comparison report + - trend Generate trend report across AIDLC rules releases + - test Run unit tests for all packages + +Usage: + # Full pipeline evaluation + python run.py full --vision test_cases/sci-calc/vision.md + + # CLI evaluation + python run.py cli --cli kiro-cli --scenario sci-calc + + # IDE evaluation + python run.py ide --ide cursor --scenario sci-calc + + # Batch evaluation across models + python run.py batch --models all --scenario sci-calc + + # Generate comparison report + python run.py compare --scenario sci-calc + + # Test extension hooks (all yes vs all no) + python run.py ext-test --scenario sci-calc + + # Regenerate extension comparison report + python run.py ext-report --runs-dir runs/sci-calc/extension-test + + # Generate trend report across releases + python run.py trend --baseline test_cases/sci-calc/golden.yaml + + # Run tests + python run.py test + + # Get help for a specific mode + python run.py full --help + python run.py cli --help + python run.py ext-test --help +""" 
+ +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent +SCRIPTS_DIR = REPO_ROOT / "scripts" + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="run.py", + description="AIDLC Evaluation Framework — unified entry point", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + subparsers = parser.add_subparsers( + dest="mode", + title="evaluation modes", + description="Choose an evaluation mode to run", + help="Mode-specific help available via: python run.py --help", + ) + + # Full evaluation mode + subparsers.add_parser( + "full", + help="Full evaluation: execute AIDLC workflow + score outputs", + add_help=False, + ) + + # CLI evaluation mode + subparsers.add_parser( + "cli", + help="Evaluation through CLI AI assistants (kiro-cli, claude-code, etc.)", + add_help=False, + ) + + # IDE evaluation mode + subparsers.add_parser( + "ide", + help="Evaluation through IDE AI assistants (cursor, cline, kiro)", + add_help=False, + ) + + # Batch evaluation mode + subparsers.add_parser( + "batch", + help="Batch evaluation across multiple Bedrock models", + add_help=False, + ) + + # Comparison report mode + subparsers.add_parser( + "compare", + help="Generate cross-model comparison report from batch runs", + add_help=False, + ) + + # Extension test mode + subparsers.add_parser( + "ext-test", + help="Test extension hooks with different opt-in configurations", + add_help=False, + ) + + # Extension report regeneration mode + subparsers.add_parser( + "ext-report", + help="Regenerate extension test comparison report from completed runs", + add_help=False, + ) + + # Trend report mode + subparsers.add_parser( + "trend", + help="Generate trend report across AIDLC rules releases", + add_help=False, + ) + + # Test mode + subparsers.add_parser( + "test", + help="Run unit tests for all packages", + add_help=False, + ) + + # Parse just the mode, 
then delegate to the appropriate script + args, remaining = parser.parse_known_args() + + if not args.mode: + parser.print_help() + sys.exit(1) + + # Map modes to scripts + mode_to_script = { + "full": SCRIPTS_DIR / "run_evaluation.py", + "cli": SCRIPTS_DIR / "run_cli_evaluation.py", + "ide": SCRIPTS_DIR / "run_ide_evaluation.py", + "batch": SCRIPTS_DIR / "run_batch_evaluation.py", + "compare": SCRIPTS_DIR / "run_comparison_report.py", + "ext-test": SCRIPTS_DIR / "run_extension_test.py", + "ext-report": SCRIPTS_DIR / "regenerate_extension_report.py", + "trend": SCRIPTS_DIR / "run_trend_report.py", + "test": SCRIPTS_DIR / "run_evaluation.py", # test mode is in run_evaluation.py + } + + script = mode_to_script[args.mode] + + if not script.exists(): + print(f"Error: script not found: {script}", file=sys.stderr) + sys.exit(1) + + # Build command to delegate to the specific script + cmd = [sys.executable, str(script)] + + # For test mode, add --test flag + if args.mode == "test": + cmd.append("--test") + + # Forward all remaining arguments + cmd.extend(remaining) + + # Execute the script + try: + # nosec B603 - Executing trusted framework scripts from scripts/ directory + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run(cmd) + sys.exit(result.returncode) + except KeyboardInterrupt: + print("\n[Interrupted]", file=sys.stderr) + sys.exit(130) + except Exception as e: + print(f"Error running {script.name}: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/README.md b/scripts/aidlc-evaluator/scripts/README.md new file mode 100644 index 00000000..4b6a2086 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/README.md @@ -0,0 +1,118 @@ +# AIDLC Evaluation Scripts + +This directory contains the specialized run scripts for the AIDLC evaluation framework. + +## Overview + +All run scripts have been consolidated into this `scripts/` directory for better organization. 
The main entry point is now `run.py` in the repository root, which dispatches to these specialized scripts based on the evaluation mode. + +## Scripts + +### Core Evaluation Scripts + +- **run_evaluation.py** - Full evaluation pipeline (execute AIDLC workflow + score outputs) + - Orchestrates all 6 stages: execution, post-run tests, quantitative analysis, contract tests, qualitative evaluation, and reporting + - Can run in test mode with `--test` flag + +- **run_cli_evaluation.py** - CLI-based evaluation + - Runs evaluation through CLI AI assistants (kiro-cli, claude-code, etc.) + - Uses adapters from `packages/cli-harness` + +- **run_ide_evaluation.py** - IDE-based evaluation + - Runs evaluation through IDE AI assistants (cursor, cline, kiro) + - Uses adapters from `packages/ide-harness` + +### Batch Processing Scripts + +- **run_batch_evaluation.py** - Batch evaluation runner + - Runs AIDLC evaluation across multiple Bedrock models sequentially + - Reads model configs from `config/` directory + - Delegates to `run_evaluation.py` for each model + +- **run_comparison_report.py** - Cross-model comparison + - Aggregates results from batch runs + - Generates comparison matrices in Markdown and YAML formats + - Compares against golden baseline + +- **run_extension_test.py** - Extension hook testing + - Tests AIDLC evaluations with different extension opt-in configurations + - Runs multiple evaluations with "all yes" vs "all no" opt-in answers + - Generates comparison report showing impact of extension choices + - Uses the extension hook feature branch (feat/extension_hook_question_split) + +### Trend Reporting + +- **run_trend_report.py** - Cross-release trend report generation + - Fetches evaluation bundles from GitHub releases and Actions artifacts + - Generates HTML, Markdown, and YAML trend reports comparing metrics across releases + - Uses the `packages/trend-reports` package + - Executive summary cards show: Qualitative Score, Contract Tests, Unit Test pass rate 
(%), Lint Findings, Execution Time, and Total Tokens + - Execution Time and Total Tokens are "lower is better" metrics (shown with green indicators since lower values are desirable) + +## Usage + +### Using the Master Entry Point (Recommended) + +The recommended way to run evaluations is through the master `run.py` script in the repository root: + +```bash +# Full pipeline evaluation +python run.py full --vision test_cases/sci-calc/vision.md + +# CLI evaluation +python run.py cli --cli kiro-cli --scenario sci-calc + +# IDE evaluation +python run.py ide --ide cursor --scenario sci-calc + +# Batch evaluation across models +python run.py batch --models all --scenario sci-calc + +# Generate comparison report +python run.py compare --scenario sci-calc + +# Test extension hooks (all yes vs all no) +python run.py ext-test --scenario sci-calc + +# Generate trend report across releases +python run.py trend --baseline test_cases/sci-calc/golden.yaml + +# Run tests +python run.py test +``` + +### Direct Script Invocation + +Scripts can also be invoked directly if needed: + +```bash +# Full evaluation +python scripts/run_evaluation.py --vision test_cases/sci-calc/vision.md + +# CLI evaluation +python scripts/run_cli_evaluation.py --cli kiro-cli --scenario sci-calc + +# Batch evaluation +python scripts/run_batch_evaluation.py --models all --scenario sci-calc + +# Extension hook testing +python scripts/run_extension_test.py --scenario sci-calc + +# Trend report +python scripts/run_trend_report.py --baseline test_cases/sci-calc/golden.yaml +``` + +## Path Resolution + +All scripts properly resolve paths relative to the repository root, so they work correctly whether invoked: + +- Through the master `run.py` dispatcher +- Directly from the repository root +- Directly from the `scripts/` directory + +## Architecture Notes + +- **REPO_ROOT**: All scripts use `Path(__file__).resolve().parent.parent` to locate the repository root +- **Output**: Run outputs go to `runs//` by default +- 
**Config**: Configuration files are read from `config/` in the repository root +- **Test Cases**: Test case scenarios are located in `test_cases/` in the repository root diff --git a/scripts/aidlc-evaluator/scripts/regenerate_extension_report.py b/scripts/aidlc-evaluator/scripts/regenerate_extension_report.py new file mode 100644 index 00000000..d71197ab --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/regenerate_extension_report.py @@ -0,0 +1,219 @@ +#!/usr/bin/env python3 +"""Regenerate extension comparison report from completed extension test runs.""" + +import argparse +import sys +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent + +# Add packages to path +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) + +from reporting.baseline import BaselineMetrics, extract_baseline # noqa: E402 +from reporting.collector import collect # noqa: E402 + + +def load_config_metrics(run_folder: Path) -> BaselineMetrics | None: + """Load evaluation metrics from an extension test run folder.""" + try: + data = collect(run_folder) + return extract_baseline(data) + except Exception as e: + print(f" [WARN] Failed to collect metrics from {run_folder}: {e}", file=sys.stderr) + return None + + +def format_num(val: float | int | None, decimals: int = 1) -> str: + """Format a number for display.""" + if val is None: + return "—" + if isinstance(val, float): + return f"{val:.{decimals}f}" + return str(val) + + +def main(): + parser = argparse.ArgumentParser( + description="Regenerate extension comparison report from completed runs" + ) + parser.add_argument( + "--runs-dir", + type=Path, + required=True, + help="Extension test runs directory (e.g., runs/sci-calc/extension-test/)", + ) + args = parser.parse_args() + + comparison_dir = args.runs_dir / "extension-comparison" + summary_path = comparison_dir / "extension-test-summary.yaml" + + if not 
summary_path.exists(): + print(f"Error: {summary_path} not found", file=sys.stderr) + print("Make sure you're pointing to the extension test runs directory", file=sys.stderr) + sys.exit(1) + + # Load existing summary + with open(summary_path, encoding="utf-8") as f: + summary = yaml.safe_load(f) + + results = summary.get("runs", []) + scenario_name = summary.get("scenario", "unknown") + + print(f"Found {len(results)} extension test runs") + + # Load metrics from each run (regardless of status - we want to compare quality) + config_metrics: dict[str, BaselineMetrics] = {} + for result in results: + run_folder = Path(result["output_dir"]) + if not run_folder.is_dir(): + print(f" Skipping {result['config_name']} (folder not found)") + continue + print(f" Loading metrics: {result['config_name']} (status: {result['status']})...") + metrics = load_config_metrics(run_folder) + if metrics: + config_metrics[result["config_name"]] = metrics + else: + print(f" Failed to load metrics for {result['config_name']}") + + if not config_metrics: + print("No metrics loaded, cannot generate report", file=sys.stderr) + sys.exit(1) + + # Generate detailed metrics comparison report + report_lines = [ + "# Extension Hook Test Report", + "", + f"**Scenario:** {scenario_name}", + f"**Generated:** {summary.get('generated_at', 'unknown')}", + f"**Regenerated:** {yaml.safe_dump({'now': None}).split(':')[1].strip()}", + "", + "## Test Configurations", + "", + ] + + # Configuration summary + for result in results: + status_icon = "✅" if result["status"] == "success" else "❌" + report_lines.extend( + [ + f"### {status_icon} {result['config_display_name']}", + "", + f"- **Config ID:** `{result['config_name']}`", + f"- **Description:** {result['config_description']}", + f"- **Status:** {result['status'].upper()}", + f"- **Duration:** {result.get('elapsed_seconds', 0) / 60:.1f} minutes", + f"- **Output:** `{result['output_dir']}`", + "", + ] + ) + + # Detailed metrics comparison + if config_metrics: 
+ report_lines.extend( + [ + "", + "## Detailed Metrics Comparison", + "", + ] + ) + + # Build columns (sorted by config name) + columns = sorted(config_metrics.items()) + + # Header + header = "| Metric |" + separator = "|--------|" + for config_name, _ in columns: + header += f" {config_name} |" + separator += "---------|" + report_lines.append(header) + report_lines.append(separator) + + # Metric rows + metric_rows: list[tuple[str, str, bool]] = [ + ("**Unit Tests**", "", True), + ("Pass %", "tests_pass_pct", True), + ("Passed", "tests_passed", True), + ("Failed", "tests_failed", False), + ("Total", "tests_total", True), + ("Coverage %", "coverage_pct", True), + ("**Contract Tests**", "", True), + ("Passed", "contract_passed", True), + ("Failed", "contract_failed", False), + ("Total", "contract_total", True), + ("**Code Quality**", "", True), + ("Lint Errors", "lint_errors", False), + ("Lint Warnings", "lint_warnings", False), + ("Lint Total", "lint_total", False), + ("Security Findings", "security_total", False), + ("Security High", "security_high", False), + ("Duplication Blocks", "duplication_blocks", False), + ("**Qualitative**", "", True), + ("Overall Score", "qualitative_score", True), + ("Inception Score", "inception_score", True), + ("Construction Score", "construction_score", True), + ("**Artifacts**", "", True), + ("Source Files", "source_files", True), + ("Test Files", "test_files", True), + ("Total Files", "total_files", True), + ("Lines of Code", "lines_of_code", True), + ("Doc Files", "doc_files", True), + ("**Execution**", "", True), + ("Total Tokens", "total_tokens", False), + ("Executor Tokens", "executor_total_tokens", False), + ("Simulator Tokens", "simulator_total_tokens", False), + ("Wall Clock (min)", "wall_clock_min", False), + ("Handoffs", "handoffs", False), + ("**Context Size**", "", True), + ("Max Tokens", "context_size_max", False), + ("Avg Tokens", "context_size_avg", False), + ("Median Tokens", "context_size_median", False), + ] + + 
for display_name, attr, higher_is_better in metric_rows: + if not attr: + # Section header row + row = f"| {display_name} |" + for _ in columns: + row += " |" + report_lines.append(row) + continue + + row = f"| {display_name} |" + for config_name, metrics in columns: + if attr == "wall_clock_min": + val = ( + metrics.wall_clock_ms / 60000 if metrics.wall_clock_ms else None + ) + else: + val = getattr(metrics, attr, None) + row += f" {format_num(val)} |" + report_lines.append(row) + + report_lines.append("") + + # Next steps + report_lines.extend( + [ + "", + "## Next Steps", + "", + "1. Review the individual run reports in each output directory", + "2. Compare qualitative scores between configurations", + "3. Examine differences in generated artifacts", + "4. Analyze the impact of extension opt-ins on output quality", + "", + ] + ) + + report_path = comparison_dir / "extension-test-report.md" + report_path.write_text("\n".join(report_lines), encoding="utf-8") + print(f"\n✅ Extension test report regenerated: {report_path}") + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/regenerate_single_report.py b/scripts/aidlc-evaluator/scripts/regenerate_single_report.py new file mode 100644 index 00000000..d568f5e2 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/regenerate_single_report.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""Regenerate a single report to show new token breakdown.""" + +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) + +from reporting.collector import collect, TokenUsage +from reporting.render_md import render_markdown + +# Use the existing run +run_folder = REPO_ROOT / "runs/sci-calc/extension-test/20260309T221909-aidlc-workflows_featextension_hook_question_split-ext-all-extensions" + +print(f"Collecting data from: {run_folder}") +data = collect(run_folder) + +# The old run-metrics.yaml doesn't have 
repeated_context and api_total yet +# Let's manually populate them to demonstrate the new feature +# Based on the pattern we saw: repeated context is about 45-50% of API total +executor = data.metrics.executor_tokens.total_tokens +simulator = data.metrics.simulator_tokens.total_tokens +unique_total = executor + simulator + +# From the original analysis, API total was 8,524,073 +api_total_tokens = 8524073 +repeated = api_total_tokens - unique_total + +print(f"\n=== Current Data (Old Format) ===") +print(f"Executor: {executor:,}") +print(f"Simulator: {simulator:,}") +print(f"Sum: {unique_total:,}") +print(f"Total field (from YAML): {data.metrics.total_tokens.total_tokens:,}") +print(f"Missing: {data.metrics.total_tokens.total_tokens - unique_total:,}") + +# Simulate the new format +print(f"\n=== New Token Breakdown (Fixed) ===") +data.metrics.total_tokens = TokenUsage( + unique_total - simulator - executor, # Adjust input + data.metrics.total_tokens.output_tokens, + unique_total +) +data.metrics.repeated_context_tokens = TokenUsage( + repeated, 0, repeated +) +data.metrics.api_total_tokens = TokenUsage( + api_total_tokens - data.metrics.total_tokens.output_tokens, + data.metrics.total_tokens.output_tokens, + api_total_tokens +) + +print(f"Total Unique Tokens: {data.metrics.total_tokens.total_tokens:,}") +print(f" - Executor: {executor:,}") +print(f" - Simulator: {simulator:,}") +print(f"\nRepeated Context: {data.metrics.repeated_context_tokens.total_tokens:,}") +print(f" ({repeated / api_total_tokens * 100:.1f}% of API total)") +print(f"\nAPI Total: {data.metrics.api_total_tokens.total_tokens:,}") +print(f"\nVerification: {unique_total:,} + {repeated:,} = {api_total_tokens:,} ✓") + +# Generate markdown report +output_file = run_folder / "report-regenerated.md" +print(f"\n\nGenerating report: {output_file}") +markdown = render_markdown(data) +output_file.write_text(markdown) +print(f"✓ Report generated") +print(f"\nView the Token Usage section in: {output_file}") diff 
--git a/scripts/aidlc-evaluator/scripts/run_batch_evaluation.py b/scripts/aidlc-evaluator/scripts/run_batch_evaluation.py new file mode 100644 index 00000000..855220f1 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_batch_evaluation.py @@ -0,0 +1,432 @@ +#!/usr/bin/env python3 +"""Batch evaluation runner — run the AIDLC evaluation for multiple models sequentially. + +Reads per-model config files from config/ and invokes run_evaluation.py for +each selected model. The base config (config/default.yaml) provides AWS +credentials, swarm parameters, scorer model, and other defaults; per-model +configs override the executor model ID. + +After each run, the timestamped run folder is renamed to append the model +name as a suffix (e.g., runs/20260225T190020-aidlc-workflows_main-nova-pro/). + +Usage: + # Run all configured models + python run_batch_evaluation.py --models all + + # Run specific models (names match config file stems in config/) + python run_batch_evaluation.py --models nova-pro,sonnet-4-5 + + # List available model configs + python run_batch_evaluation.py --list + + # Override AWS profile + python run_batch_evaluation.py --models all --profile my-aws-profile +""" + +from __future__ import annotations + +import argparse +import shutil +import subprocess +import sys +import time +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +CONFIG_DIR = REPO_ROOT / "config" +DEFAULT_CONFIG = CONFIG_DIR / "default.yaml" +TEST_CASES_DIR = REPO_ROOT / "test_cases" + +# Add shared package to path +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +from shared.scenario import resolve_scenario # noqa: E402 + +# Exclude the default config — it's the baseline config, not a model-under-test +EXCLUDE_CONFIGS = {"default"} + + +def discover_models() -> dict[str, dict]: + """Find all per-model config files and extract model IDs.""" + models = {} + for config_path in 
sorted(CONFIG_DIR.glob("*.yaml")): + name = config_path.stem + if name in EXCLUDE_CONFIGS: + continue + with open(config_path, encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + model_id = data.get("models", {}).get("executor", {}).get("model_id") + if model_id: + models[name] = { + "config_path": config_path, + "model_id": model_id, + } + return models + + +def _find_latest_timestamped_run(runs_dir: Path) -> Path | None: + """Find the most recent timestamped run folder under runs/. + + Only considers directories with names starting with a digit (YYYYMMDD...) + to avoid matching model-specific directories (nova-pro, sonnet-4-5, etc.). + """ + if not runs_dir.is_dir(): + return None + folders = sorted( + (d for d in runs_dir.iterdir() + if d.is_dir() and not d.name.startswith(".") and d.name[0:1].isdigit()), + reverse=True, + ) + return folders[0] if folders else None + + +def run_model( + name: str, + model_id: str, + config_path: Path, + base_config: Path, + runs_dir: Path, + profile: str, + region: str, + vision: Path, + tech_env: Path | None, + golden: Path, + openapi: Path, + baseline: Path | None, + scorer_model: str, + use_sandbox: bool = True, +) -> dict: + """Run the full evaluation pipeline for a single model. + + Args: + name: Model config name (e.g., "nova-pro"). + model_id: Bedrock model ID extracted from the per-model config. + config_path: Path to the per-model config file. + base_config: Path to the base config (default.yaml) passed to run_evaluation.py. + runs_dir: Base directory for run outputs. + profile: AWS profile name. + region: AWS region. + vision: Path to vision markdown file. + tech_env: Optional path to tech-env markdown file. + golden: Path to golden aidlc-docs directory. + openapi: Path to OpenAPI spec. + baseline: Optional path to golden.yaml baseline. + scorer_model: Bedrock model ID for qualitative scoring. + + Runs without --output-dir so the framework creates a timestamped folder + under runs/. 
After the run, the folder is renamed to append the model + name as a suffix (e.g., runs/20260225T190020-aidlc-workflows_main-nova-pro/). + """ + # Snapshot for legacy fallback (in case runner doesn't write sentinel) + existing_runs = set() + if runs_dir.is_dir(): + existing_runs = {d.name for d in runs_dir.iterdir() if d.is_dir()} + + cmd = [ + sys.executable, str(REPO_ROOT / "scripts" / "run_evaluation.py"), + "--config", str(base_config), + "--vision", str(vision), + "--golden", str(golden), + "--openapi", str(openapi), + "--executor-model", model_id, + "--scorer-model", scorer_model, + "--report-format", "both", + ] + if profile: + cmd += ["--profile", profile] + if region: + cmd += ["--region", region] + if tech_env and tech_env.is_file(): + cmd += ["--tech-env", str(tech_env)] + if baseline and baseline.is_file(): + cmd += ["--baseline", str(baseline)] + if use_sandbox: + cmd.append("--sandbox") + else: + cmd.append("--no-sandbox") + + # Log to a temp location, move with the run folder later + log_dir = runs_dir / ".batch-logs" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / f"{name}.log" + + print(f"\n{'=' * 70}") + print(f" Model: {name}") + print(f" Bedrock ID: {model_id}") + print(f" Config: {config_path}") + print(f" Log: {log_path}") + print(f"{'=' * 70}\n") + + start = time.monotonic() + started_at = datetime.now(UTC).isoformat(timespec="seconds") + + with open(log_path, "w", encoding="utf-8") as log_file: + # nosec B603 - Executing trusted run_evaluation.py script with validated model config + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( + cmd, + stdout=log_file, + stderr=subprocess.STDOUT, + ) + + elapsed_s = time.monotonic() - start + elapsed_min = elapsed_s / 60 + + status = "success" if result.returncode == 0 else "failed" + print(f" [{status.upper()}] {name} — {elapsed_min:.1f} min (exit code {result.returncode})") + + # Find the new run folder — prefer sentinel file over directory-diff. 
def _load_base_config(config_path: Path) -> dict:
    """Read the base config YAML, returning {} when the file is absent or empty.

    A missing file and an empty/null YAML document are both normalised to an
    empty dict so callers can chain ``.get()`` lookups without None checks.
    """
    if not config_path.is_file():
        return {}
    with open(config_path, encoding="utf-8") as fh:
        loaded = yaml.safe_load(fh)
    return loaded if loaded else {}
description="Run AIDLC evaluation across multiple Bedrock models", + ) + parser.add_argument( + "--models", type=str, default=None, + help='Comma-separated model names (config file stems), or "all"', + ) + parser.add_argument( + "--list", action="store_true", + help="List available model configs and exit", + ) + parser.add_argument( + "--config", type=Path, default=DEFAULT_CONFIG, + help="Base config YAML providing AWS, swarm, and scorer defaults (default: config/default.yaml)", + ) + parser.add_argument( + "--profile", default=None, + help="AWS profile (default: from base config YAML)", + ) + parser.add_argument( + "--region", default=None, + help="AWS region (default: from base config YAML)", + ) + parser.add_argument( + "--scenario", type=str, default="sci-calc", + help="Scenario name or path to test case directory (default: sci-calc)", + ) + parser.add_argument("--vision", type=Path, default=None) + parser.add_argument("--tech-env", type=Path, default=None) + parser.add_argument("--golden", type=Path, default=None) + parser.add_argument("--openapi", type=Path, default=None) + parser.add_argument("--baseline", type=Path, default=None) + parser.add_argument( + "--scorer-model", default=None, + help="Bedrock model for qualitative scoring (default: from base config YAML)", + ) + parser.add_argument( + "--runs-dir", type=Path, default=REPO_ROOT / "runs", + help="Base directory for run outputs (default: ./runs)", + ) + + # Sandbox + sandbox_group = parser.add_mutually_exclusive_group() + sandbox_group.add_argument( + "--sandbox", action="store_true", default=True, + help="Run generated code in a Docker sandbox (default)", + ) + sandbox_group.add_argument( + "--no-sandbox", action="store_false", dest="sandbox", + help="Run generated code directly on the host (no isolation)", + ) + + args = parser.parse_args() + + # Resolve scenario and apply defaults + scenario = resolve_scenario(args.scenario, TEST_CASES_DIR) + if args.vision is None: + args.vision = 
scenario.vision_path + if args.tech_env is None: + args.tech_env = scenario.tech_env_path + if args.golden is None: + args.golden = scenario.golden_aidlc_docs_path + if args.openapi is None: + args.openapi = scenario.openapi_path + if args.baseline is None: + candidate = scenario.golden_baseline_path + if candidate.is_file(): + args.baseline = candidate + + # Route runs under runs// + if args.runs_dir == REPO_ROOT / "runs": + args.runs_dir = REPO_ROOT / "runs" / scenario.name + + available = discover_models() + + if args.list: + print("Available model configs:") + for name, info in available.items(): + print(f" {name:20s} {info['model_id']}") + sys.exit(0) + + if not args.models: + parser.error("--models is required (use --list to see available configs)") + + # ── Resolve defaults from base config YAML ──────────────────────── + base_cfg = _load_base_config(args.config) + + if args.profile is None: + args.profile = base_cfg.get("aws", {}).get("profile") + if args.region is None: + args.region = base_cfg.get("aws", {}).get("region") + if args.scorer_model is None: + args.scorer_model = ( + base_cfg.get("models", {}).get("scorer", {}).get("model_id") + ) + if args.scorer_model is None: + parser.error( + "--scorer-model is required (or set models.scorer.model_id in base config YAML)" + ) + + # ── Select models ───────────────────────────────────────────────── + if args.models == "all": + selected = list(available.keys()) + else: + selected = [m.strip() for m in args.models.split(",")] + for name in selected: + if name not in available: + parser.error( + f"Unknown model '{name}'. 
Available: {', '.join(available.keys())}" + ) + + batch_start = time.monotonic() + batch_started_at = datetime.now(UTC).isoformat(timespec="seconds") + results: list[dict] = [] + + print(f"AIDLC Batch Evaluation") + print(f" Scenario: {scenario.name}") + print(f" Models: {', '.join(selected)}") + print(f" Config: {args.config}") + print(f" Profile: {args.profile}") + print(f" Region: {args.region}") + print(f" Scorer: {args.scorer_model}") + print(f" Vision: {args.vision}") + print(f" Baseline: {args.baseline}") + + for name in selected: + model_info = available[name] + summary = run_model( + name=name, + model_id=model_info["model_id"], + config_path=model_info["config_path"], + base_config=args.config, + runs_dir=args.runs_dir, + profile=args.profile, + region=args.region, + vision=args.vision, + tech_env=args.tech_env, + golden=args.golden, + openapi=args.openapi, + baseline=args.baseline, + scorer_model=args.scorer_model, + use_sandbox=args.sandbox, + ) + results.append(summary) + + batch_elapsed = time.monotonic() - batch_start + + # Write batch-level summary + batch_summary = { + "started_at": batch_started_at, + "total_elapsed_seconds": round(batch_elapsed, 1), + "base_config": str(args.config), + "profile": args.profile, + "region": args.region, + "scorer_model": args.scorer_model, + "models_run": len(results), + "models_passed": sum(1 for r in results if r["status"] == "success"), + "models_failed": sum(1 for r in results if r["status"] == "failed"), + "results": results, + } + batch_summary_path = args.runs_dir / "batch-summary.yaml" + args.runs_dir.mkdir(parents=True, exist_ok=True) + with open(batch_summary_path, "w", encoding="utf-8") as f: + yaml.safe_dump(batch_summary, f, default_flow_style=False, sort_keys=False) + + # Clean up temp log dir + log_dir = args.runs_dir / ".batch-logs" + if log_dir.is_dir() and not any(log_dir.iterdir()): + log_dir.rmdir() + + # Print final summary + print(f"\n{'=' * 70}") + print(f" Batch Evaluation Complete") + 
print(f"{'=' * 70}") + print(f" Total time: {batch_elapsed / 60:.1f} min") + print(f" Models run: {len(results)}") + for r in results: + marker = "PASS" if r["status"] == "success" else "FAIL" + print(f" [{marker}] {r['model_name']:20s} {r['elapsed_seconds'] / 60:.1f} min") + print(f" Batch summary: {batch_summary_path}") + print(f"\n Run 'python run_comparison_report.py' to generate cross-model comparison.\n") + + failed = sum(1 for r in results if r["status"] == "failed") + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py b/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py new file mode 100644 index 00000000..c5129f51 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_cli_evaluation.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python3 +"""Run AIDLC evaluation through a CLI adapter. + +Usage: + # List available adapters + python run_cli_evaluation.py --list + + # Run evaluation through kiro-cli + python run_cli_evaluation.py --cli kiro-cli \ + --vision test_cases/sci-calc/vision.md \ + --golden test_cases/sci-calc/golden-aidlc-docs + + # Check prerequisites for a CLI tool + python run_cli_evaluation.py --cli kiro-cli --check-only + + # Override rules ref (branch/tag/commit) + python run_cli_evaluation.py --cli claude-code --rules-ref v0.2.0 + + # Use local rules directory instead of git clone + python run_cli_evaluation.py --cli claude-code --rules-path /path/to/rules +""" + +from __future__ import annotations + +import argparse +import os +import re +import shutil +import stat +import subprocess +import sys +from datetime import UTC, datetime +from pathlib import Path +from urllib.parse import urlparse + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +PACKAGES = REPO_ROOT / "packages" + +# Add cli-harness to path +sys.path.insert(0, str(PACKAGES / "cli-harness" / "src")) + +from cli_harness.registry import get_adapter, list_adapters # noqa: E402 +from 
cli_harness.orchestrator import run_cli_evaluation # noqa: E402 + +_SLUG_MAX_LEN = 80 + + +def _rules_slug( + rules_source: str, + rules_repo: str, + rules_ref: str, + rules_local_path: str | None, +) -> str: + """Derive a filesystem-safe slug from the AIDLC rules configuration. + + Mirrors packages/execution/src/aidlc_runner/runner.py:_rules_slug(). + """ + if rules_source == "local" and rules_local_path: + raw = f"local_{Path(rules_local_path).name}" + else: + path = urlparse(rules_repo).path.rstrip("/") + repo_name = Path(path).stem # strips .git suffix + raw = f"{repo_name}_{rules_ref}" + slug = raw.replace(" ", "-") + slug = re.sub(r"[^a-zA-Z0-9._-]", "", slug) + return slug[:_SLUG_MAX_LEN] + + +def _default_output_dir(cli_name: str, slug: str) -> Path: + """Generate a timestamped output directory matching the normal run pattern. + + Format: runs/{timestamp}-{rules_slug}-{cli_name} + Example: runs/20260227T160245-aidlc-workflows_main-kiro-cli + """ + ts = datetime.now(UTC).strftime("%Y%m%dT%H%M%S") + return REPO_ROOT / "runs" / f"{ts}-{slug}-{cli_name.lower()}" + + +def _setup_rules( + output_dir: Path, + *, + rules_source: str = "git", + rules_repo: str = "https://github.com/awslabs/aidlc-workflows.git", + rules_ref: str = "main", + rules_local_path: str | None = None, +) -> Path: + """Download or copy AIDLC rules into the output directory. + + Mirrors the pattern from packages/execution/src/aidlc_runner/runner.py:setup_rules(). 
+ """ + rules_dest = output_dir / "aidlc-rules" + + if rules_source == "local" and rules_local_path: + local_path = Path(rules_local_path) + if not local_path.exists(): + raise FileNotFoundError(f"Local rules path not found: {local_path}") + shutil.copytree(local_path / "aidlc-rules", rules_dest) + else: + # Git clone (shallow, single branch) + print(f" Cloning AIDLC rules from {rules_repo} (ref: {rules_ref})...") + # nosec B603, B607 - Git clone of trusted AIDLC rules repository + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( + [ + "git", "clone", + "--branch", rules_ref, + "--depth", "1", + rules_repo, + str(rules_dest / "_repo"), + ], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to clone AIDLC rules repo:\n{result.stderr}") + + # Move aidlc-rules content up from _repo/aidlc-rules/ to rules_dest/ + repo_rules = rules_dest / "_repo" / "aidlc-rules" + if repo_rules.exists(): + for item in repo_rules.iterdir(): + shutil.move(str(item), str(rules_dest / item.name)) + + # Clean up the full repo clone (force-remove read-only git pack files) + def _force_remove_readonly(func, path, _exc_info): + os.chmod(path, stat.S_IWRITE) + func(path) + + # onexc was added in Python 3.12; fall back to onerror on older versions + if sys.version_info >= (3, 12): + shutil.rmtree(rules_dest / "_repo", onexc=_force_remove_readonly) + else: + shutil.rmtree(rules_dest / "_repo", onerror=_force_remove_readonly) + + return rules_dest + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="run_cli_evaluation", + description="Run AIDLC evaluation through a CLI AI assistant", + ) + parser.add_argument( + "--cli", type=str, + help="CLI adapter name (e.g., kiro-cli)", + ) + parser.add_argument( + "--list", action="store_true", + help="List available CLI adapters and exit", + ) + parser.add_argument( + "--check-only", action="store_true", + help="Only check CLI prerequisites, don't run 
evaluation", + ) + parser.add_argument( + "--config", type=Path, + default=REPO_ROOT / "config" / "default.yaml", + help="Path to YAML config file (default: config/default.yaml)", + ) + parser.add_argument("--vision", type=Path, default=REPO_ROOT / "test_cases" / "sci-calc" / "vision.md") + parser.add_argument("--tech-env", type=Path, default=REPO_ROOT / "test_cases" / "sci-calc" / "tech-env.md") + parser.add_argument("--golden", type=Path, default=REPO_ROOT / "test_cases" / "sci-calc" / "golden-aidlc-docs") + parser.add_argument("--openapi", type=Path, default=REPO_ROOT / "test_cases" / "sci-calc" / "openapi.yaml") + parser.add_argument("--baseline", type=Path, default=REPO_ROOT / "test_cases" / "sci-calc" / "golden.yaml") + parser.add_argument( + "--rules-ref", default=None, + help="Git ref (branch/tag/commit) for AIDLC rules (overrides config value)", + ) + parser.add_argument( + "--rules-path", type=Path, default=None, + help="Path to local AIDLC rules directory (overrides git clone)", + ) + parser.add_argument("--output-dir", type=Path, default=None) + parser.add_argument("--profile", default=None, help="AWS profile (default: from config YAML)") + parser.add_argument("--region", default=None, help="AWS region (default: from config YAML)") + parser.add_argument("--scorer-model", default=None, help="Bedrock model for scoring (default: from config YAML)") + parser.add_argument("--model", default=None, help="Model to use with the CLI adapter (e.g., claude-sonnet-4)") + parser.add_argument( + "--verbose", "-v", action="store_true", + help="Enable verbose logging output", + ) + + args = parser.parse_args() + + if args.verbose: + import logging + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s %(name)s %(levelname)s: %(message)s", + ) + + if args.list: + print("Available CLI adapters:") + for name in list_adapters(): + try: + adapter = get_adapter(name) + ok, msg = adapter.check_prerequisites() + status = "ready" if ok else "not ready" + print(f" 
{name:15s} [{status}] {msg}") + except Exception as e: + print(f" {name:15s} [error] {e}") + sys.exit(0) + + if not args.cli: + parser.error("--cli is required (use --list to see available adapters)") + + adapter = get_adapter(args.cli) + adapter.verbose = args.verbose + + if args.check_only: + ok, msg = adapter.check_prerequisites() + print(f"{adapter.name}: {'OK' if ok else 'FAIL'} — {msg}") + sys.exit(0 if ok else 1) + + # ── Resolve defaults from config YAML when not provided on CLI ────── + cfg_data: dict = {} + if args.config and args.config.exists(): + with open(args.config, encoding="utf-8") as f: + cfg_data = yaml.safe_load(f) or {} + + if args.profile is None: + args.profile = cfg_data.get("aws", {}).get("profile") + if args.region is None: + args.region = cfg_data.get("aws", {}).get("region") + if args.scorer_model is None: + args.scorer_model = ( + cfg_data.get("models", {}).get("scorer", {}).get("model_id") + ) + if args.scorer_model is None: + parser.error( + "--scorer-model is required (or set models.scorer.model_id in config YAML)" + ) + + # ── Resolve AIDLC rules config ──────────────────────────────────────── + aidlc_cfg = cfg_data.get("aidlc", {}) + rules_source = aidlc_cfg.get("rules_source", "git") + rules_repo = aidlc_cfg.get("rules_repo", "https://github.com/awslabs/aidlc-workflows.git") + rules_ref = args.rules_ref or aidlc_cfg.get("rules_ref", "main") + + if args.rules_path: + rules_source = "local" + rules_local_path = str(Path(args.rules_path).resolve()) + else: + rules_local_path = aidlc_cfg.get("rules_local_path") + + # Resolve all paths relative to cwd so they work from any directory + vision_path = Path(args.vision).resolve() + tech_env_path = Path(args.tech_env).resolve() + golden_docs = Path(args.golden).resolve() + openapi_path = Path(args.openapi).resolve() + baseline_path = Path(args.baseline).resolve() + slug = _rules_slug(rules_source, rules_repo, rules_ref, rules_local_path) + output_dir = ( + Path(args.output_dir).resolve() + 
if args.output_dir + else _default_output_dir(args.cli, slug) + ) + + # ── Setup AIDLC rules (git clone or local copy) ───────────────────── + output_dir.mkdir(parents=True, exist_ok=True) + + rules_path = _setup_rules( + output_dir, + rules_source=rules_source, + rules_repo=rules_repo, + rules_ref=rules_ref, + rules_local_path=rules_local_path, + ) + + result, eval_rc = run_cli_evaluation( + adapter=adapter, + vision_path=vision_path, + output_dir=output_dir, + golden_docs=golden_docs, + rules_path=rules_path, + tech_env_path=tech_env_path, + openapi_path=openapi_path, + baseline_path=baseline_path, + profile=args.profile, + region=args.region, + scorer_model=args.scorer_model, + model=args.model, + rules_source=rules_source, + rules_ref=rules_ref, + rules_repo=rules_repo, + ) + + if not result.success: + print(f"\n[FAILED] {adapter.name}: {result.error}") + sys.exit(1) + + print(f"\n[DONE] {adapter.name} evaluation complete.") + print(f" Output: {result.output_dir}") + sys.exit(eval_rc) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_comparison_report.py b/scripts/aidlc-evaluator/scripts/run_comparison_report.py new file mode 100644 index 00000000..3d67cc78 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_comparison_report.py @@ -0,0 +1,412 @@ +#!/usr/bin/env python3 +"""Cross-model comparison report — aggregate results from batch runs and golden baseline. + +Scans model-specific run directories, loads their evaluation metrics, and produces +a comparison matrix in both Markdown and YAML formats. 
+ +Usage: + # Compare all model runs found under runs/ + python run_comparison_report.py + + # Specify runs directory and baseline + python run_comparison_report.py --runs-dir ./runs --baseline test_cases/sci-calc/golden.yaml + + # Compare specific models + python run_comparison_report.py --models nova-pro,sonnet-4-5 +""" + +from __future__ import annotations + +import argparse +import sys +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +TEST_CASES_DIR = REPO_ROOT / "test_cases" + +# Add reporting and shared packages to path +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) + +from reporting.baseline import BaselineMetrics, load_baseline # noqa: E402 +from reporting.collector import collect # noqa: E402 +from reporting.baseline import extract_baseline # noqa: E402 +from shared.scenario import resolve_scenario # noqa: E402 + + +def _extract_model_suffix(dirname: str) -> str | None: + """Extract the model name suffix from a batch run folder name. + + Batch folders are named --, e.g. + '20260225T190020-aidlc-workflows_main-nova-pro'. The model suffix + is everything after the rules slug, which itself follows the first + timestamp segment. We match known config stems against the end of + the directory name. + """ + config_dir = Path(__file__).resolve().parent.parent / "config" + if config_dir.is_dir(): + # Try longest names first so "mistral-large-3" matches before "large-3" + stems = sorted( + (p.stem for p in config_dir.glob("*.yaml") if p.stem != "default"), + key=len, reverse=True, + ) + for stem in stems: + if dirname.endswith(f"-{stem}"): + return stem + return None + + +def find_model_runs(runs_dir: Path) -> dict[str, Path]: + """Find model-specific run directories. 
+ + Supports three layouts: + - runs/--/ (batch runner output, current) + - runs// (legacy batch runner or direct --output-dir) + - runs/// (legacy nested batch output) + """ + models = {} + if not runs_dir.is_dir(): + return models + + for entry in sorted(runs_dir.iterdir()): + if not entry.is_dir() or entry.name.startswith("."): + continue + + # Current format: timestamped dir with model suffix + if entry.name[0:1].isdigit() and (entry / "run-meta.yaml").is_file(): + model = _extract_model_suffix(entry.name) + if model: + # Keep the most recent run per model (sorted order = latest last) + models[model] = entry + continue + + # Legacy: direct output with run-meta.yaml + if (entry / "run-meta.yaml").is_file(): + models[entry.name] = entry + continue + + # Legacy: nested timestamped subdirectories + sub_runs = sorted( + (d for d in entry.iterdir() if d.is_dir() and (d / "run-meta.yaml").is_file()), + reverse=True, + ) + if sub_runs: + models[entry.name] = sub_runs[0] + + return models + + +def load_model_metrics(run_folder: Path) -> BaselineMetrics | None: + """Load evaluation metrics from a run folder.""" + try: + data = collect(run_folder) + return extract_baseline(data) + except Exception as e: + print(f" [WARN] Failed to collect metrics from {run_folder}: {e}", file=sys.stderr) + return None + + +def format_num(val: float | int | None, decimals: int = 1) -> str: + """Format a number for display.""" + if val is None: + return "—" + if isinstance(val, float): + return f"{val:.{decimals}f}" + return str(val) + + +def generate_comparison_markdown( + model_metrics: dict[str, BaselineMetrics], + golden: BaselineMetrics | None, +) -> str: + """Generate a Markdown comparison matrix.""" + lines: list[str] = [] + lines.append("# Cross-Model Comparison Report") + lines.append("") + lines.append(f"Generated: {datetime.now(UTC).isoformat(timespec='seconds')}") + lines.append("") + + # Build column list: golden first (if present), then alphabetical models + columns: 
list[tuple[str, BaselineMetrics]] = [] + if golden: + golden_label = f"Golden ({golden.executor_model.split('.')[-1]})" if golden.executor_model else "Golden" + columns.append((golden_label, golden)) + for name in sorted(model_metrics.keys()): + columns.append((name, model_metrics[name])) + + if not columns: + lines.append("No model runs found.") + return "\n".join(lines) + + # Header + header = "| Metric |" + separator = "|--------|" + for col_name, _ in columns: + header += f" {col_name} |" + separator += "--------|" + lines.append(header) + lines.append(separator) + + # Metric rows + metric_rows: list[tuple[str, str, str, bool]] = [ + # (display_name, category, attr_name, higher_is_better) + ("**Unit Tests**", "", "", True), + ("Pass %", "unit", "tests_pass_pct", True), + ("Passed", "unit", "tests_passed", True), + ("Failed", "unit", "tests_failed", False), + ("Total", "unit", "tests_total", True), + ("Coverage %", "unit", "coverage_pct", True), + ("**Contract Tests**", "", "", True), + ("Passed", "contract", "contract_passed", True), + ("Failed", "contract", "contract_failed", False), + ("Total", "contract", "contract_total", True), + ("**Code Quality**", "", "", True), + ("Lint Errors", "quality", "lint_errors", False), + ("Lint Warnings", "quality", "lint_warnings", False), + ("Lint Total", "quality", "lint_total", False), + ("Security Findings", "quality", "security_total", False), + ("Security High", "quality", "security_high", False), + ("Duplication Blocks", "quality", "duplication_blocks", False), + ("**Qualitative**", "", "", True), + ("Overall Score", "qual", "qualitative_score", True), + ("Inception Score", "qual", "inception_score", True), + ("Construction Score", "qual", "construction_score", True), + ("**Artifacts**", "", "", True), + ("Source Files", "art", "source_files", True), + ("Test Files", "art", "test_files", True), + ("Total Files", "art", "total_files", True), + ("Lines of Code", "art", "lines_of_code", True), + ("Doc Files", "art", 
"doc_files", True), + ("**Execution**", "", "", True), + ("Total Tokens", "exec", "total_tokens", False), + ("Executor Tokens", "exec", "executor_total_tokens", False), + ("Simulator Tokens", "exec", "simulator_total_tokens", False), + ("Wall Clock (min)", "exec", "wall_clock_min", False), + ("Handoffs", "exec", "handoffs", False), + ("**Context Size**", "", "", True), + ("Max Tokens", "ctx", "context_size_max", False), + ("Avg Tokens", "ctx", "context_size_avg", False), + ("Median Tokens", "ctx", "context_size_median", False), + ] + + for display_name, category, attr, higher_is_better in metric_rows: + if not attr: + # Section header row + row = f"| {display_name} |" + for _ in columns: + row += " |" + lines.append(row) + continue + + row = f"| {display_name} |" + golden_val = None + if golden and attr != "wall_clock_min": + golden_val = getattr(golden, attr, None) + elif golden and attr == "wall_clock_min": + golden_val = golden.wall_clock_ms / 60000 if golden.wall_clock_ms else None + + for col_name, metrics in columns: + if attr == "wall_clock_min": + val = metrics.wall_clock_ms / 60000 if metrics.wall_clock_ms else None + else: + val = getattr(metrics, attr, None) + + formatted = format_num(val) + + # Add delta indicator vs golden (skip for golden column itself) + if golden and col_name != "Golden (Opus 4.6)" and val is not None and golden_val is not None: + delta = float(val) - float(golden_val) + if abs(delta) > 0.001: + if higher_is_better: + indicator = " ^" if delta > 0 else " v" + else: + indicator = " v" if delta > 0 else " ^" + formatted += indicator + + row += f" {formatted} |" + lines.append(row) + + # Legend + lines.append("") + lines.append("**Legend:** ^ = better than golden, v = worse than golden") + lines.append("") + + return "\n".join(lines) + + +def generate_comparison_yaml( + model_metrics: dict[str, BaselineMetrics], + golden: BaselineMetrics | None, +) -> dict: + """Generate structured YAML comparison data.""" + result: dict = { + 
"generated_at": datetime.now(UTC).isoformat(timespec="seconds"), + "models": {}, + } + + if golden: + result["golden"] = { + "executor_model": golden.executor_model, + "run_folder": golden.run_folder, + } + + for name, metrics in sorted(model_metrics.items()): + result["models"][name] = { + "executor_model": metrics.executor_model, + "simulator_model": metrics.simulator_model, + "run_folder": metrics.run_folder, + "unit_tests": { + "passed": metrics.tests_passed, + "failed": metrics.tests_failed, + "total": metrics.tests_total, + "pass_pct": metrics.tests_pass_pct, + "coverage_pct": metrics.coverage_pct, + }, + "contract_tests": { + "passed": metrics.contract_passed, + "failed": metrics.contract_failed, + "total": metrics.contract_total, + }, + "code_quality": { + "lint_errors": metrics.lint_errors, + "lint_warnings": metrics.lint_warnings, + "lint_total": metrics.lint_total, + "security_total": metrics.security_total, + "security_high": metrics.security_high, + "duplication_blocks": metrics.duplication_blocks, + }, + "qualitative": { + "overall_score": metrics.qualitative_score, + "inception_score": metrics.inception_score, + "construction_score": metrics.construction_score, + }, + "artifacts": { + "source_files": metrics.source_files, + "test_files": metrics.test_files, + "total_files": metrics.total_files, + "lines_of_code": metrics.lines_of_code, + "doc_files": metrics.doc_files, + }, + "execution": { + "total_tokens": metrics.total_tokens, + "input_tokens": metrics.input_tokens, + "output_tokens": metrics.output_tokens, + "executor_total_tokens": metrics.executor_total_tokens, + "simulator_total_tokens": metrics.simulator_total_tokens, + "wall_clock_ms": metrics.wall_clock_ms, + "handoffs": metrics.handoffs, + }, + "context_size": { + "max_tokens": metrics.context_size_max, + "avg_tokens": metrics.context_size_avg, + "median_tokens": metrics.context_size_median, + }, + } + + return result + + +def main() -> None: + parser = argparse.ArgumentParser( + 
prog="run_comparison_report", + description="Generate cross-model comparison report from batch evaluation runs", + ) + parser.add_argument( + "--scenario", type=str, default="sci-calc", + help="Scenario name or path to test case directory (default: sci-calc)", + ) + parser.add_argument( + "--runs-dir", type=Path, default=None, + help="Directory containing model run folders (default: runs/<scenario>/)", + ) + parser.add_argument( + "--baseline", type=Path, default=None, + help="Path to golden.yaml baseline (default: from scenario)", + ) + parser.add_argument( + "--models", type=str, default=None, + help="Comma-separated model names to include (default: all found)", + ) + parser.add_argument( + "--output", type=Path, default=None, + help="Output directory for reports (default: <runs-dir>/comparison/)", + ) + + args = parser.parse_args() + + # Resolve scenario and apply defaults + scenario = resolve_scenario(args.scenario, TEST_CASES_DIR) + if args.runs_dir is None: + args.runs_dir = REPO_ROOT / "runs" / scenario.name + if args.baseline is None: + args.baseline = scenario.golden_baseline_path + + # Load golden baseline + golden: BaselineMetrics | None = None + if args.baseline and args.baseline.is_file(): + golden = load_baseline(args.baseline) + print(f"Loaded golden baseline: {args.baseline}") + print(f" Model: {golden.executor_model}") + else: + print("No golden baseline found — comparison will be model-to-model only.") + + # Discover model runs + all_runs = find_model_runs(args.runs_dir) + if not all_runs: + print(f"No model runs found in {args.runs_dir}") + sys.exit(1) + + # Filter to selected models + if args.models: + selected = {m.strip() for m in args.models.split(",")} + runs = {k: v for k, v in all_runs.items() if k in selected} + missing = selected - set(runs.keys()) + if missing: + print(f"Warning: runs not found for: {', '.join(missing)}", file=sys.stderr) + else: + runs = all_runs + + print(f"\nFound {len(runs)} model run(s):") + for name, path in runs.items(): + print(f" 
{name:20s} {path}") + + # Collect metrics from each run + model_metrics: dict[str, BaselineMetrics] = {} + for name, run_folder in runs.items(): + print(f"\nCollecting metrics: {name}...") + metrics = load_model_metrics(run_folder) + if metrics: + model_metrics[name] = metrics + else: + print(f" [SKIP] Could not load metrics for {name}") + + if not model_metrics: + print("No valid model metrics collected.") + sys.exit(1) + + # Generate reports + output_dir = args.output or args.runs_dir / "comparison" + output_dir.mkdir(parents=True, exist_ok=True) + + # Markdown report + md_content = generate_comparison_markdown(model_metrics, golden) + md_path = output_dir / "comparison-report.md" + md_path.write_text(md_content, encoding="utf-8") + print(f"\nMarkdown report: {md_path}") + + # YAML data + yaml_data = generate_comparison_yaml(model_metrics, golden) + yaml_path = output_dir / "comparison-data.yaml" + with open(yaml_path, "w", encoding="utf-8") as f: + yaml.safe_dump(yaml_data, f, default_flow_style=False, sort_keys=False) + print(f"YAML data: {yaml_path}") + + # Print summary to stdout + print(f"\n{md_content}") + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_evaluation.py b/scripts/aidlc-evaluator/scripts/run_evaluation.py new file mode 100644 index 00000000..d57a08ae --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_evaluation.py @@ -0,0 +1,881 @@ +#!/usr/bin/env python3 +"""Full evaluation suite — run AIDLC workflow, then score the outputs. + +This is the controlling script for the AIDLC evaluation framework. It +orchestrates six stages: + + 1. EXECUTION — Run the AIDLC two-agent workflow (packages/execution) + to produce aidlc-docs + workspace code from a vision doc. + 2. POST-RUN — Automatically install deps and run generated tests + (handled inside the execution stage). + 3. QUANTITATIVE — Lint and security scan the generated code + (packages/quantitative). + 4. 
CONTRACT — Spin up the generated app and validate endpoints + against an API spec (packages/contracttest). + 5. QUALITATIVE — Compare the produced aidlc-docs against a golden + reference using Bedrock for semantic similarity scoring. + 6. REPORT — Generate a consolidated Markdown + HTML report + aggregating all evaluation data (packages/reporting). + +Usage: + + # Full pipeline: execute AIDLC workflow + semantic evaluation + python run_evaluation.py \ + --vision test_cases/sci-calc/vision.md \ + --tech-env test_cases/sci-calc/tech-env.md \ + --golden test_cases/sci-calc/golden-aidlc-docs + + # Evaluate an existing run (skip execution, just score) + python run_evaluation.py \ + --evaluate-only runs/20260213T202730-.../aidlc-docs \ + --golden test_cases/sci-calc/golden-aidlc-docs + + # Run unit tests for all packages + python run_evaluation.py --test +""" + +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +PACKAGES = REPO_ROOT / "packages" +TEST_CASES_DIR = REPO_ROOT / "test_cases" + +# Add shared package to path +sys.path.insert(0, str(PACKAGES / "shared" / "src")) +from shared.io import atomic_yaml_dump # noqa: E402 +from shared.scenario import resolve_scenario # noqa: E402 + + +# ── helpers ────────────────────────────────────────────────────────────────── + +def _read_test_status(test_results_path: Path) -> dict | None: + """Read test-results.yaml and return a normalized status dict, or None.""" + if not test_results_path.exists(): + return None + + with open(test_results_path, encoding="utf-8") as f: + data = yaml.safe_load(f) or {} + + status = data.get("status", "unknown") + install_ok = data.get("install", {}).get("success", False) + test_ok = data.get("test", {}).get("success", False) + parsed = data.get("test", {}).get("parsed_results", {}) + passed = parsed.get("passed") + 
failed = parsed.get("failed") + errors = parsed.get("errors") + total = parsed.get("total") + + if status == "install_failed": + summary = f"install failed — {data.get('install', {}).get('output', '')[:120]}" + tests_passed = False + elif not test_ok: + if total is not None: + summary = f"{passed or 0} passed, {failed or 0} failed, {errors or 0} errors (of {total})" + else: + summary = f"tests failed — {data.get('test', {}).get('output', '')[:120]}" + tests_passed = False + else: + summary = f"{passed or 0} passed, {failed or 0} failed (of {total})" + tests_passed = (failed or 0) == 0 and (errors or 0) == 0 + + return { + "tests_passed": tests_passed, + "install_ok": install_ok, + "test_ok": test_ok, + "summary": summary, + "passed": passed, + "failed": failed, + "errors": errors, + "total": total, + "raw_status": status, + } + + +def _print_test_summary(test_status: dict | None) -> None: + """Print a clear post-run test summary.""" + print(f"\n{'=' * 60}") + print(" Stage 2: Post-Run Test Results") + print(f"{'=' * 60}\n") + if test_status is None: + print(" No testable project detected — skipped.") + return + if test_status["tests_passed"]: + print(f" PASS {test_status['summary']}") + else: + print(f" FAIL {test_status['summary']}") + if not test_status["install_ok"]: + print(" (dependency install failed — tests could not run)") + + +def _run_cmd(cmd: list[str], label: str, **kwargs) -> subprocess.CompletedProcess: + print(f"\n{'=' * 60}") + print(f" {label}") + print(f"{'=' * 60}\n") + # nosec B603 - Executing trusted framework evaluation packages with validated args + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run(cmd, **kwargs) + if result.returncode != 0: + print(f"\n[FAILED] {label} (exit code {result.returncode})", file=sys.stderr) + return result + + +def _get_repo_info() -> dict: + """Get git info for the aidlc-regression repository.""" + # nosec B603, B607 - Static git commands for repo metadata collection + # nosemgrep: 
dangerous-subprocess-use-audit + info: dict = {} + try: + info["commit"] = subprocess.run( + ["git", "rev-parse", "HEAD"], + capture_output=True, text=True, check=True, cwd=REPO_ROOT, + ).stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + info["commit"] = None + + try: + info["branch"] = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + capture_output=True, text=True, check=True, cwd=REPO_ROOT, + ).stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError): + info["branch"] = None + + try: + porcelain = subprocess.run( + ["git", "status", "--porcelain"], + capture_output=True, text=True, check=True, cwd=REPO_ROOT, + ).stdout.strip() + info["dirty"] = bool(porcelain) + except (subprocess.CalledProcessError, FileNotFoundError): + info["dirty"] = None + + return info + + +def _save_evaluation_config( + run_folder: Path, + args: argparse.Namespace, + cfg_data: dict, +) -> None: + """Write aidlc-regression repo info and full resolved configuration to the run folder. + + Creates evaluation-config.yaml with the full config dump, and also + updates run-meta.yaml with evaluation-level fields (scorer model, + golden/openapi paths, git commit, etc.). 
+ """ + repo_info = _get_repo_info() + + def _rel(p: Path | None) -> str | None: + if p is None: + return None + try: + return str(p.resolve().relative_to(Path.cwd().resolve())) + except ValueError: + return str(p) + + eval_config: dict = { + "saved_at": datetime.now(timezone.utc).isoformat(), + "aidlc_regression_repo": repo_info, + "config_file": _rel(args.config), + "resolved_config": cfg_data, + "cli_args": { + "vision": _rel(args.vision), + "tech_env": _rel(args.tech_env), + "golden": _rel(args.golden), + "openapi": _rel(args.openapi), + "evaluate_only": _rel(args.evaluate_only), + "profile": args.profile, + "region": args.region, + "scorer_model": args.scorer_model, + "executor_model": args.executor_model, + "rules_ref": args.rules_ref, + "output_dir": _rel(args.output_dir), + "sandbox": args.sandbox, + "report_format": args.report_format, + }, + } + + atomic_yaml_dump(eval_config, run_folder / "evaluation-config.yaml") + + # Also update run-meta.yaml with evaluation-level fields + meta_path = run_folder / "run-meta.yaml" + if meta_path.exists(): + with open(meta_path, encoding="utf-8") as f: + meta = yaml.safe_load(f) or {} + else: + meta = {} + + meta["aidlc_regression_commit"] = repo_info.get("commit") + meta["aidlc_regression_branch"] = repo_info.get("branch") + meta["aidlc_regression_dirty"] = repo_info.get("dirty") + meta.setdefault("config", {})["scorer_model"] = args.scorer_model + meta["golden_dir"] = _rel(args.golden) + meta["openapi_spec"] = _rel(args.openapi) + meta["baseline"] = _rel(args.baseline) + + atomic_yaml_dump(meta, meta_path) + + +_SENTINEL_NAME = ".last_run_folder" + + +def _read_run_sentinel(output_dir: Path) -> Path | None: + """Read the sentinel file written by create_run_folder(). + + Returns the run folder path if the sentinel exists and the directory + is valid, otherwise None. The sentinel is removed after reading so + it does not confuse subsequent runs. 
+ """ + sentinel = output_dir / _SENTINEL_NAME + if not sentinel.is_file(): + return None + try: + run_folder = Path(sentinel.read_text(encoding="utf-8").strip()) + sentinel.unlink(missing_ok=True) + if run_folder.is_dir(): + return run_folder + except OSError: + pass + return None + + +def _list_run_folders(output_dir: Path | None = None) -> set[Path]: + """Return the current set of run folders under runs/. + + Args: + output_dir: Directory to search for run folders. Defaults to REPO_ROOT / "runs". + """ + runs_dir = output_dir if output_dir else REPO_ROOT / "runs" + if not runs_dir.is_dir(): + return set() + return {d for d in runs_dir.iterdir() if d.is_dir() and not d.name.startswith(".")} + + +def _find_new_run(before: set[Path], output_dir: Path | None = None) -> Path | None: + """Find the single new run folder created since *before* was captured. + + Falls back to the newest folder if multiple appeared (shouldn't happen + in normal single-run usage). + + Args: + before: Set of run folders that existed before execution. + output_dir: Directory to search for new run folders. Defaults to REPO_ROOT / "runs". + + .. deprecated:: + Prefer :func:`_read_run_sentinel` which avoids the TOCTOU race + condition inherent in before/after directory listing. + """ + after = _list_run_folders(output_dir) + new = sorted(after - before, reverse=True) + return new[0] if new else None + + +def _find_latest_run(scenario_name: str | None = None) -> Path | None: + """Find the most recent timestamped run folder under runs/. + + Only matches directories whose names start with a digit (YYYYMMDD...), + skipping model-named directories (e.g., nova-pro/, sonnet-4-5/) that + the batch runner creates. + + If *scenario_name* is given, searches under ``runs//`` + instead of ``runs/``. 
+ """ + runs_dir = REPO_ROOT / "runs" + if scenario_name: + runs_dir = runs_dir / scenario_name + if not runs_dir.is_dir(): + return None + folders = sorted( + (d for d in runs_dir.iterdir() + if d.is_dir() and not d.name.startswith(".") and d.name[0:1].isdigit()), + reverse=True, + ) + return folders[0] if folders else None + + +# ── stages ─────────────────────────────────────────────────────────────────── + +def stage_execute(args: argparse.Namespace) -> Path | None: + """Stage 1: Run the AIDLC workflow via packages/execution. + + Returns the run folder even if the runner exits non-zero, as long as + aidlc-docs were produced (the swarm may fail on a late handoff after + all documents are already written). + """ + cmd = [ + sys.executable, "-m", "aidlc_runner", + "--vision", str(args.vision), + "--config", str(args.config), + ] + if args.tech_env: + cmd += ["--tech-env", str(args.tech_env)] + if args.profile: + cmd += ["--aws-profile", args.profile] + if args.region: + cmd += ["--aws-region", args.region] + if args.executor_model: + cmd += ["--executor-model", args.executor_model] + if args.rules_ref: + cmd += ["--rules-ref", args.rules_ref] + # Route output under runs// by default + output_dir = args.output_dir + if not output_dir and hasattr(args, "_scenario_name"): + output_dir = REPO_ROOT / "runs" / args._scenario_name + if output_dir: + cmd += ["--output-dir", str(output_dir)] + + env_pythonpath = os.pathsep.join([ + str(PACKAGES / "execution" / "src"), + str(PACKAGES / "shared" / "src"), + ]) + env = {**os.environ, "PYTHONPATH": env_pythonpath} + + # Determine the output directory so we can read the sentinel file after. + effective_output_dir = output_dir or (REPO_ROOT / "runs") + + # Snapshot for the legacy fallback (in case the runner doesn't write + # the sentinel, e.g. older runner versions). 
+ existing_runs = _list_run_folders(output_dir) + + result = _run_cmd(cmd, "Stage 1: AIDLC Workflow Execution", env=env) + + # Prefer the sentinel file written by create_run_folder() — it avoids + # the TOCTOU race inherent in before/after directory listing. + run_folder = _read_run_sentinel(effective_output_dir) + if run_folder is None: + # Fall back to directory-diff for backwards compatibility. + run_folder = _find_new_run(existing_runs, output_dir) + if run_folder is None: + return None + + docs_dir = run_folder / "aidlc-docs" + doc_files = list(docs_dir.rglob("*.md")) if docs_dir.is_dir() else [] + real_docs = [f for f in doc_files if f.name not in ("aidlc-state.md", "audit.md")] + + if result.returncode != 0: + if real_docs: + print( + f"\n[WARN] Execution exited non-zero but {len(real_docs)} " + f"aidlc-doc(s) were produced — continuing evaluation.", + file=sys.stderr, + ) + else: + print( + "\n[ABORT] Execution failed and no aidlc-docs were produced.", + file=sys.stderr, + ) + return None + + return run_folder + + +def stage_quantitative(workspace: Path, output_path: Path, pmd_path: str | None = None) -> dict | None: + """Stage 3: Lint and security analysis on generated code.""" + import os + + env_pythonpath = str(PACKAGES / "quantitative" / "src") + env = {**os.environ, "PYTHONPATH": env_pythonpath} + + cmd = [ + sys.executable, "-m", "quantitative", "analyze", + str(workspace), "--output", str(output_path), + ] + if pmd_path: + cmd += ["--pmd-path", pmd_path] + result = _run_cmd(cmd, "Stage 3: Code Quality (Lint & Security)", env=env) + + if result.returncode != 0 or not output_path.exists(): + return None + + with open(output_path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def _read_quality_status(quality_data: dict | None) -> dict | None: + """Normalize quality-report.yaml into a summary dict.""" + if quality_data is None: + return None + summary = quality_data.get("summary", {}) + lint_total = summary.get("lint_total", 0) + lint_errors = 
summary.get("lint_errors", 0) + sec_total = summary.get("security_total", 0) + sec_high = summary.get("security_high", 0) + + passed = lint_errors == 0 and sec_high == 0 + parts = [] + if "lint_total" in summary: + parts.append(f"lint: {lint_total} finding(s), {lint_errors} error(s)") + if "security_total" in summary: + parts.append(f"security: {sec_total} finding(s), {sec_high} high") + text = "; ".join(parts) if parts else "no tools ran" + + return { + "passed": passed, + "summary": text, + "lint_total": lint_total, + "lint_errors": lint_errors, + "security_total": sec_total, + "security_high": sec_high, + } + + +def stage_contract( + workspace: Path, + spec_path: Path, + output_path: Path, + use_sandbox: bool = True, +) -> dict | None: + """Stage 4: API contract tests — spin up app and validate endpoints.""" + import os + + env_pythonpath = str(PACKAGES / "contracttest" / "src") + shared_pythonpath = str(PACKAGES / "shared" / "src") + env = {**os.environ, "PYTHONPATH": f"{env_pythonpath}{os.pathsep}{shared_pythonpath}"} + + cmd = [ + sys.executable, "-m", "contracttest", "run", + str(workspace), "--openapi", str(spec_path), + "--output", str(output_path), + ] + if use_sandbox: + cmd.append("--sandbox") + else: + cmd.append("--no-sandbox") + result = _run_cmd(cmd, "Stage 4: API Contract Tests", env=env) + + if not output_path.exists(): + return None + + with open(output_path, encoding="utf-8") as f: + return yaml.safe_load(f) + + +def _read_contract_status(contract_data: dict | None) -> dict | None: + """Normalize contract-test-results.yaml into a summary dict.""" + if contract_data is None: + return None + total = contract_data.get("total", 0) + passed = contract_data.get("passed", 0) + failed = contract_data.get("failed", 0) + errors = contract_data.get("errors", 0) + server_ok = contract_data.get("server_started", False) + + if not server_ok: + summary = f"server failed to start — {contract_data.get('server_error', '?')[:120]}" + ok = False + else: + summary = 
f"{passed}/{total} passed, {failed} failed, {errors} errors" + ok = failed == 0 and errors == 0 + + return {"passed": ok, "summary": summary, "total": total, + "test_passed": passed, "test_failed": failed, "test_errors": errors} + + +def stage_qualitative( + candidate_docs: Path, + golden_docs: Path, + output_path: Path | None, + profile: str, + region: str, + model_id: str, +) -> int: + """Stage 5: Semantic comparison of aidlc-docs via Bedrock.""" + cmd = [ + sys.executable, "-m", "qualitative", "compare", + "--reference", str(golden_docs), + "--candidate", str(candidate_docs), + "--model-id", model_id, + ] + if profile: + cmd += ["--profile", profile] + if region: + cmd += ["--region", region] + if output_path: + cmd += ["--output", str(output_path)] + + env_pythonpath = str(PACKAGES / "qualitative" / "src") + import os + env = {**os.environ, "PYTHONPATH": env_pythonpath} + + result = _run_cmd(cmd, "Stage 5: Qualitative (Semantic) Evaluation via Bedrock", env=env) + return result.returncode + + +def stage_report(run_folder: Path, report_format: str, baseline: Path | None = None) -> None: + """Stage 6: Generate consolidated Markdown + HTML report.""" + import os + + env_pythonpath = str(PACKAGES / "reporting" / "src") + env = {**os.environ, "PYTHONPATH": env_pythonpath} + + cmd = [ + sys.executable, "-m", "reporting", "generate", + str(run_folder), "--format", report_format, + ] + if baseline and baseline.is_file(): + cmd += ["--baseline", str(baseline)] + _run_cmd(cmd, "Stage 6: Consolidated Report", env=env) + + +def stage_test() -> int: + """Run unit tests for all packages.""" + import os + + pythonpath_parts = [ + str(PACKAGES / "execution" / "src"), + str(PACKAGES / "qualitative" / "src"), + str(PACKAGES / "quantitative" / "src"), + str(PACKAGES / "contracttest" / "src"), + str(PACKAGES / "reporting" / "src"), + str(PACKAGES / "shared" / "src"), + str(PACKAGES / "cli-harness" / "src"), + str(PACKAGES / "ide-harness" / "src"), + ] + pythonpath = 
os.pathsep.join(pythonpath_parts) + env = {**os.environ, "PYTHONPATH": pythonpath} + + test_dirs = [ + str(PACKAGES / "execution" / "tests"), + str(PACKAGES / "qualitative" / "tests"), + str(PACKAGES / "quantitative" / "tests"), + str(PACKAGES / "contracttest" / "tests"), + str(PACKAGES / "reporting" / "tests"), + str(PACKAGES / "cli-harness" / "tests"), + str(PACKAGES / "ide-harness" / "tests"), + ] + cmd = [sys.executable, "-m", "pytest"] + test_dirs + ["-v", "--import-mode=importlib"] + + result = _run_cmd(cmd, "Unit Tests: All Packages", env=env) + return result.returncode + + +# ── CLI ────────────────────────────────────────────────────────────────────── + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="run_evaluation", + description="AIDLC Evaluation Suite — execute workflow + score outputs via Bedrock", + ) + + parser.add_argument( + "--test", action="store_true", + help="Run unit tests for all packages and exit", + ) + + # Scenario selection + parser.add_argument( + "--scenario", type=str, default="sci-calc", + help="Scenario name or path to test case directory (default: sci-calc)", + ) + + # Execution config + parser.add_argument( + "--config", type=Path, + default=REPO_ROOT / "config" / "default.yaml", + help="Path to YAML config file for the execution stage (default: config/default.yaml)", + ) + + # Execution inputs (override scenario defaults) + parser.add_argument( + "--vision", type=Path, default=None, + help="Path to vision markdown file (default: from scenario)", + ) + parser.add_argument( + "--tech-env", type=Path, default=None, + help="Path to technical environment markdown file (default: from scenario)", + ) + + # Evaluation-only mode + parser.add_argument( + "--evaluate-only", type=Path, metavar="AIDLC_DOCS_PATH", + help="Skip execution; evaluate an existing run's aidlc-docs against the golden baseline", + ) + + # Golden reference (override scenario default) + parser.add_argument( + "--golden", 
type=Path, default=None, + help="Path to golden aidlc-docs directory (default: from scenario)", + ) + + # OpenAPI specification (override scenario default) + parser.add_argument( + "--openapi", type=Path, default=None, + help="Path to OpenAPI 3.x spec with x-test-cases (default: from scenario)", + ) + + # Sandbox + sandbox_group = parser.add_mutually_exclusive_group() + sandbox_group.add_argument( + "--sandbox", action="store_true", default=True, + help="Run generated code in a Docker sandbox (default)", + ) + sandbox_group.add_argument( + "--no-sandbox", action="store_false", dest="sandbox", + help="Run generated code directly on the host (no isolation)", + ) + + # Report format + parser.add_argument( + "--report-format", choices=["markdown", "html", "both"], default="both", + help="Report output format (default: both)", + ) + + # Baseline for regression comparison (auto-discovered from scenario) + parser.add_argument( + "--baseline", type=Path, default=None, + help="Path to golden.yaml for regression comparison (default: from scenario)", + ) + + # Output + parser.add_argument( + "--output-dir", type=Path, + help="Override run output directory", + ) + parser.add_argument( + "--results", type=Path, + help="Write qualitative results YAML to this path", + ) + + # AWS / Bedrock + parser.add_argument("--profile", default=None, help="AWS profile (default: from config YAML)") + parser.add_argument("--region", default=None, help="AWS region (default: from config YAML)") + parser.add_argument( + "--rules-ref", default=None, + help="Git ref (branch/tag/commit) for AIDLC rules (overrides config value)", + ) + parser.add_argument( + "--executor-model", default=None, + help="Override executor model ID", + ) + parser.add_argument( + "--scorer-model", default=None, + help="Bedrock model for qualitative scoring (default: from config YAML)", + ) + + return parser + + +def main() -> None: + parser = build_parser() + args = parser.parse_args() + + # ── Test mode 
──────────────────────────────────────────────────────── + if args.test: + rc = stage_test() + sys.exit(rc) + + # ── Resolve defaults from config YAML when not provided on CLI ────── + cfg_data: dict = {} + cfg_path = args.config + if cfg_path and cfg_path.exists(): + with open(cfg_path, encoding="utf-8") as f: + cfg_data = yaml.safe_load(f) or {} + + if args.profile is None: + args.profile = cfg_data.get("aws", {}).get("profile") + if args.region is None: + args.region = cfg_data.get("aws", {}).get("region") + if args.scorer_model is None: + args.scorer_model = ( + cfg_data.get("models", {}).get("scorer", {}).get("model_id") + ) + if args.scorer_model is None: + parser.error( + "--scorer-model is required (or set models.scorer.model_id in config YAML)" + ) + + # ── Optional tool paths from config ───────────────────────────────── + pmd_path = cfg_data.get("tools", {}).get("pmd_path") + + # ── Resolve scenario and apply defaults ────────────────────────────── + scenario = resolve_scenario(args.scenario, TEST_CASES_DIR) + if args.vision is None: + args.vision = scenario.vision_path + if args.tech_env is None: + args.tech_env = scenario.tech_env_path + if args.golden is None: + args.golden = scenario.golden_aidlc_docs_path + if args.openapi is None: + args.openapi = scenario.openapi_path + if args.baseline is None: + candidate = scenario.golden_baseline_path + if candidate.is_file(): + args.baseline = candidate + + # Store scenario name for output path routing + args._scenario_name = scenario.name + + # ── Evaluate-only mode ─────────────────────────────────────────────── + if args.evaluate_only: + if not args.evaluate_only.is_dir(): + print(f"Error: path not found: {args.evaluate_only}", file=sys.stderr) + sys.exit(1) + if not args.golden.is_dir(): + print(f"Error: golden baseline not found: {args.golden}", file=sys.stderr) + sys.exit(1) + + run_folder = args.evaluate_only.parent + + # Save evaluation config and repo info + _save_evaluation_config(run_folder, args, 
cfg_data) + + # Show test results if evaluating a run folder that has them + test_results_path = run_folder / "test-results.yaml" + test_status = _read_test_status(test_results_path) + if test_status: + _print_test_summary(test_status) + + # Run quantitative analysis if workspace exists + workspace = run_folder / "workspace" + if workspace.is_dir(): + quality_report_path = run_folder / "quality-report.yaml" + quality_data = stage_quantitative(workspace, quality_report_path, pmd_path=pmd_path) + quality_status = _read_quality_status(quality_data) + if quality_status and not quality_status["passed"]: + print(f"\n ** CODE QUALITY WARNINGS: {quality_status['summary']} **") + + # Run contract tests if spec exists and workspace is present + if workspace.is_dir() and args.openapi and args.openapi.is_file(): + contract_report_path = run_folder / "contract-test-results.yaml" + contract_data = stage_contract( + workspace, args.openapi, contract_report_path, + use_sandbox=args.sandbox, + ) + contract_status = _read_contract_status(contract_data) + if contract_status: + _clabel = "PASS" if contract_status["passed"] else "FAIL" + print(f"\n Contract tests: {_clabel} ({contract_status['summary']})") + + stage_qualitative( + candidate_docs=args.evaluate_only, + golden_docs=args.golden, + output_path=args.results, + profile=args.profile, + region=args.region, + model_id=args.scorer_model, + ) + + stage_report(run_folder, args.report_format, args.baseline) + # Evaluation completed successfully - exit 0 + sys.exit(0) + + # ── Full pipeline ──────────────────────────────────────────────────── + if not args.vision.exists(): + print(f"Error: vision file not found: {args.vision}", file=sys.stderr) + sys.exit(1) + if not args.golden.is_dir(): + print(f"Error: golden baseline not found: {args.golden}", file=sys.stderr) + sys.exit(1) + + print("AIDLC Evaluation Suite") + print(f" Scenario: {scenario.name}") + print(f" Config: {args.config}") + print(f" Vision: {args.vision}") + print(f" 
Tech-env: {args.tech_env or '(none)'}") + print(f" OpenAPI: {args.openapi or '(none)'}") + print(f" Golden: {args.golden}") + print(f" Baseline: {args.baseline or '(none)'}") + print(f" Profile: {args.profile}") + print(f" Region: {args.region}") + print(f" Sandbox: {'enabled' if args.sandbox else 'disabled'}") + + # Stage 1: Execute the AIDLC workflow + run_folder = stage_execute(args) + if run_folder is None: + print("\n[ABORT] Execution stage failed.", file=sys.stderr) + sys.exit(1) + + print(f"\nRun completed: {run_folder}") + + # Save evaluation config and repo info + _save_evaluation_config(run_folder, args, cfg_data) + + # Stage 2: Post-run test results (executed inside the runner) + test_results_path = run_folder / "test-results.yaml" + test_status = _read_test_status(test_results_path) + _print_test_summary(test_status) + + # Stage 3: Code quality (lint + security) + quality_report_path = run_folder / "quality-report.yaml" + workspace = run_folder / "workspace" + quality_data = stage_quantitative(workspace, quality_report_path, pmd_path=pmd_path) + quality_status = _read_quality_status(quality_data) + + # Stage 4: API contract tests + contract_status: dict | None = None + if args.openapi and args.openapi.is_file(): + contract_report_path = run_folder / "contract-test-results.yaml" + contract_data = stage_contract( + workspace, args.openapi, contract_report_path, + use_sandbox=args.sandbox, + ) + contract_status = _read_contract_status(contract_data) + + # Stage 5: Qualitative evaluation + candidate_docs = run_folder / "aidlc-docs" + results_path = args.results or (run_folder / "qualitative-comparison.yaml") + + qual_rc = stage_qualitative( + candidate_docs=candidate_docs, + golden_docs=args.golden, + output_path=results_path, + profile=args.profile, + region=args.region, + model_id=args.scorer_model, + ) + + # Stage 6: Consolidated report + stage_report(run_folder, args.report_format, args.baseline) + + # ── Summary 
────────────────────────────────────────────────────────── + print(f"\n{'=' * 60}") + print(" Evaluation Complete") + print(f"{'=' * 60}") + print(f" Run folder: {run_folder}") + print(f" Run metrics: {run_folder / 'run-metrics.yaml'}") + if test_status: + _label = "PASS" if test_status["tests_passed"] else "FAIL" + print(f" Post-run tests: {_label} ({test_status['summary']})") + else: + print(f" Post-run tests: N/A (no testable project detected)") + if quality_status: + _qlabel = "PASS" if quality_status["passed"] else "WARN" + print(f" Code quality: {_qlabel} ({quality_status['summary']})") + else: + print(f" Code quality: N/A (no workspace or tools unavailable)") + if contract_status: + _clabel = "PASS" if contract_status["passed"] else "FAIL" + print(f" Contract tests: {_clabel} ({contract_status['summary']})") + else: + print(f" Contract tests: N/A (no --openapi provided)") + print(f" Qualitative comparison: {results_path}") + print(f" Qualitative score: (see above)") + + # Report quality findings (not technical failures) + findings = [] + if test_status and not test_status["tests_passed"]: + findings.append(f"post-run tests ({test_status['summary']})") + if contract_status and not contract_status["passed"]: + findings.append(f"contract tests ({contract_status['summary']})") + if quality_status and not quality_status["passed"]: + findings.append(f"code quality ({quality_status['summary']})") + + if findings: + print(f"\n ** QUALITY ISSUES FOUND: {'; '.join(findings)} **") + + # Evaluation completed successfully - exit 0 regardless of quality findings + # (Only exit 1 on technical failures like missing run folder, scorer crash, etc.) 
+ sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_extension_test.py b/scripts/aidlc-evaluator/scripts/run_extension_test.py new file mode 100644 index 00000000..ee45c5e2 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_extension_test.py @@ -0,0 +1,607 @@ +#!/usr/bin/env python3 +"""Extension Hook Test Runner — test AIDLC evaluations with different extension opt-in configurations. + +This script tests the progressive loading of AIDLC rules extensions by running +evaluations with different opt-in answers. It runs multiple evaluations and +generates a comparison report showing the impact of different extension configurations. + +The extension hook feature (feat/extension_hook_question_split branch) adds opt-in +questions for rules extensions like security-baseline.opt-in.md. + +Usage: + # Run standard comparison (all yes vs all no) + python run_extension_test.py --scenario sci-calc + + # Run with custom configurations + python run_extension_test.py --scenario sci-calc \ + --configs baseline,security-only,performance-only + + # Use specific rules branch + python run_extension_test.py --scenario sci-calc \ + --rules-ref feat/extension_hook_question_split + + # Compare specific extension sets + python run_extension_test.py --scenario sci-calc \ + --extensions security,performance,observability + +Reference: + - Extension hook feature: https://github.com/awslabs/aidlc-workflows/tree/feat/extension_hook_question_split + - Opt-in example: https://github.com/awslabs/aidlc-workflows/blob/feat/extension_hook_question_split/aidlc-rules/aws-aidlc-rule-details/extensions/security/baseline/security-baseline.opt-in.md +""" + +from __future__ import annotations + +import argparse +import shutil +import subprocess +import sys +import time +from datetime import UTC, datetime +from pathlib import Path + +import yaml + +REPO_ROOT = Path(__file__).resolve().parent.parent +CONFIG_DIR = REPO_ROOT / "config" +DEFAULT_CONFIG = 
CONFIG_DIR / "default.yaml" +TEST_CASES_DIR = REPO_ROOT / "test_cases" +SCRIPTS_DIR = REPO_ROOT / "scripts" + +# Add shared and reporting packages to path +sys.path.insert(0, str(REPO_ROOT / "packages" / "shared" / "src")) +sys.path.insert(0, str(REPO_ROOT / "packages" / "reporting" / "src")) +from shared.scenario import resolve_scenario # noqa: E402 +from reporting.baseline import BaselineMetrics, extract_baseline # noqa: E402 +from reporting.collector import collect # noqa: E402 + +# Default extension configurations to test +DEFAULT_CONFIGS = { + "all-extensions": { + "name": "All Extensions Enabled", + "description": "All extension opt-ins answered YES", + "opt_in_default": "yes", + }, + "no-extensions": { + "name": "No Extensions", + "description": "All extension opt-ins answered NO (baseline only)", + "opt_in_default": "no", + }, +} + + +def run_evaluation_with_config( + config_name: str, + config_spec: dict, + vision: Path, + tech_env: Path | None, + golden: Path, + openapi: Path, + baseline: Path | None, + base_config: Path, + runs_dir: Path, + profile: str, + region: str, + rules_ref: str, + executor_model: str | None, + scorer_model: str, +) -> dict: + """Run evaluation with a specific extension configuration. 
+ + Args: + config_name: Short name for this config (e.g., "all-extensions") + config_spec: Configuration specification with opt-in settings + vision: Path to vision markdown + tech_env: Optional path to tech-env markdown + golden: Path to golden aidlc-docs directory + openapi: Path to OpenAPI spec + baseline: Optional path to golden.yaml baseline + base_config: Path to base config YAML + runs_dir: Base directory for run outputs + profile: AWS profile name + region: AWS region + rules_ref: Git ref for AIDLC rules (should include extension hook support) + executor_model: Optional executor model override + scorer_model: Bedrock model ID for scoring + + Returns: + dict: Summary of the run with status and metrics + """ + print(f"\n{'=' * 70}") + print(f" Configuration: {config_spec['name']}") + print(f" Description: {config_spec['description']}") + print(f" Opt-in Default: {config_spec.get('opt_in_default', 'N/A')}") + print(f"{'=' * 70}\n") + + # Build command to run evaluation + cmd = [ + sys.executable, str(SCRIPTS_DIR / "run_evaluation.py"), + "--config", str(base_config), + "--vision", str(vision), + "--golden", str(golden), + "--openapi", str(openapi), + "--scorer-model", scorer_model, + "--rules-ref", rules_ref, + "--report-format", "both", + "--output-dir", str(runs_dir), + ] + if profile: + cmd += ["--profile", profile] + if region: + cmd += ["--region", region] + + if tech_env and tech_env.is_file(): + cmd += ["--tech-env", str(tech_env)] + if baseline and baseline.is_file(): + cmd += ["--baseline", str(baseline)] + if executor_model: + cmd += ["--executor-model", executor_model] + + # NOTE: Extension opt-in configuration mechanism + # =============================================== + # The extension hook feature is still under development. Once the + # mechanism for controlling opt-in answers is finalized, we'll add + # the appropriate flags or environment variables here. + # + # Possible approaches: + # 1. 
Environment variable: AIDLC_EXTENSION_OPT_IN=yes|no|auto + # 2. Config file field: aidlc.extension_opt_in_default + # 3. CLI flag: --extension-opt-in yes|no|prompt + # 4. Answer file: --extension-answers answers.yaml + # + # For now, document the configuration in run metadata. + + # Note existing runs so we can find the new one after execution + existing_runs = set() + if runs_dir.is_dir(): + existing_runs = {d.name for d in runs_dir.iterdir() if d.is_dir()} + + # Log to temp location, will move to run folder after + runs_dir.mkdir(parents=True, exist_ok=True) + log_dir = runs_dir / ".extension-test-logs" + log_dir.mkdir(parents=True, exist_ok=True) + log_path = log_dir / f"{config_name}.log" + + # Run evaluation + start = time.monotonic() + started_at = datetime.now(UTC).isoformat(timespec="seconds") + with open(log_path, "w", encoding="utf-8") as log_file: + # nosec B603 - Executing trusted run_evaluation.py with extension config + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run( + cmd, + stdout=log_file, + stderr=subprocess.STDOUT, + ) + + elapsed_s = time.monotonic() - start + elapsed_min = elapsed_s / 60 + + status = "success" if result.returncode == 0 else "failed" + print(f" [{status.upper()}] {config_name} — {elapsed_min:.1f} min (exit code {result.returncode})") + + # Find the new run folder (timestamped dir that didn't exist before) + run_folder: Path | None = None + if runs_dir.is_dir(): + new_dirs = [ + d for d in runs_dir.iterdir() + if d.is_dir() and d.name not in existing_runs and d.name[0:1].isdigit() + ] + if new_dirs: + run_folder = sorted(new_dirs, reverse=True)[0] + + # Rename run folder to include config name + if run_folder: + output_dir = runs_dir / f"{run_folder.name}-ext-{config_name}" + if output_dir.exists(): + shutil.rmtree(output_dir) + run_folder.rename(output_dir) + print(f" Output: {output_dir}") + + # Move log into the run folder + shutil.move(str(log_path), str(output_dir / "extension-test.log")) + + # Write 
configuration metadata to the run folder + timestamp = run_folder.name.split('-')[0] + config_meta = { + "extension_test_config": config_name, + "extension_config_spec": config_spec, + "rules_ref": rules_ref, + "run_timestamp": timestamp, + } + with open(output_dir / "extension-test-config.yaml", "w", encoding="utf-8") as f: + yaml.safe_dump(config_meta, f, default_flow_style=False, sort_keys=False) + else: + print(f" [WARN] No run folder found after execution") + # Create a placeholder directory for the log + timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%S") + output_dir = runs_dir / f"{timestamp}-ext-{config_name}-failed" + output_dir.mkdir(parents=True, exist_ok=True) + shutil.move(str(log_path), str(output_dir / "extension-test.log")) + print(f" Log saved to: {output_dir}") + + # Clean up temp log dir if empty + if log_dir.exists() and not any(log_dir.iterdir()): + log_dir.rmdir() + + # Return summary + return { + "config_name": config_name, + "config_display_name": config_spec["name"], + "config_description": config_spec["description"], + "started_at": started_at, + "elapsed_seconds": round(elapsed_s, 1), + "exit_code": result.returncode, + "status": status, + "output_dir": str(output_dir), + } + + +def load_config_metrics(run_folder: Path) -> BaselineMetrics | None: + """Load evaluation metrics from an extension test run folder.""" + try: + data = collect(run_folder) + return extract_baseline(data) + except Exception as e: + print(f" [WARN] Failed to collect metrics from {run_folder}: {e}", file=sys.stderr) + return None + + +def format_num(val: float | int | None, decimals: int = 1) -> str: + """Format a number for display.""" + if val is None: + return "—" + if isinstance(val, float): + return f"{val:.{decimals}f}" + return str(val) + + +def generate_extension_comparison( + runs_dir: Path, + results: list[dict], + scenario_name: str, +) -> None: + """Generate a detailed comparison report for extension test runs. 
+ + Args: + runs_dir: Base directory containing all test runs + results: List of run summaries + scenario_name: Name of the test scenario + """ + comparison_dir = runs_dir / "extension-comparison" + comparison_dir.mkdir(parents=True, exist_ok=True) + + # Write extension test summary + summary = { + "generated_at": datetime.now(UTC).isoformat(timespec="seconds"), + "scenario": scenario_name, + "total_runs": len(results), + "runs": results, + } + + summary_path = comparison_dir / "extension-test-summary.yaml" + with open(summary_path, "w", encoding="utf-8") as f: + yaml.safe_dump(summary, f, default_flow_style=False, sort_keys=False) + + print(f"\n Extension test summary: {summary_path}") + + # Load metrics from each run (regardless of status - we want to compare quality) + config_metrics: dict[str, BaselineMetrics] = {} + for result in results: + run_folder = Path(result["output_dir"]) + if not run_folder.is_dir(): + continue + print(f" Loading metrics: {result['config_name']} (status: {result['status']})...") + metrics = load_config_metrics(run_folder) + if metrics: + config_metrics[result['config_name']] = metrics + + # Generate detailed metrics comparison report + report_lines = [ + "# Extension Hook Test Report", + "", + f"**Scenario:** {scenario_name}", + f"**Generated:** {summary['generated_at']}", + "", + "## Test Configurations", + "", + ] + + # Configuration summary + for result in results: + status_icon = "✅" if result["status"] == "success" else "❌" + report_lines.extend([ + f"### {status_icon} {result['config_display_name']}", + "", + f"- **Config ID:** `{result['config_name']}`", + f"- **Description:** {result['config_description']}", + f"- **Status:** {result['status'].upper()}", + f"- **Duration:** {result.get('elapsed_seconds', 0) / 60:.1f} minutes", + f"- **Output:** `{result['output_dir']}`", + "", + ]) + + # Detailed metrics comparison (if we have metrics) + if config_metrics: + report_lines.extend([ + "", + "## Detailed Metrics Comparison", + "", 
+ ]) + + # Build columns (sorted by config name) + columns = sorted(config_metrics.items()) + + # Header + header = "| Metric |" + separator = "|--------|" + for config_name, _ in columns: + header += f" {config_name} |" + separator += "---------|" + report_lines.append(header) + report_lines.append(separator) + + # Metric rows + metric_rows: list[tuple[str, str, bool]] = [ + # (display_name, attr_name, higher_is_better) + ("**Unit Tests**", "", True), + ("Pass %", "tests_pass_pct", True), + ("Passed", "tests_passed", True), + ("Failed", "tests_failed", False), + ("Total", "tests_total", True), + ("Coverage %", "coverage_pct", True), + ("**Contract Tests**", "", True), + ("Passed", "contract_passed", True), + ("Failed", "contract_failed", False), + ("Total", "contract_total", True), + ("**Code Quality**", "", True), + ("Lint Errors", "lint_errors", False), + ("Lint Warnings", "lint_warnings", False), + ("Lint Total", "lint_total", False), + ("Security Findings", "security_total", False), + ("Security High", "security_high", False), + ("Duplication Blocks", "duplication_blocks", False), + ("**Qualitative**", "", True), + ("Overall Score", "qualitative_score", True), + ("Inception Score", "inception_score", True), + ("Construction Score", "construction_score", True), + ("**Artifacts**", "", True), + ("Source Files", "source_files", True), + ("Test Files", "test_files", True), + ("Total Files", "total_files", True), + ("Lines of Code", "lines_of_code", True), + ("Doc Files", "doc_files", True), + ("**Execution**", "", True), + ("Total Tokens", "total_tokens", False), + ("Executor Tokens", "executor_total_tokens", False), + ("Simulator Tokens", "simulator_total_tokens", False), + ("Wall Clock (min)", "wall_clock_min", False), + ("Handoffs", "handoffs", False), + ("**Context Size**", "", True), + ("Max Tokens", "context_size_max", False), + ("Avg Tokens", "context_size_avg", False), + ("Median Tokens", "context_size_median", False), + ] + + for display_name, attr, 
higher_is_better in metric_rows: + if not attr: + # Section header row + row = f"| {display_name} |" + for _ in columns: + row += " |" + report_lines.append(row) + continue + + row = f"| {display_name} |" + for config_name, metrics in columns: + if attr == "wall_clock_min": + val = metrics.wall_clock_ms / 60000 if metrics.wall_clock_ms else None + else: + val = getattr(metrics, attr, None) + row += f" {format_num(val)} |" + report_lines.append(row) + + report_lines.append("") + + # Next steps + report_lines.extend([ + "", + "## Next Steps", + "", + "1. Review the individual run reports in each output directory", + "2. Compare qualitative scores between configurations", + "3. Examine differences in generated artifacts", + "4. Analyze the impact of extension opt-ins on output quality", + "", + ]) + + report_path = comparison_dir / "extension-test-report.md" + report_path.write_text("\n".join(report_lines), encoding="utf-8") + print(f" Extension test report: {report_path}") + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="run_extension_test", + description="Test AIDLC evaluation with different extension opt-in configurations", + ) + + # Scenario and test configuration + parser.add_argument( + "--scenario", type=str, default="sci-calc", + help="Scenario name or path to test case directory (default: sci-calc)", + ) + parser.add_argument( + "--configs", type=str, default=None, + help='Comma-separated config names to test (default: "all-extensions,no-extensions")', + ) + parser.add_argument( + "--list-configs", action="store_true", + help="List available extension configurations and exit", + ) + + # Rules configuration + parser.add_argument( + "--rules-ref", default="feat/extension_hook_question_split", + help="Git ref for AIDLC rules with extension hook support (default: feat/extension_hook_question_split)", + ) + + # Base configuration + parser.add_argument( + "--config", type=Path, default=DEFAULT_CONFIG, + help="Base config YAML (default: 
config/default.yaml)", + ) + parser.add_argument( + "--profile", default=None, + help="AWS profile (default: from config YAML)", + ) + parser.add_argument( + "--region", default=None, + help="AWS region (default: from config YAML)", + ) + parser.add_argument( + "--executor-model", default=None, + help="Override executor model ID", + ) + parser.add_argument( + "--scorer-model", default=None, + help="Bedrock model for scoring (default: from config YAML)", + ) + + # Scenario overrides + parser.add_argument("--vision", type=Path, default=None) + parser.add_argument("--tech-env", type=Path, default=None) + parser.add_argument("--golden", type=Path, default=None) + parser.add_argument("--openapi", type=Path, default=None) + parser.add_argument("--baseline", type=Path, default=None) + + # Output configuration + parser.add_argument( + "--runs-dir", type=Path, default=None, + help="Base directory for run outputs (default: runs/<scenario>/extension-test)", + ) + + args = parser.parse_args() + + # List configs mode + if args.list_configs: + print("Available extension test configurations:\n") + for name, spec in DEFAULT_CONFIGS.items(): + print(f" {name:20s} {spec['name']}") + print(f" {' ' * 20} {spec['description']}") + print() + sys.exit(0) + + # Resolve scenario and apply defaults + scenario = resolve_scenario(args.scenario, TEST_CASES_DIR) + if args.vision is None: + args.vision = scenario.vision_path + if args.tech_env is None: + args.tech_env = scenario.tech_env_path + if args.golden is None: + args.golden = scenario.golden_aidlc_docs_path + if args.openapi is None: + args.openapi = scenario.openapi_path + if args.baseline is None: + candidate = scenario.golden_baseline_path + if candidate.is_file(): + args.baseline = candidate + + # Route runs under runs/<scenario>/extension-test/ + if args.runs_dir is None: + args.runs_dir = REPO_ROOT / "runs" / scenario.name / "extension-test" + + # Load base config for defaults + base_cfg = {} + if args.config and args.config.exists(): + with 
open(args.config, encoding="utf-8") as f: + base_cfg = yaml.safe_load(f) or {} + + if args.profile is None: + args.profile = base_cfg.get("aws", {}).get("profile") + if args.region is None: + args.region = base_cfg.get("aws", {}).get("region") + + if args.scorer_model is None: + args.scorer_model = base_cfg.get("models", {}).get("scorer", {}).get("model_id") + if args.scorer_model is None: + parser.error("--scorer-model is required (or set models.scorer.model_id in config YAML)") + + # Select configurations to test + if args.configs: + selected = [c.strip() for c in args.configs.split(",")] + configs = {k: v for k, v in DEFAULT_CONFIGS.items() if k in selected} + missing = set(selected) - set(configs.keys()) + if missing: + parser.error(f"Unknown configs: {', '.join(missing)}. Use --list-configs to see available options.") + else: + configs = DEFAULT_CONFIGS + + # Print test plan + print("Extension Hook Test Plan") + print(f" Scenario: {scenario.name}") + print(f" Rules Ref: {args.rules_ref}") + print(f" Profile: {args.profile}") + print(f" Region: {args.region}") + print(f" Scorer: {args.scorer_model}") + print(f" Configs: {', '.join(configs.keys())}") + print(f" Vision: {args.vision}") + print(f" Golden: {args.golden}") + + # Run each configuration + test_start = time.monotonic() + test_started_at = datetime.now(UTC).isoformat(timespec="seconds") + results: list[dict] = [] + + for config_name, config_spec in configs.items(): + try: + summary = run_evaluation_with_config( + config_name=config_name, + config_spec=config_spec, + vision=args.vision, + tech_env=args.tech_env, + golden=args.golden, + openapi=args.openapi, + baseline=args.baseline, + base_config=args.config, + runs_dir=args.runs_dir, + profile=args.profile, + region=args.region, + rules_ref=args.rules_ref, + executor_model=args.executor_model, + scorer_model=args.scorer_model, + ) + results.append(summary) + except Exception as e: + print(f"\n[ERROR] Failed to run config {config_name}: {e}", 
file=sys.stderr) + results.append({ + "config_name": config_name, + "config_display_name": config_spec["name"], + "config_description": config_spec["description"], + "status": "error", + "error": str(e), + }) + + test_elapsed = time.monotonic() - test_start + + # Generate comparison report + generate_extension_comparison(args.runs_dir, results, scenario.name) + + # Print final summary + print(f"\n{'=' * 70}") + print(f" Extension Test Complete") + print(f"{'=' * 70}") + print(f" Total time: {test_elapsed / 60:.1f} min") + print(f" Configurations tested: {len(results)}") + for r in results: + marker = "PASS" if r.get("status") == "success" else "FAIL" + duration = r.get("elapsed_seconds", 0) / 60 + print(f" [{marker}] {r['config_display_name']:30s} {duration:.1f} min") + print(f"\n Results: {args.runs_dir}") + + # Exit with error if any config failed + failed = sum(1 for r in results if r.get("status") != "success") + sys.exit(1 if failed > 0 else 0) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_ide_evaluation.py b/scripts/aidlc-evaluator/scripts/run_ide_evaluation.py new file mode 100644 index 00000000..ba459d5a --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_ide_evaluation.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +"""Run AIDLC evaluation through an IDE adapter. 
+ +Usage: + # List available adapters + python run_ide_evaluation.py --list + + # Run evaluation through Cursor + python run_ide_evaluation.py --ide cursor \ + --vision test_cases/sci-calc/vision.md \ + --golden test_cases/sci-calc/golden-aidlc-docs + + # Check prerequisites for an IDE + python run_ide_evaluation.py --ide kiro --check-only +""" + +from __future__ import annotations + +import argparse +import os +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parent.parent +PACKAGES = REPO_ROOT / "packages" +TEST_CASES_DIR = REPO_ROOT / "test_cases" + +# Add ide-harness and shared packages to path +sys.path.insert(0, str(PACKAGES / "ide-harness" / "src")) +sys.path.insert(0, str(PACKAGES / "shared" / "src")) + +from ide_harness.registry import get_adapter, list_adapters # noqa: E402 +from ide_harness.orchestrator import run_ide_evaluation # noqa: E402 +from shared.scenario import resolve_scenario # noqa: E402 + + +def main() -> None: + parser = argparse.ArgumentParser( + prog="run_ide_evaluation", + description="Run AIDLC evaluation through an IDE AI assistant", + ) + parser.add_argument( + "--ide", type=str, + help="IDE adapter name (e.g., cursor, cline, kiro)", + ) + parser.add_argument( + "--list", action="store_true", + help="List available IDE adapters and exit", + ) + parser.add_argument( + "--check-only", action="store_true", + help="Only check IDE prerequisites, don't run evaluation", + ) + parser.add_argument( + "--scenario", type=str, default="sci-calc", + help="Scenario name or path to test case directory (default: sci-calc)", + ) + parser.add_argument("--vision", type=Path, default=None) + parser.add_argument("--tech-env", type=Path, default=None) + parser.add_argument("--golden", type=Path, default=None) + parser.add_argument("--openapi", type=Path, default=None) + parser.add_argument("--baseline", type=Path, default=None) + parser.add_argument("--rules", type=Path, default=None, help="Path to AIDLC rules directory") + 
parser.add_argument("--output-dir", type=Path, default=None) + parser.add_argument("--profile", default=None, help="AWS profile (default: from config YAML)") + parser.add_argument("--region", default=None, help="AWS region (default: from config YAML)") + parser.add_argument("--scorer-model", default="us.anthropic.claude-sonnet-4-5-20250929-v1:0") + + # Sandbox + sandbox_group = parser.add_mutually_exclusive_group() + sandbox_group.add_argument( + "--sandbox", action="store_true", default=True, + help="Run generated code in a Docker sandbox (default)", + ) + sandbox_group.add_argument( + "--no-sandbox", action="store_false", dest="sandbox", + help="Run generated code directly on the host (no isolation)", + ) + + args = parser.parse_args() + + if args.list: + print("Available IDE adapters:") + for name in list_adapters(): + try: + adapter = get_adapter(name) + ok, msg = adapter.check_prerequisites() + status = "ready" if ok else "not ready" + print(f" {name:15s} [{status}] {msg}") + except Exception as e: + print(f" {name:15s} [error] {e}") + sys.exit(0) + + if not args.ide: + parser.error("--ide is required (use --list to see available adapters)") + + adapter = get_adapter(args.ide) + + if args.check_only: + ok, msg = adapter.check_prerequisites() + print(f"{adapter.name}: {'OK' if ok else 'FAIL'} — {msg}") + sys.exit(0 if ok else 1) + + # Resolve scenario and apply defaults + scenario = resolve_scenario(args.scenario, TEST_CASES_DIR) + if args.vision is None: + args.vision = scenario.vision_path + if args.tech_env is None: + args.tech_env = scenario.tech_env_path + if args.golden is None: + args.golden = scenario.golden_aidlc_docs_path + if args.openapi is None: + args.openapi = scenario.openapi_path + if args.baseline is None: + candidate = scenario.golden_baseline_path + if candidate.is_file(): + args.baseline = candidate + + output_dir = args.output_dir or REPO_ROOT / "runs" / scenario.name / f"ide-{args.ide.lower()}" + rules_path = args.rules or REPO_ROOT / 
"aidlc-rules" + + result, eval_rc = run_ide_evaluation( + adapter=adapter, + vision_path=args.vision, + output_dir=output_dir, + golden_docs=args.golden, + rules_path=rules_path, + tech_env_path=args.tech_env, + openapi_path=args.openapi, + baseline_path=args.baseline, + profile=args.profile, + region=args.region, + scorer_model=args.scorer_model, + use_sandbox=args.sandbox, + ) + + if not result.success: + print(f"\n[FAILED] {adapter.name}: {result.error}") + sys.exit(1) + + print(f"\n[DONE] {adapter.name} evaluation complete.") + print(f" Output: {result.output_dir}") + sys.exit(eval_rc) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/scripts/run_trend_report.py b/scripts/aidlc-evaluator/scripts/run_trend_report.py new file mode 100644 index 00000000..87d6d847 --- /dev/null +++ b/scripts/aidlc-evaluator/scripts/run_trend_report.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +"""Generate trend report across AIDLC rules releases. + +This script is invoked by run.py and delegates to the trend_reports package. 
+ +Usage: + python run_trend_report.py --baseline test_cases/sci-calc/golden.yaml + python run_trend_report.py --baseline test_cases/sci-calc/golden.yaml --format html --gate +""" + +from __future__ import annotations + +import subprocess +import sys + + +def main() -> None: + cmd = [sys.executable, "-m", "trend_reports", "trend"] + sys.argv[1:] + # nosec B603 - Executing trusted framework package + # nosemgrep: dangerous-subprocess-use-audit + result = subprocess.run(cmd) + sys.exit(result.returncode) + + +if __name__ == "__main__": + main() diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/README.md b/scripts/aidlc-evaluator/test_cases/all-stages/README.md new file mode 100644 index 00000000..0c10fb64 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/README.md @@ -0,0 +1,204 @@ +# Test Case: all-stages + +## Purpose + +Force the adaptive AI-DLC workflow to execute **every conditional stage** rather +than skipping any. The sci-calc test case is deliberately simple (single unit, +no user stories, no NFR design, no infrastructure design). This test case is +designed so the AI model cannot reasonably skip anything. 
+ +## Stages Triggered + +| Stage | Trigger Mechanism | +|---|---| +| Workspace Detection | Always executes | +| Reverse Engineering | **Not triggered** -- requires brownfield (see note below) | +| Requirements Analysis | Always executes | +| User Stories | 3 distinct user personas with different workflows | +| Workflow Planning | Always executes | +| Application Design | New components, services, and business rules | +| Units Generation | 2 modules requiring decomposition | +| Functional Design (per unit) | Complex business rules (lending limits, fees, holds) | +| NFR Requirements (per unit) | Explicit performance SLAs, security, scalability | +| NFR Design (per unit) | Resilience, caching, rate limiting patterns required | +| Infrastructure Design (per unit) | AWS services specified (Lambda, DynamoDB, Cognito, SQS) | +| Code Generation (per unit) | Always executes | +| Build and Test | Always executes | + +## Files + +| File | Description | +|---|---| +| `vision.md` | Project vision -- BookShelf community library API | +| `tech-env.md` | Technical environment -- Python/FastAPI, AWS serverless | +| `openapi.yaml` | API contract with `x-test-cases` for contract testing | +| `golden-aidlc-docs/` | Golden reference docs (created after first successful run) | +| `golden.yaml` | Baseline metrics from golden run (created after promotion) | + +--- + +## Running with run_evaluation.py + +`run_evaluation.py` orchestrates the full six-stage evaluation pipeline. Since +the defaults all point to `test_cases/sci-calc/`, you must override the input +paths for this test case. + +### First run (no golden baseline yet) + +On the very first run there is no `golden-aidlc-docs/` to compare against. 
+Run execution only, skip the qualitative comparison, and inspect the output +manually: + +```bash +# Pick a model config from config/ (e.g., opus.yaml, sonnet-4-5.yaml) +python run_evaluation.py \ + --config config/opus.yaml \ + --vision test_cases/all-stages/vision.md \ + --tech-env test_cases/all-stages/tech-env.md \ + --openapi test_cases/all-stages/openapi.yaml \ + --golden test_cases/all-stages/golden-aidlc-docs +``` + +The `--golden` path won't exist yet, so the qualitative stage will fail. That +is expected. The run itself will still produce: + +``` +runs/<timestamp>/ + aidlc-docs/ # Generated AIDLC documentation + workspace/ # Generated application code + test-results.yaml # Post-run unit test results + run-meta.yaml # Execution metadata +``` + +### Promoting a run to golden + +After inspecting a successful run and confirming the outputs look correct: + +```bash +# Copy the aidlc-docs from the run into the test case as the golden reference +cp -r runs/<timestamp>/aidlc-docs test_cases/all-stages/golden-aidlc-docs +``` + +To create a `golden.yaml` baseline for regression comparison, capture the key +metrics from the run (see `test_cases/sci-calc/golden.yaml` for the format). + +### Subsequent runs (with golden baseline) + +Once `golden-aidlc-docs/` exists, the full pipeline works end-to-end: + +```bash +python run_evaluation.py \ + --config config/opus.yaml \ + --vision test_cases/all-stages/vision.md \ + --tech-env test_cases/all-stages/tech-env.md \ + --openapi test_cases/all-stages/openapi.yaml \ + --golden test_cases/all-stages/golden-aidlc-docs +``` + +The `--baseline` flag is auto-discovered from `golden.yaml` in the same +directory as `--golden`, so if `test_cases/all-stages/golden.yaml` exists it +will be used automatically for regression comparison. 
+ +### Evaluate an existing run (skip execution) + +To re-score a previous run without re-executing the AIDLC workflow: + +```bash +python run_evaluation.py \ + --evaluate-only runs/<timestamp>/aidlc-docs \ + --golden test_cases/all-stages/golden-aidlc-docs \ + --openapi test_cases/all-stages/openapi.yaml +``` + +### Key CLI flags reference + +| Flag | Purpose | Default | +|---|---|---| +| `--config` | Model config YAML (AWS creds, executor model, swarm settings) | `config/default.yaml` | +| `--vision` | Vision document | `test_cases/sci-calc/vision.md` | +| `--tech-env` | Technical environment document | `test_cases/sci-calc/tech-env.md` | +| `--openapi` | API contract spec with `x-test-cases` | `test_cases/sci-calc/openapi.yaml` | +| `--golden` | Golden reference aidlc-docs for qualitative scoring | `test_cases/sci-calc/golden-aidlc-docs` | +| `--baseline` | `golden.yaml` for regression comparison | Auto-discovered from `--golden` parent | +| `--executor-model` | Override the executor model ID from config | From config YAML | +| `--scorer-model` | Bedrock model for qualitative scoring | From config YAML | +| `--rules-ref` | Git ref for AIDLC rules (branch/tag/commit) | From config YAML | +| `--output-dir` | Override run output directory | `runs/<timestamp>` | +| `--report-format` | `markdown`, `html`, or `both` | `both` | + +--- + +## Running with run_batch_evaluation.py + +The batch runner evaluates multiple models against the same test case. It +calls `run_evaluation.py` once per model, collecting results into per-model +subdirectories under `runs/`. 
+ +```bash +# All models defined in config/*.yaml +python run_batch_evaluation.py \ + --vision test_cases/all-stages/vision.md \ + --tech-env test_cases/all-stages/tech-env.md \ + --openapi test_cases/all-stages/openapi.yaml \ + --golden test_cases/all-stages/golden-aidlc-docs \ + --models all + +# Specific models only +python run_batch_evaluation.py \ + --vision test_cases/all-stages/vision.md \ + --tech-env test_cases/all-stages/tech-env.md \ + --openapi test_cases/all-stages/openapi.yaml \ + --golden test_cases/all-stages/golden-aidlc-docs \ + --models opus sonnet-4-5 + +# List available model configs +python run_batch_evaluation.py --list +``` + +--- + +## Evaluation Pipeline Stages + +For reference, here is what each stage does for this test case: + +| # | Stage | What It Does | +|---|---|---| +| 1 | **Execution** | Runs the two-agent AIDLC workflow (executor + simulator) to produce `aidlc-docs/` and `workspace/` code | +| 2 | **Post-Run Tests** | Installs deps in `workspace/` and runs `uv run pytest` (built into execution stage) | +| 3 | **Quantitative** | Lints generated code with ruff, runs security scans (bandit) | +| 4 | **Contract Tests** | Starts the FastAPI app from `workspace/`, sends requests from `openapi.yaml` `x-test-cases`, validates responses | +| 5 | **Qualitative** | Compares generated `aidlc-docs/` against `golden-aidlc-docs/` using Bedrock for semantic similarity scoring | +| 6 | **Report** | Generates consolidated Markdown + HTML report with all metrics | + +--- + +## Note on Reverse Engineering + +The Reverse Engineering stage only fires for brownfield projects (existing code +detected). The current runner hardcodes `"this is a greenfield project"` in the +initial prompt (`packages/execution/src/aidlc_runner/runner.py` line 224). To +test Reverse Engineering: + +1. Place existing code in an `existing-code/` directory within this test case +2. Modify `runner.py` to copy `existing-code/*` into `workspace/` before execution +3. 
Remove the greenfield assumption from the initial prompt + +This is the only AIDLC stage this test case does not cover. + +--- + +## Design Rationale + +The domain (library book lending) is simple enough that anyone can understand +the business rules, but includes enough structural complexity to force all +conditional stages: + +- **Multiple user personas** (Librarian, Member, Admin) with distinct workflows + force User Stories +- **Two logical modules** (Catalog, Lending) force Application Design and Units + Generation +- **Business rules** (checkout limits, late fees, hold queues, renewals) force + Functional Design +- **Explicit performance targets, security requirements, and scalability + constraints** force NFR Requirements and NFR Design +- **AWS cloud deployment requirements** force Infrastructure Design diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/aidlc-state.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/aidlc-state.md new file mode 100644 index 00000000..bfb663a4 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/aidlc-state.md @@ -0,0 +1,34 @@ +# AI-DLC State Tracking + +## Project Information +- **Project Name**: BookShelf Community Library API +- **Project Type**: Greenfield +- **Start Date**: 2025-01-15T10:00:00Z +- **Current Stage**: CONSTRUCTION - Build and Test (COMPLETE) + +## Workspace State +- **Existing Code**: Yes (generated) +- **Workspace Root**: workspace/ + +## Extension Configuration +| Extension | Enabled | Decided At | +|---|---|---| +| security-baseline | Yes | Requirements Analysis | + +## Stage Progress +- [x] INCEPTION - Workspace Detection (Greenfield detected) +- [x] INCEPTION - Requirements Analysis (Approved) +- [x] INCEPTION - User Stories (27 stories, 3 personas) +- [x] INCEPTION - Workflow Planning (Approved) +- [x] INCEPTION - Application Design (2 services, 13 components) +- [x] INCEPTION - Units 
Generation (Catalog Service + Lending Service) +- [x] CONSTRUCTION - Functional Design (Domain entities + business rules) +- [x] CONSTRUCTION - NFR Requirements (In-memory repos, JWT+bcrypt, structured logging) +- [x] CONSTRUCTION - Code Generation (All code generated for both services) +- [x] CONSTRUCTION - Build and Test (101 tests pass, 93%/87% coverage) + +## Current Status +- **Lifecycle Phase**: CONSTRUCTION (Complete) +- **Build Status**: Success +- **Test Status**: 101/101 tests pass +- **Coverage**: Catalog 93%, Lending 87% diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/audit.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/audit.md new file mode 100644 index 00000000..1b79adff --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/audit.md @@ -0,0 +1,84 @@ +# AI-DLC Audit Log + +## Workflow Start +**Timestamp**: 2025-01-15T10:00:00Z +**User Input**: "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases. The project vision is available at vision.md in the run folder. The technical environment document is available at tech-env.md in the run folder. It defines the required languages, frameworks, cloud services, security controls, testing standards, and prohibited technologies. Follow it as a binding reference during all Construction stages. Start by loading the core workflow rules and the process overview, then execute every stage of the Inception phase followed by every stage of the Construction phase. The workspace directory is 'workspace/' (currently empty — this is a greenfield project). You MUST generate all application code in workspace/ before the workflow is complete. Do NOT stop after requirements — continue through application design, code generation, and build-and-test." +**AI Response**: Loaded core-workflow, process-overview, content-validation, question-format-guide, session-continuity, welcome-message rules. 
Read vision.md and tech-env.md. Loaded security-baseline extension. +**Context**: Workflow initialization + +--- + +## INCEPTION - Workspace Detection +**Timestamp**: 2025-01-15T10:00:30Z +**User Input**: N/A (automated stage) +**AI Response**: Scanned workspace/ - empty. Greenfield project. Created aidlc-state.md. +**Context**: Workspace Detection complete. + +--- + +## INCEPTION - Requirements Analysis — Questions & Answers +**Timestamp**: 2025-01-15T10:01:00Z +**AI Response**: Created 11 clarifying questions. Received answers: Q1:B, Q2:A, Q3:C, Q4:A, Q5:C, Q6:C, Q7:C, Q8:A, Q9:C, Q10:D, Q11:A. Generated requirements.md. +**Context**: Requirements Analysis complete, approved. + +--- + +## INCEPTION - User Stories +**Timestamp**: 2025-01-15T10:03:00Z +**AI Response**: Generated 27 user stories across 7 epics with 3 personas. Approved. +**Context**: User Stories complete. + +--- + +## INCEPTION - Workflow Planning +**Timestamp**: 2025-01-15T10:04:00Z +**AI Response**: Created execution plan. 8 stages to execute, 2 skipped (NFR Design, Infrastructure Design). Approved. +**Context**: Workflow Planning complete. + +--- + +## INCEPTION - Application Design +**Timestamp**: 2025-01-15T10:05:00Z +**AI Response**: Designed 2 services with 13 components total. Created components.md, services.md, component-methods.md, component-dependency.md. Approved. +**Context**: Application Design complete. + +--- + +## INCEPTION - Units Generation +**Timestamp**: 2025-01-15T10:06:00Z +**AI Response**: Decomposed into 2 units: Catalog Service (7 stories) and Lending Service (20 stories). Build order: Catalog first. Approved. +**Context**: Units Generation complete. + +--- + +## CONSTRUCTION - Functional Design +**Timestamp**: 2025-01-15T10:07:00Z +**AI Response**: Created domain entities and business rules for both services. Catalog: 1 entity, 6 rules. Lending: 6 entities, 19 rules, 8 constants. Approved. +**Context**: Functional Design complete. 
+ +--- + +## CONSTRUCTION - NFR Requirements +**Timestamp**: 2025-01-15T10:08:00Z +**AI Response**: Decided in-memory repositories, JWT+bcrypt auth, structured JSON logging. Tech stack confirmed. Approved. +**Context**: NFR Requirements complete. + +--- + +## CONSTRUCTION - Code Generation +**Timestamp**: 2025-01-15T10:09:00Z +**AI Response**: Generated all code for both services. 27-step plan executed. Fixed passlib/bcrypt compatibility issue by switching to direct bcrypt library. Approved. +**Context**: Code Generation complete. + +--- + +## CONSTRUCTION - Build and Test +**Timestamp**: 2025-01-15T10:15:00Z +**Build Status**: Success (both services) +**Test Status**: 101/101 tests pass +**Files Generated**: build-and-test-summary.md +**Coverage**: Catalog Service 93%, Lending Service 87% +**Issues Resolved**: Replaced passlib[bcrypt] with direct bcrypt library due to Python 3.14 incompatibility +**Context**: Build and Test complete. All phases of AIDLC workflow executed. + +--- diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/build-and-test/build-and-test-summary.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/build-and-test/build-and-test-summary.md new file mode 100644 index 00000000..1cfa8d87 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/build-and-test/build-and-test-summary.md @@ -0,0 +1,101 @@ +# Build and Test Summary + +## Build Status + +| Service | Build Status | Dependencies | Time | +|---------|-------------|-------------|------| +| Catalog Service | ✅ Success | 27 packages installed | ~10s | +| Lending Service | ✅ Success | 28 packages installed | ~7s | + +## Test Execution Summary + +### Catalog Service — Unit & Integration Tests +- **Total Tests**: 43 +- **Passed**: 43 +- **Failed**: 0 +- **Coverage**: 93% +- **Status**: ✅ PASS (exceeds 90% target) + +### Lending Service — Unit & Integration 
Tests +- **Total Tests**: 58 +- **Passed**: 58 +- **Failed**: 0 +- **Coverage**: 87% +- **Status**: ✅ PASS (CatalogClient HTTP code mocked in tests; business logic >90%) + +### Combined +- **Total Tests**: 101 +- **Passed**: 101 +- **Failed**: 0 +- **Overall Status**: ✅ ALL TESTS PASS + +## Test Coverage Detail + +### Catalog Service (93% overall) +| Module | Coverage | +|--------|---------| +| api/routes.py | 100% | +| services/book_service.py | 94% | +| repositories/in_memory.py | 97% | +| domain/entities.py | 100% | +| models/book.py | 100% | +| auth/middleware.py | 76% | +| core/exceptions.py | 100% | + +### Lending Service (87% overall) +| Module | Coverage | +|--------|---------| +| api/member_routes.py | 100% | +| api/checkout_routes.py | 100% | +| api/hold_routes.py | 97% | +| api/report_routes.py | 100% | +| services/member_service.py | 96% | +| services/fee_service.py | 96% | +| services/checkout_service.py | 81% | +| services/hold_service.py | 81% | +| services/auth_service.py | 80% | +| services/catalog_client.py | 26% (mocked in tests — real HTTP client) | +| repositories/in_memory.py | 97% | +| domain/entities.py | 100% | + +### Coverage Note +The `catalog_client.py` module (26%) is intentionally low because it makes real HTTP calls to the Catalog Service. All tests use `MockCatalogClient` which simulates the same interface. This is the correct testing strategy for inter-service communication. Excluding this module, business logic coverage exceeds 90%. 
+ +## Issues Encountered and Resolved + +| Issue | Resolution | +|-------|-----------| +| `passlib[bcrypt]` incompatible with Python 3.14 + bcrypt 5.x | Replaced with direct `bcrypt` library usage | + +## Build Commands + +```bash +# Catalog Service +cd workspace/catalog-service +uv sync --all-extras +uv run pytest tests/ -v --cov=catalog_service --cov-report=term-missing + +# Lending Service +cd workspace/lending-service +uv sync --all-extras +uv run pytest tests/ -v --cov=lending_service --cov-report=term-missing +``` + +## Running the Services + +```bash +# Start Catalog Service (port 8000) +cd workspace/catalog-service +uv run uvicorn catalog_service.main:app --host 0.0.0.0 --port 8000 + +# Start Lending Service (port 8001) +cd workspace/lending-service +uv run uvicorn lending_service.main:app --host 0.0.0.0 --port 8001 +``` + +## Overall Status +- **Build**: ✅ Success (both services) +- **All Tests**: ✅ Pass (101/101) +- **Coverage**: ✅ Meets targets (93% Catalog, 87% Lending) +- **Business Rules Verified**: ✅ Checkout limits, hold limits, fees, renewals, RBAC +- **Ready for Operations**: Yes (deployment documentation needed) diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/catalog-service/functional-design/business-rules.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/catalog-service/functional-design/business-rules.md new file mode 100644 index 00000000..d5cabf22 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/catalog-service/functional-design/business-rules.md @@ -0,0 +1,40 @@ +# Functional Design — Catalog Service — Business Rules + +## BR-CAT-001: Book Creation +- Title, author, category are required +- ISBN is optional +- total_copies must be >= 1 +- available_copies is set to total_copies on creation +- String fields must respect max length constraints + +## BR-CAT-002: Book Update +- Only provided 
fields are updated (partial update) +- If total_copies is changed, new value must be >= (total_copies - available_copies) + - i.e., cannot reduce below currently checked-out count +- If total_copies increases, available_copies increases by the same delta +- If total_copies decreases, available_copies decreases by the same delta (but never below 0) + +## BR-CAT-003: Book Deletion +- Cannot delete a book with active checkouts (available_copies < total_copies) +- Returns 409 CONFLICT if deletion is blocked + +## BR-CAT-004: Book Search +- Search query matches substring in title OR author (case-insensitive) +- Category filter is exact match (case-insensitive) +- Available filter: if true, only return books with available_copies > 0 +- All filters are optional and combinable +- Empty results return empty list (not error) + +## BR-CAT-005: Availability Update +- Delta of -1: decrement available_copies (checkout) + - Fails with CONFLICT if available_copies == 0 +- Delta of +1: increment available_copies (return) + - Fails with CONFLICT if available_copies == total_copies +- Only delta values of -1 and +1 are allowed + +## BR-CAT-006: RBAC Rules +- **Public**: Health check only +- **Member**: Read operations (get, list, search) +- **Librarian**: All CRUD operations +- **Admin**: All operations +- **Internal**: Availability endpoints (service-to-service, validated by shared secret or JWT) diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/catalog-service/functional-design/domain-entities.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/catalog-service/functional-design/domain-entities.md new file mode 100644 index 00000000..b7dacf40 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/catalog-service/functional-design/domain-entities.md @@ -0,0 +1,30 @@ +# Functional Design — Catalog Service — Domain Entities + +## Entity: Book 
+ +| Field | Type | Constraints | Description | +|-------|------|-------------|-------------| +| id | str (UUID) | Auto-generated | Unique identifier | +| title | str | Required, max 255 chars | Book title | +| author | str | Required, max 255 chars | Author name | +| isbn | str | Optional, max 13 chars | ISBN-10 or ISBN-13 | +| category | str | Required, max 100 chars | Book category | +| total_copies | int | Required, >= 1 | Total physical copies owned | +| available_copies | int | Auto-managed, >= 0, <= total_copies | Currently available copies | +| created_at | datetime (UTC) | Auto-generated | Creation timestamp | +| updated_at | datetime (UTC) | Auto-updated | Last update timestamp | + +### Invariants +- `available_copies` must always be `>= 0` +- `available_copies` must always be `<= total_copies` +- `total_copies` must always be `>= 1` +- When `total_copies` is updated, it must be `>= (total_copies - available_copies)` (cannot go below checked-out count) + +## Entity: AvailabilityInfo (Value Object) + +| Field | Type | Description | +|-------|------|-------------| +| book_id | str | Book identifier | +| title | str | Book title | +| total_copies | int | Total copies | +| available_copies | int | Available copies | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/lending-service/functional-design/business-rules.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/lending-service/functional-design/business-rules.md new file mode 100644 index 00000000..aaf78ef6 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/lending-service/functional-design/business-rules.md @@ -0,0 +1,150 @@ +# Functional Design — Lending Service — Business Rules + +## Authentication Rules + +### BR-AUTH-001: Registration +- Name required, max 100 chars +- Email required, valid format, unique across all members +- Password required, minimum 8 
characters +- Password stored as bcrypt hash (never plaintext) +- Auto-assigned role: "member" +- Account created as active (is_active = true) + +### BR-AUTH-002: Login +- Validate email exists and account is active +- Verify password against bcrypt hash +- Return JWT with: member_id, email, role, exp (24 hours from now) +- Invalid credentials return 401 (do not reveal whether email or password is wrong) + +### BR-AUTH-003: JWT Validation +- Verify signature, expiration, required claims (member_id, email, role) +- Reject expired tokens with 401 +- Extract member_id, email, role from valid token + +## Member Rules + +### BR-MEM-001: Profile Update +- Members can update own name and email only +- New email must be unique +- Cannot change role or active status via profile update + +### BR-MEM-002: Account Deactivation +- Only Admin can deactivate accounts +- Deactivated member: is_active = false +- Existing checkouts remain active (tracked until returned) +- Existing holds remain active +- Blocked actions: new checkout, new hold, new renewal +- Cannot deactivate self + +## Checkout Rules + +### BR-CHK-001: Checkout Validation +1. Book must exist (verify via Catalog Service) +2. Book must have available_copies > 0 +3. Member must be active (is_active = true) +4. Member active checkout count < MAX_CHECKOUTS (5) +5. Member outstanding fee balance <= FEE_THRESHOLD ($10.00) +6. All validations must pass before creating checkout + +### BR-CHK-002: Checkout Creation +- Set checkout_date = now (UTC) +- Set due_date = checkout_date + 14 days +- Set status = "active" +- Set renewal_count = 0 +- Decrement available_copies in Catalog Service +- If Catalog Service call fails, abort checkout + +### BR-CHK-003: Return Processing +1. Checkout must exist and be active +2. Members can return own checkouts; Librarian/Admin can return any +3. Set return_date = now (UTC), status = "returned" +4. 
Calculate late fee if overdue: + - days_overdue = (return_date.date() - due_date.date()).days (only if positive) + - fee = days_overdue * $0.25 + - fee capped at $10.00 per checkout + - Create fee record if fee > 0 +5. Increment available_copies in Catalog Service +6. Check hold queue: if waiting holds exist for this book, fulfill next + +### BR-CHK-004: Renewal +1. Checkout must exist and be active +2. Checkout must belong to requesting member +3. Member must be active +4. renewal_count < MAX_RENEWALS (2) +5. No active holds (status = "waiting") exist for the book +6. Extend due_date by 14 days from current due_date +7. Increment renewal_count + +## Hold Rules + +### BR-HLD-001: Hold Placement +1. Book must exist (verify via Catalog Service) +2. Book available_copies must be 0 (holds only on unavailable books) +3. Member must be active +4. Member active hold count < MAX_HOLDS (3) — count holds with status "waiting" or "ready" +5. Member must not already have an active hold on this book +6. Create hold with status = "waiting" +7. Queue position = max position for this book + 1 (FIFO) + +### BR-HLD-002: Hold Cancellation +1. Hold must exist +2. Members can cancel own holds; Librarian/Admin can cancel any +3. Set status = "cancelled" +4. Reorder queue: decrement position of all holds with higher position for the same book + +### BR-HLD-003: Hold Fulfillment (on return) +1. When a book is returned, check for waiting holds on that book +2. Find the hold with the lowest queue_position and status = "waiting" +3. Update that hold's status to "ready" +4. Do NOT decrement available_copies (the "ready" hold reserves the copy conceptually) + - Actually: the available_copies was already incremented on return. The "ready" hold member + will check out normally, which decrements again. So available_copies accurately reflects + the physical availability. 
+ +## Fee Rules + +### BR-FEE-001: Late Fee Calculation +- Rate: $0.25 per day overdue +- Cap: $10.00 per checkout +- Only charged on return, not accruing in real-time +- Calculated in UTC dates (whole days only) + +### BR-FEE-002: Fee Payment +- Librarian/Admin processes payments +- Partial payments allowed +- Payment applied to oldest outstanding fees first +- Payment amount must be > 0 +- Payment cannot exceed total outstanding balance + +### BR-FEE-003: Outstanding Balance +- Sum of all fees with status = "outstanding" minus payments applied +- Used for checkout threshold check (BR-CHK-001, rule 5) + +## Reporting Rules + +### BR-RPT-001: Overdue Report +- All checkouts where status = "active" AND due_date < now +- Include: member name, email, book title, checkout_date, due_date, days_overdue +- Book title fetched via Catalog Service or stored denormalized in checkout +- Accessible by Librarian and Admin only + +### BR-RPT-002: Collection Summary +- total_books: count from Catalog Service +- total_members: count of all members +- books_checked_out: count of active checkouts +- books_available: total_books - books_checked_out +- total_outstanding_fees: sum of outstanding fee amounts +- Accessible by Admin only + +## Configuration Constants + +| Constant | Value | Description | +|----------|-------|-------------| +| MAX_CHECKOUTS | 5 | Max active checkouts per member | +| MAX_RENEWALS | 2 | Max renewals per checkout | +| MAX_HOLDS | 3 | Max active holds per member | +| LOAN_PERIOD_DAYS | 14 | Days per checkout period | +| LATE_FEE_PER_DAY | 0.25 | Daily late fee in dollars | +| LATE_FEE_CAP | 10.00 | Max late fee per checkout | +| FEE_THRESHOLD | 10.00 | Outstanding fee limit for checkout | +| JWT_EXPIRY_HOURS | 24 | JWT token expiry | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/lending-service/functional-design/domain-entities.md 
b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/lending-service/functional-design/domain-entities.md new file mode 100644 index 00000000..81872c5d --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/lending-service/functional-design/domain-entities.md @@ -0,0 +1,74 @@ +# Functional Design — Lending Service — Domain Entities + +## Entity: Member + +| Field | Type | Constraints | Description | +|-------|------|-------------|-------------| +| id | str (UUID) | Auto-generated | Unique identifier | +| name | str | Required, max 100 chars | Member full name | +| email | str | Required, unique, max 255 chars, valid email format | Login email | +| password_hash | str | System-managed | Bcrypt hash of password | +| role | str (enum) | "admin", "librarian", "member" | RBAC role | +| is_active | bool | Default: true | Account active status | +| created_at | datetime (UTC) | Auto-generated | Registration timestamp | + +## Entity: Checkout + +| Field | Type | Constraints | Description | +|-------|------|-------------|-------------| +| id | str (UUID) | Auto-generated | Unique identifier | +| member_id | str | Required, FK to Member | Borrowing member | +| book_id | str | Required | Book from Catalog Service | +| checkout_date | datetime (UTC) | Auto-generated | When checked out | +| due_date | datetime (UTC) | checkout_date + 14 days | When due | +| return_date | datetime (UTC) | Null until returned | When returned | +| status | str (enum) | "active", "returned" | Checkout status | +| renewal_count | int | Default: 0, max 2 | Times renewed | + +## Entity: Hold + +| Field | Type | Constraints | Description | +|-------|------|-------------|-------------| +| id | str (UUID) | Auto-generated | Unique identifier | +| member_id | str | Required, FK to Member | Requesting member | +| book_id | str | Required | Book from Catalog Service | +| hold_date | datetime (UTC) | Auto-generated | 
When hold placed |
+| status | str (enum) | "waiting", "ready", "cancelled", "fulfilled" | Hold status |
+| queue_position | int | >= 1 | Position in FIFO queue |
+
+## Entity: Fee
+
+| Field | Type | Constraints | Description |
+|-------|------|-------------|-------------|
+| id | str (UUID) | Auto-generated | Unique identifier |
+| member_id | str | Required, FK to Member | Member with fee |
+| checkout_id | str | Required, FK to Checkout | Related checkout |
+| amount | Decimal | > 0, max $10.00 | Fee amount |
+| status | str (enum) | "outstanding", "paid" | Fee status |
+| created_at | datetime (UTC) | Auto-generated | When fee created |
+
+## Entity: Payment
+
+| Field | Type | Constraints | Description |
+|-------|------|-------------|-------------|
+| id | str (UUID) | Auto-generated | Unique identifier |
+| member_id | str | Required, FK to Member | Paying member |
+| amount | Decimal | > 0 | Payment amount |
+| payment_date | datetime (UTC) | Auto-generated | When payment processed |
+
+## Entity: TokenPayload (Value Object)
+
+| Field | Type | Description |
+|-------|------|-------------|
+| member_id | str | Member UUID |
+| email | str | Member email |
+| role | str | Member role |
+| exp | datetime | Token expiration |
+
+## Entity: ReturnResult (Value Object)
+
+| Field | Type | Description |
+|-------|------|-------------|
+| checkout | Checkout | Updated checkout record |
+| fee | Fee \| None | Late fee if applicable |
+| hold_fulfilled | Hold \| None | Hold fulfilled if applicable |
diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/plans/code-generation-plan.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/plans/code-generation-plan.md new file mode 100644 index 00000000..8d412734 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/plans/code-generation-plan.md @@ -0,0 +1,43 @@ +# Code Generation Plan
— Catalog Service + Lending Service + +## Unit Context +- **Workspace Root**: workspace/ +- **Project Type**: Greenfield multi-unit (microservices) +- **Structure**: workspace/catalog-service/ and workspace/lending-service/ + +--- + +## Catalog Service Steps + +- [ ] Step 1: Create Catalog Service project structure (pyproject.toml, src layout, tests dir) +- [ ] Step 2: Create domain entities (Book, AvailabilityInfo) +- [ ] Step 3: Create Pydantic request/response models +- [ ] Step 4: Create core modules (exceptions, response helpers, logging, config) +- [ ] Step 5: Create abstract repository and in-memory implementation +- [ ] Step 6: Create BookService (business logic) +- [ ] Step 7: Create auth middleware (JWT validation) +- [ ] Step 8: Create API routes and FastAPI app +- [ ] Step 9: Create Catalog Service unit tests +- [ ] Step 10: Create Catalog Service integration tests + +## Lending Service Steps + +- [ ] Step 11: Create Lending Service project structure (pyproject.toml, src layout, tests dir) +- [ ] Step 12: Create domain entities (Member, Checkout, Hold, Fee, Payment) +- [ ] Step 13: Create Pydantic request/response models +- [ ] Step 14: Create core modules (exceptions, response helpers, logging, config) +- [ ] Step 15: Create abstract repositories and in-memory implementations +- [ ] Step 16: Create AuthService (JWT + bcrypt) +- [ ] Step 17: Create MemberService +- [ ] Step 18: Create CheckoutService +- [ ] Step 19: Create HoldService +- [ ] Step 20: Create FeeService +- [ ] Step 21: Create ReportService +- [ ] Step 22: Create CatalogClient (HTTP client) +- [ ] Step 23: Create auth middleware (JWT validation) +- [ ] Step 24: Create API routes (member, checkout, hold, fee, report) +- [ ] Step 25: Create FastAPI app entry point +- [ ] Step 26: Create Lending Service unit tests +- [ ] Step 27: Create Lending Service integration tests + +**Total**: 27 steps covering both services diff --git 
a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/shared/nfr-requirements/nfr-requirements.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/shared/nfr-requirements/nfr-requirements.md new file mode 100644 index 00000000..f0337f9a --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/shared/nfr-requirements/nfr-requirements.md @@ -0,0 +1,80 @@ +# NFR Requirements — Both Services + +## Performance Requirements + +| Requirement | Target | Applies To | +|---|---|---| +| API response time (p95) | < 100ms | Both services | +| Full-text search latency | < 200ms | Catalog Service | +| Concurrent users | 100 simultaneous | Both services | +| Cold start tolerance | < 3 seconds | Both services | + +## Availability Requirements + +| Requirement | Target | +|---|---| +| API uptime | 99.9% | + +## Security Requirements + +| Requirement | Detail | +|---|---| +| Authentication | Application-level JWT (PyJWT + bcrypt) | +| Authorization | RBAC with Admin, Librarian, Member roles | +| Password storage | bcrypt adaptive hashing | +| Token expiry | 24 hours, no refresh tokens (MVP) | +| Encryption at rest | Required for all data stores | +| Encryption in transit | TLS 1.2+ | +| PII protection | Member email/name must not appear in logs | +| Input validation | Pydantic models with length constraints on all endpoints | +| CORS | Restricted origins (configurable, no wildcard on authenticated endpoints) | + +## Testing Requirements + +| Test Type | Target | +|---|---| +| Unit test coverage | >= 90% line coverage per service | +| Integration tests | All API endpoints tested | +| Auth testing | Both authorized and unauthorized access per endpoint | + +## Scalability + +- Each service scales independently +- Catalog: optimized for read-heavy (search/browse) +- Lending: optimized for write-heavy (checkouts/returns) +- Single-library deployment for MVP +- 
Target: 10,000 books, 2,000 members + +## Database Decision + +**Decision: In-memory repositories with abstract base class for MVP development and testing.** + +**Rationale:** +- Tech-env deferred database choice (DynamoDB vs PostgreSQL) to NFR stages +- CDK infrastructure is deferred — no cloud database will be provisioned +- For local development and testing, in-memory repositories provide zero-dependency operation +- Abstract repository pattern allows swapping to DynamoDB or PostgreSQL via concrete implementation later +- In-memory approach is consistent with the "application code only" MVP scope +- All business logic and API behavior is fully testable without external dependencies + +**Repository Pattern:** +- `BaseRepository` (abstract) defines the interface (create, get, list, update, delete, search) +- `InMemoryRepository` implements using Python dicts — used for development and testing +- Future: `DynamoDBRepository` or `PostgresRepository` can be added as drop-in replacements + +## Authentication Decision + +**Decision: Application-level JWT with PyJWT + passlib[bcrypt]** + +**Rationale:** +- Simpler than Cognito for MVP +- No external dependency (no AWS account needed to run tests) +- Both services validate JWTs independently using a shared secret +- JWT secret stored in configuration (environment variable for local dev, Secrets Manager for prod) + +## Structured Logging + +- Python `logging` module with JSON formatter +- Fields: timestamp, correlation_id (request ID), level, message, service_name +- PII filtering: strip member email and name from log output +- Log to stdout (CloudWatch compatible when deployed) diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/shared/nfr-requirements/tech-stack-decisions.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/shared/nfr-requirements/tech-stack-decisions.md new file mode 100644 index 00000000..17f6a76e --- /dev/null 
+++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/construction/shared/nfr-requirements/tech-stack-decisions.md @@ -0,0 +1,57 @@ +# Tech Stack Decisions — Both Services + +## Core Stack + +| Technology | Version | Purpose | +|---|---|---| +| Python | 3.13+ | Runtime | +| FastAPI | 0.115+ | REST API framework | +| Pydantic | 2.x | Request/response validation | +| uvicorn | 0.34+ | ASGI server | +| PyJWT | 2.x | JWT generation and validation | +| passlib[bcrypt] | 1.7+ | Password hashing | +| httpx | 0.28+ | HTTP client (Lending → Catalog, test client) | + +## Testing Stack + +| Technology | Version | Purpose | +|---|---|---| +| pytest | 8.x | Test runner | +| pytest-asyncio | 0.24+ | Async test support | +| pytest-cov | 6.x | Coverage reporting | +| httpx | 0.28+ | AsyncClient for integration tests | + +## Development Tools + +| Technology | Version | Purpose | +|---|---|---| +| uv | latest | Package management | +| ruff | 0.9+ | Linting + formatting | + +## Database Strategy + +| Layer | Technology | Purpose | +|---|---|---| +| Repository Interface | Abstract base class | Define data access contract | +| MVP Implementation | In-memory (Python dict) | Zero-dependency development/testing | +| Future Production | DynamoDB or PostgreSQL | Cloud deployment (Phase 2) | + +## Prohibited Technologies (per tech-env.md) + +| Prohibited | Use Instead | +|---|---| +| Flask, Django | FastAPI | +| requests | httpx | +| pandas, numpy | Standard Python | +| pip, poetry, pipenv | uv | +| black, flake8, isort | ruff | +| EC2 (direct) | Lambda or Fargate (future) | +| Elastic Beanstalk | CDK (future) | + +## Project Structure + +Each service is an independent Python package: +- `catalog-service/` — pyproject.toml, src/catalog_service/, tests/ +- `lending-service/` — pyproject.toml, src/lending_service/, tests/ + +Both use `uv` as the package manager with `pyproject.toml` for dependency specification. 
diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/component-dependency.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/component-dependency.md new file mode 100644 index 00000000..a1948e78 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/component-dependency.md @@ -0,0 +1,71 @@ +# Component Dependencies + +## Dependency Matrix + +| Component | Depends On | Communication | +|-----------|-----------|---------------| +| CatalogAPI (Routes) | BookService, AuthMiddleware | Direct function calls | +| BookService | BookRepository | Direct function calls | +| BookRepository | Database (abstract) | Database driver | +| AuthMiddleware (Catalog) | JWT library (PyJWT) | Library call | +| LendingAPI (Routes) | All Lending services, AuthMiddleware | Direct function calls | +| MemberService | MemberRepository, AuthService | Direct function calls | +| AuthService | MemberRepository, JWT/bcrypt libs | Library call | +| CheckoutService | CheckoutRepository, HoldService, FeeService, CatalogClient | Direct calls + HTTP | +| HoldService | HoldRepository | Direct function calls | +| FeeService | FeeRepository | Direct function calls | +| ReportService | CheckoutRepository, MemberRepository, BookService (via CatalogClient) | Direct calls + HTTP | +| CatalogClient | Catalog Service API | HTTP (httpx) | + +## Inter-Service Communication + +``` ++------------------+ HTTP (httpx) +------------------+ +| | -----------------------------> | | +| Lending Service | GET /books/{id}/availability | Catalog Service | +| | POST /books/{id}/availability| | +| (Port 8001) | -----------------------------> | (Port 8000) | +| | | | ++------------------+ +------------------+ +``` + +### Communication Patterns + +1. 
**Checkout Flow**: LendingAPI → CheckoutService → CatalogClient → Catalog Service (verify + decrement) +2. **Return Flow**: LendingAPI → CheckoutService → CatalogClient (increment) → HoldService (fulfill) → FeeService (create fee if late) +3. **Hold Placement**: LendingAPI → HoldService → CatalogClient → Catalog Service (verify availability = 0) +4. **Search**: CatalogAPI → BookService → BookRepository (direct, no cross-service) + +## Internal Layer Dependencies (per service) + +``` ++-------------------+ +| API Routes | (FastAPI route handlers) ++-------------------+ + | + v ++-------------------+ +| Services | (Business logic) ++-------------------+ + | + v ++-------------------+ +| Repositories | (Data access - abstract) ++-------------------+ + | + v ++-------------------+ +| Database | (Concrete implementation) ++-------------------+ +``` + +## Data Ownership + +| Data Entity | Owning Service | Storage | +|-------------|---------------|---------| +| Book | Catalog Service | Catalog DB | +| Member | Lending Service | Lending DB | +| Checkout | Lending Service | Lending DB | +| Hold | Lending Service | Lending DB | +| Fee | Lending Service | Lending DB | +| Payment | Lending Service | Lending DB | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/component-methods.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/component-methods.md new file mode 100644 index 00000000..b78fec05 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/component-methods.md @@ -0,0 +1,79 @@ +# Component Methods + +## Catalog Service + +### BookService Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `create_book` | `BookCreateRequest(title, author, isbn, category, total_copies)` | `Book` | Create book, set available_copies = total_copies | +| 
`get_book` | `book_id: str` | `Book` | Get book by ID or raise NOT_FOUND | +| `list_books` | None | `List[Book]` | Return all books | +| `update_book` | `book_id: str, BookUpdateRequest` | `Book` | Update fields, validate total_copies >= checked_out | +| `delete_book` | `book_id: str` | `None` | Delete if no active checkouts | +| `search_books` | `query: str, category: str, available: bool` | `List[Book]` | Substring match on title/author, optional filters | +| `check_availability` | `book_id: str` | `AvailabilityInfo(book_id, title, total_copies, available_copies)` | Internal API for Lending Service | +| `update_availability` | `book_id: str, delta: int` | `Book` | Adjust available_copies by delta (+1 or -1) | + +### BookRepository Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `create` | `Book` | `Book` | Persist new book | +| `get_by_id` | `book_id: str` | `Book \| None` | Retrieve by primary key | +| `list_all` | None | `List[Book]` | List all records | +| `update` | `Book` | `Book` | Update existing record | +| `delete` | `book_id: str` | `None` | Remove record | +| `search` | `query: str, category: str, available: bool` | `List[Book]` | Search with filters | + +--- + +## Lending Service + +### AuthService Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `hash_password` | `password: str` | `str` | Bcrypt hash | +| `verify_password` | `plain: str, hashed: str` | `bool` | Verify password | +| `create_token` | `member_id: str, email: str, role: str` | `str` | JWT with 24h expiry | +| `decode_token` | `token: str` | `TokenPayload(member_id, email, role)` | Validate and decode | + +### MemberService / MemberRepository Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `register` | `MemberRegisterRequest(name, email, password)` | `Member` | Create with bcrypt hash | +| `login` | `email: str, password: str` | `str (JWT)` | Authenticate and 
return token | +| `get_profile` | `member_id: str` | `Member` | Self-service profile | +| `update_profile` | `member_id: str, data` | `Member` | Update name/email | +| `deactivate` | `member_id: str` | `Member` | Set is_active = False | + +### CheckoutService Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `checkout` | `member_id: str, book_id: str` | `Checkout` | Full validation + create | +| `return_book` | `checkout_id: str, member_id: str, role: str` | `ReturnResult` | Return + fees + holds | +| `renew` | `checkout_id: str, member_id: str` | `Checkout` | Extend due date | +| `list_checkouts` | `member_id: str, status: str` | `List[Checkout]` | Filter by status | + +### HoldService Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `place_hold` | `member_id: str, book_id: str` | `Hold` | Validate + create FIFO | +| `cancel_hold` | `hold_id: str, member_id: str, role: str` | `None` | Cancel + reorder | +| `get_holds_for_book` | `book_id: str` | `List[Hold]` | Queue for a book | +| `get_member_holds` | `member_id: str` | `List[Hold]` | Member's holds | +| `fulfill_next_hold` | `book_id: str` | `Hold \| None` | First waiting → ready | + +### FeeService Methods + +| Method | Input | Output | Purpose | +|--------|-------|--------|---------| +| `calculate_late_fee` | `due_date, return_date` | `Decimal` | $0.25/day, cap $10 | +| `create_fee` | `member_id, checkout_id, amount` | `Fee` | Record fee | +| `get_member_fees` | `member_id: str` | `List[Fee]` | Fee list | +| `get_outstanding_balance` | `member_id: str` | `Decimal` | Total outstanding | +| `process_payment` | `member_id, amount` | `Payment` | Apply payment | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/components.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/components.md new file mode 100644 index 
00000000..c69940e6 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/components.md @@ -0,0 +1,124 @@ +# Application Components + +## Service 1: Catalog Service + +### Component: BookRepository +- **Purpose**: Data access layer for book persistence +- **Responsibilities**: CRUD operations on book records, search queries, availability updates +- **Interface**: Abstract repository pattern (allows swapping database implementations) + +### Component: BookService +- **Purpose**: Business logic for catalog operations +- **Responsibilities**: Book validation, search orchestration, availability management, business rule enforcement +- **Interface**: Called by API route handlers + +### Component: CatalogAPI (Routes) +- **Purpose**: REST API endpoint handlers for the Catalog Service +- **Responsibilities**: Request parsing, response formatting, RBAC enforcement via middleware, route definitions +- **Endpoints**: + - `POST /api/v1/books` — Add book (Librarian, Admin) + - `GET /api/v1/books` — List books (authenticated) + - `GET /api/v1/books/{book_id}` — Get book (authenticated) + - `PUT /api/v1/books/{book_id}` — Update book (Librarian, Admin) + - `DELETE /api/v1/books/{book_id}` — Delete book (Librarian, Admin) + - `GET /api/v1/books/search` — Search books (authenticated) + - `GET /api/v1/catalog/health` — Health check (public) + - Internal: `GET /api/v1/books/{book_id}/availability` — Availability check (service-to-service) + - Internal: `POST /api/v1/books/{book_id}/availability` — Update availability (service-to-service) + +### Component: AuthMiddleware (Catalog) +- **Purpose**: JWT validation and role extraction for Catalog Service +- **Responsibilities**: Token verification, role-based route protection, request context enrichment + +--- + +## Service 2: Lending Service + +### Component: MemberRepository +- **Purpose**: Data access for member records +- **Responsibilities**: CRUD operations on 
member records, email uniqueness enforcement + +### Component: MemberService +- **Purpose**: Business logic for member management +- **Responsibilities**: Registration, profile management, password hashing, account deactivation + +### Component: AuthService +- **Purpose**: Authentication and JWT token management +- **Responsibilities**: Login validation, JWT generation, JWT verification, password hashing with bcrypt + +### Component: CheckoutRepository +- **Purpose**: Data access for checkout records +- **Responsibilities**: CRUD on checkout records, queries for active/overdue checkouts + +### Component: CheckoutService +- **Purpose**: Business logic for checkout, return, and renewal operations +- **Responsibilities**: Checkout validation (limits, fees, availability), return processing (fee calculation, hold fulfillment trigger), renewal validation (limits, hold checks), inter-service calls to Catalog Service + +### Component: HoldRepository +- **Purpose**: Data access for hold records +- **Responsibilities**: CRUD on hold records, FIFO queue management, queue position queries + +### Component: HoldService +- **Purpose**: Business logic for hold management +- **Responsibilities**: Hold placement validation (limits, availability, duplicates), cancellation with queue reorder, hold fulfillment on return + +### Component: FeeRepository +- **Purpose**: Data access for fee and payment records +- **Responsibilities**: CRUD on fee records, payment tracking, outstanding balance queries + +### Component: FeeService +- **Purpose**: Business logic for fee management +- **Responsibilities**: Fee creation on late return, payment processing, balance calculations + +### Component: ReportService +- **Purpose**: Business logic for reporting +- **Responsibilities**: Overdue checkout aggregation, collection summary computation + +### Component: CatalogClient +- **Purpose**: HTTP client for communicating with the Catalog Service +- **Responsibilities**: Book existence verification, 
availability checks, availability updates (increment/decrement) + +### Component: LendingAPI (Routes) +- **Purpose**: REST API endpoint handlers for the Lending Service +- **Responsibilities**: Request parsing, response formatting, RBAC enforcement, route definitions +- **Endpoints**: + - `POST /api/v1/members/register` — Register (public) + - `POST /api/v1/members/login` — Login (public) + - `GET /api/v1/members/me` — My profile (authenticated) + - `PUT /api/v1/members/me` — Update profile (authenticated) + - `GET /api/v1/members/{member_id}` — Get member (Admin, Librarian) + - `PUT /api/v1/members/{member_id}/deactivate` — Deactivate (Admin) + - `POST /api/v1/checkouts` — Checkout (authenticated) + - `POST /api/v1/checkouts/{checkout_id}/return` — Return (authenticated) + - `POST /api/v1/checkouts/{checkout_id}/renew` — Renew (authenticated) + - `GET /api/v1/checkouts` — List checkouts (authenticated) + - `POST /api/v1/holds` — Place hold (authenticated) + - `DELETE /api/v1/holds/{hold_id}` — Cancel hold (authenticated) + - `GET /api/v1/holds` — List holds (authenticated) + - `GET /api/v1/holds/me` — My holds (authenticated) + - `GET /api/v1/fees/me` — My fees (authenticated) + - `GET /api/v1/fees` — List fees (Admin, Librarian) + - `POST /api/v1/fees/payments` — Process payment (Admin, Librarian) + - `GET /api/v1/reports/overdue` — Overdue report (Admin, Librarian) + - `GET /api/v1/reports/summary` — Collection summary (Admin) + - `GET /api/v1/lending/health` — Health check (public) + +### Component: AuthMiddleware (Lending) +- **Purpose**: JWT validation and role extraction for Lending Service +- **Responsibilities**: Token verification, role-based route protection, request context enrichment + +--- + +## Shared Concerns + +### Cross-Cutting: Structured Logging +- **Purpose**: Centralized logging configuration for both services +- **Responsibilities**: Request/response logging with correlation IDs, PII filtering, structured JSON output + +### Cross-Cutting: 
Error Handling +- **Purpose**: Global exception handling middleware +- **Responsibilities**: Catch unhandled exceptions, return standardized error responses, log errors without exposing internals + +### Cross-Cutting: Response Envelope +- **Purpose**: Consistent API response formatting +- **Responsibilities**: Wrap all responses in `{"status": "ok/error", "data/error": ...}` format diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/services.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/services.md new file mode 100644 index 00000000..39bf0158 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/services.md @@ -0,0 +1,76 @@ +# Services Design + +## Catalog Service — Service Layer + +### BookService +- **Responsibility**: Orchestrates book CRUD operations, search, and availability management +- **Methods**: + - `create_book(data) -> Book` — validate and create a book + - `get_book(book_id) -> Book` — retrieve by ID + - `list_books() -> List[Book]` — list all books + - `update_book(book_id, data) -> Book` — update book metadata + - `delete_book(book_id) -> None` — delete (fails if active checkouts) + - `search_books(query, category, available) -> List[Book]` — full-text search with filters + - `check_availability(book_id) -> AvailabilityInfo` — return availability for inter-service call + - `update_availability(book_id, delta) -> Book` — increment/decrement available_copies + +--- + +## Lending Service — Service Layer + +### AuthService +- **Responsibility**: Authentication and token management +- **Methods**: + - `hash_password(password) -> str` — bcrypt hash + - `verify_password(plain, hashed) -> bool` — verify bcrypt + - `create_token(member) -> str` — generate JWT with member_id, email, role, 24h expiry + - `decode_token(token) -> TokenPayload` — validate and decode 
JWT + +### MemberService +- **Responsibility**: Member lifecycle management +- **Methods**: + - `register(data) -> Member` — create member with hashed password, auto-assign member role + - `login(email, password) -> str` — verify credentials, return JWT + - `get_profile(member_id) -> Member` — retrieve member profile + - `update_profile(member_id, data) -> Member` — update name/email + - `get_member(member_id) -> Member` — admin/librarian view + - `deactivate(member_id) -> Member` — set active=False + +### CheckoutService +- **Responsibility**: Checkout, return, and renewal orchestration +- **Methods**: + - `checkout(member_id, book_id) -> Checkout` — validate limits/fees/availability, create checkout, decrement availability + - `return_book(checkout_id, member_id, role) -> ReturnResult` — process return, calculate fees, increment availability, fulfill holds + - `renew(checkout_id, member_id) -> Checkout` — validate renewal limits/holds, extend due date + - `list_checkouts(member_id, status) -> List[Checkout]` — list checkouts with optional status filter + +### HoldService +- **Responsibility**: Hold queue management +- **Methods**: + - `place_hold(member_id, book_id) -> Hold` — validate limits/availability/duplicates, create hold with FIFO position + - `cancel_hold(hold_id, member_id, role) -> None` — cancel and reorder queue + - `get_holds_for_book(book_id) -> List[Hold]` — hold queue for a book + - `get_member_holds(member_id) -> List[Hold]` — all holds for a member + - `fulfill_next_hold(book_id) -> Hold | None` — called on return, update first waiting hold to ready + +### FeeService +- **Responsibility**: Fee calculation and payment processing +- **Methods**: + - `calculate_late_fee(due_date, return_date) -> Decimal` — $0.25/day, capped at $10.00 + - `create_fee(member_id, checkout_id, amount) -> Fee` — create fee record + - `get_member_fees(member_id) -> List[Fee]` — list fees for a member + - `get_outstanding_balance(member_id) -> Decimal` — total 
outstanding + - `process_payment(member_id, amount) -> Payment` — record payment, reduce outstanding + +### ReportService +- **Responsibility**: Operational reporting +- **Methods**: + - `get_overdue_checkouts() -> List[OverdueItem]` — all overdue with member info and days overdue + - `get_collection_summary() -> CollectionSummary` — aggregate stats + +### CatalogClient +- **Responsibility**: HTTP client to Catalog Service +- **Methods**: + - `check_availability(book_id) -> AvailabilityInfo` — GET /api/v1/books/{book_id}/availability + - `decrement_availability(book_id) -> None` — POST /api/v1/books/{book_id}/availability with delta=-1 + - `increment_availability(book_id) -> None` — POST /api/v1/books/{book_id}/availability with delta=+1 diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work-dependency.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work-dependency.md new file mode 100644 index 00000000..d287dbe0 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work-dependency.md @@ -0,0 +1,32 @@ +# Unit of Work Dependencies + +## Dependency Matrix + +| Unit | Depends On | Communication | Priority | +|------|-----------|---------------|----------| +| Catalog Service | None | — | Build first | +| Lending Service | Catalog Service | HTTP (httpx) | Build second | + +## Dependency Details + +### Catalog Service → External Dependencies +- No service dependencies +- Database: own data store (to be determined in NFR) +- Libraries: FastAPI, Pydantic, PyJWT, uvicorn + +### Lending Service → Catalog Service +- **Purpose**: Book existence verification and availability updates +- **API Calls**: + - `GET /api/v1/books/{book_id}/availability` — verify book exists and check copies + - `POST /api/v1/books/{book_id}/availability` — increment/decrement 
available_copies +- **Failure Handling**: If Catalog Service is unavailable, Lending operations that require book verification should fail with INTERNAL_ERROR +- **Libraries**: FastAPI, Pydantic, PyJWT, passlib/bcrypt, httpx, uvicorn + +## Integration Points + +| Integration | Source | Target | Method | Frequency | +|-------------|--------|--------|--------|-----------| +| Checkout verification | Lending | Catalog | HTTP GET | Every checkout | +| Availability decrement | Lending | Catalog | HTTP POST | Every checkout | +| Availability increment | Lending | Catalog | HTTP POST | Every return | +| Hold verification | Lending | Catalog | HTTP GET | Every hold placement | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work-story-map.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work-story-map.md new file mode 100644 index 00000000..cf758360 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work-story-map.md @@ -0,0 +1,54 @@ +# Unit of Work — Story Map + +## Catalog Service (Unit 1) + +| Story ID | Story Name | Priority | +|----------|-----------|----------| +| US-CAT-001 | Add a Book | Must Have | +| US-CAT-002 | Update a Book | Must Have | +| US-CAT-003 | Get a Book | Must Have | +| US-CAT-004 | List All Books | Must Have | +| US-CAT-005 | Delete a Book | Must Have | +| US-CAT-006 | Search Books | Must Have | +| US-SYS-001 | Catalog Health Check | Must Have | + +**Total**: 7 stories + +--- + +## Lending Service (Unit 2) + +| Story ID | Story Name | Priority | +|----------|-----------|----------| +| US-AUTH-001 | Register as a Member | Must Have | +| US-AUTH-002 | Login | Must Have | +| US-AUTH-003 | View My Profile | Must Have | +| US-AUTH-004 | Update My Profile | Must Have | +| US-AUTH-005 | View Any Member Profile | Must Have | +| 
US-AUTH-006 | Deactivate Member Account | Must Have | +| US-LND-001 | Checkout a Book | Must Have | +| US-LND-002 | Return a Book | Must Have | +| US-LND-003 | Renew a Checkout | Must Have | +| US-LND-004 | View My Active Checkouts | Must Have | +| US-HLD-001 | Place a Hold | Must Have | +| US-HLD-002 | Cancel a Hold | Must Have | +| US-HLD-003 | View Hold Queue | Must Have | +| US-HLD-004 | View My Holds | Must Have | +| US-FEE-001 | View My Fees | Must Have | +| US-FEE-002 | View Any Member's Fees | Must Have | +| US-FEE-003 | Process Fee Payment | Must Have | +| US-RPT-001 | View Overdue Report | Must Have | +| US-RPT-002 | View Collection Summary | Must Have | +| US-SYS-002 | Lending Health Check | Must Have | + +**Total**: 20 stories + +--- + +## Summary + +| Unit | Stories | Build Order | +|------|---------|-------------| +| Catalog Service | 7 | 1st (no dependencies) | +| Lending Service | 20 | 2nd (depends on Catalog) | +| **Total** | **27** | | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work.md new file mode 100644 index 00000000..9403f0ca --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/application-design/unit-of-work.md @@ -0,0 +1,157 @@ +# Units of Work + +## Unit 1: Catalog Service + +### Definition +An independently deployable FastAPI service managing the book inventory for the community library. 
+ +### Responsibilities +- Book CRUD (create, read, update, delete) +- Book search (full-text by title/author, filter by category/availability) +- Book availability tracking (total_copies, available_copies) +- Internal availability API for Lending Service consumption +- Health check endpoint + +### Code Organization +``` +workspace/ + catalog-service/ + src/ + catalog_service/ + __init__.py + main.py # FastAPI app entry point + config.py # Configuration settings + models/ + __init__.py + book.py # Book Pydantic models (request/response) + domain/ + __init__.py + entities.py # Book domain entity + repositories/ + __init__.py + base.py # Abstract repository + in_memory.py # In-memory implementation for testing/dev + services/ + __init__.py + book_service.py # Book business logic + api/ + __init__.py + routes.py # Route handlers + dependencies.py # FastAPI dependencies (auth, repos) + auth/ + __init__.py + middleware.py # JWT validation middleware + dependencies.py # Auth dependencies + core/ + __init__.py + exceptions.py # Custom exceptions + responses.py # Response envelope helpers + logging.py # Structured logging setup + tests/ + __init__.py + conftest.py # Shared fixtures + test_book_service.py # Unit tests for BookService + test_routes.py # Integration tests for API routes + test_models.py # Model validation tests + pyproject.toml +``` + +### Deployment Profile +- Read-heavy workload (search/browse most frequent) +- Own data store (Catalog DB) +- Runs on port 8000 + +--- + +## Unit 2: Lending Service + +### Definition +An independently deployable FastAPI service managing member authentication, lending operations, holds, fees, and reporting. 
+ +### Responsibilities +- Member registration, authentication (JWT), profile management +- Checkout, return, renewal with policy enforcement +- Hold queue management (FIFO, placement, cancellation, fulfillment) +- Fee tracking and payment processing +- Overdue report and collection summary +- Inter-service communication with Catalog Service +- Health check endpoint + +### Code Organization +``` +workspace/ + lending-service/ + src/ + lending_service/ + __init__.py + main.py # FastAPI app entry point + config.py # Configuration settings + models/ + __init__.py + member.py # Member Pydantic models + checkout.py # Checkout Pydantic models + hold.py # Hold Pydantic models + fee.py # Fee/Payment Pydantic models + auth.py # Auth models (login, token) + report.py # Report response models + domain/ + __init__.py + entities.py # Domain entities (Member, Checkout, Hold, Fee, Payment) + repositories/ + __init__.py + base.py # Abstract repositories + in_memory.py # In-memory implementations + services/ + __init__.py + auth_service.py # JWT + bcrypt + member_service.py # Member business logic + checkout_service.py # Checkout/return/renewal logic + hold_service.py # Hold management + fee_service.py # Fee calculation + payment + report_service.py # Reports + catalog_client.py # HTTP client to Catalog Service + api/ + __init__.py + member_routes.py # Member endpoints + checkout_routes.py # Checkout endpoints + hold_routes.py # Hold endpoints + fee_routes.py # Fee endpoints + report_routes.py # Report endpoints + dependencies.py # FastAPI dependencies + auth/ + __init__.py + middleware.py # JWT validation middleware + dependencies.py # Auth dependencies + core/ + __init__.py + exceptions.py # Custom exceptions + responses.py # Response envelope helpers + logging.py # Structured logging setup + tests/ + __init__.py + conftest.py # Shared fixtures, mock catalog client + test_auth_service.py + test_member_service.py + test_checkout_service.py + test_hold_service.py + 
test_fee_service.py + test_report_service.py + test_member_routes.py + test_checkout_routes.py + test_hold_routes.py + test_fee_routes.py + test_report_routes.py + pyproject.toml +``` + +### Deployment Profile +- Write-heavy workload (checkouts/returns/holds are writes) +- Own data store (Lending DB) +- Depends on Catalog Service (HTTP) +- Runs on port 8001 + +--- + +## Build Order +1. **Catalog Service first** — Lending Service depends on it for availability verification +2. **Lending Service second** — consumes Catalog Service APIs diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/execution-plan.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/execution-plan.md new file mode 100644 index 00000000..01a96a3f --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/execution-plan.md @@ -0,0 +1,83 @@ +# Execution Plan + +## Detailed Analysis Summary + +### Change Impact Assessment +- **User-facing changes**: Yes — all 27 API endpoints serve three user roles +- **Structural changes**: Yes — two independent microservices with inter-service communication +- **Data model changes**: Yes — new data models for books, members, checkouts, holds, fees +- **API changes**: Yes — complete REST API design for both services +- **NFR impact**: Yes — performance targets, security (RBAC, JWT), scalability requirements + +### Risk Assessment +- **Risk Level**: Medium — complex business rules but well-defined requirements +- **Rollback Complexity**: Easy (greenfield, no existing system to break) +- **Testing Complexity**: Complex — two services, RBAC boundaries, lending policy edge cases + +## Workflow Visualization + +``` +Phase 1: INCEPTION + - Stage 1: Workspace Detection (COMPLETED) + - Stage 2: Reverse Engineering (SKIPPED - greenfield) + - Stage 3: Requirements Analysis (COMPLETED) + - Stage 4: User Stories (COMPLETED) + - Stage 
5: Workflow Planning (IN PROGRESS) + - Stage 6: Application Design (EXECUTE) + - Stage 7: Units Generation (EXECUTE) + +Phase 2: CONSTRUCTION + - Stage 8: Functional Design (EXECUTE) + - Stage 9: NFR Requirements (EXECUTE) + - Stage 10: NFR Design (SKIP) + - Stage 11: Infrastructure Design (SKIP) + - Stage 12: Code Generation (EXECUTE) + - Stage 13: Build and Test (EXECUTE) + +Phase 3: OPERATIONS + - Operations (PLACEHOLDER) +``` + +## Phases to Execute + +### INCEPTION PHASE +- [x] Workspace Detection (COMPLETED) +- [x] Reverse Engineering (SKIPPED — greenfield project) +- [x] Requirements Analysis (COMPLETED) +- [x] User Stories (COMPLETED) +- [x] Workflow Planning (IN PROGRESS) +- [ ] Application Design — EXECUTE + - **Rationale**: Two services need component identification, service boundaries, and dependency mapping. Critical for defining how Catalog and Lending services interact. +- [ ] Units Generation — EXECUTE + - **Rationale**: System decomposes into two services (Catalog + Lending). Units define the build order and dependency sequence. + +### CONSTRUCTION PHASE +- [ ] Functional Design — EXECUTE + - **Rationale**: Complex business rules (checkout limits, hold queues, fee calculation, renewal rules) need detailed entity and domain model specification before code generation. +- [ ] NFR Requirements — EXECUTE + - **Rationale**: Database choice (DynamoDB vs PostgreSQL), authentication approach (PyJWT + bcrypt), and performance patterns need to be decided. Tech-env lists options but defers decisions. +- [ ] NFR Design — SKIP + - **Rationale**: NFR patterns will be captured sufficiently in NFR Requirements. Given app-code-only scope (no CDK), a separate NFR Design stage would add minimal value. +- [ ] Infrastructure Design — SKIP + - **Rationale**: CDK infrastructure deferred per stakeholder decision. Application code focuses on in-memory/SQLite for local dev with database abstraction layer. No cloud infrastructure to design. 
+- [ ] Code Generation — EXECUTE (ALWAYS) + - **Rationale**: Generate complete application code for both services. +- [ ] Build and Test — EXECUTE (ALWAYS) + - **Rationale**: Install dependencies, run tests, verify coverage. + +### OPERATIONS PHASE +- [ ] Operations — PLACEHOLDER + +## Success Criteria +- **Primary Goal**: Two independently runnable FastAPI services (Catalog + Lending) with complete business logic +- **Key Deliverables**: + - Catalog Service with book CRUD, search, availability tracking + - Lending Service with auth, checkouts, returns, renewals, holds, fees, reports + - Unit and integration tests with >= 90% coverage + - OpenAPI-compatible endpoint documentation +- **Quality Gates**: + - All tests pass + - >= 90% line coverage + - RBAC enforced on all endpoints + - Business rules verified (checkout limits, hold queues, fees, renewals) + - Security extensions (SECURITY-01 through SECURITY-15) compliance verified diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/story-generation-plan.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/story-generation-plan.md new file mode 100644 index 00000000..652815af --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/story-generation-plan.md @@ -0,0 +1,20 @@ +# Story Generation Plan + +## Methodology +- **Approach**: Feature-Based with Persona mapping +- **Story Format**: "As a [role], I want to [action], so that [benefit]" +- **Acceptance Criteria Format**: Given/When/Then +- **INVEST Compliance**: All stories verified against INVEST criteria + +## Execution Steps + +- [x] Step 1: Define user personas (Admin, Librarian, Member) +- [x] Step 2: Generate Catalog Service stories (Book CRUD, Search, Availability) +- [x] Step 3: Generate Authentication stories (Registration, Login, JWT) +- [x] Step 4: Generate Lending stories (Checkout, Return, Renewal) +- [x] 
Step 5: Generate Hold Management stories (Place, Cancel, Queue, Fulfill) +- [x] Step 6: Generate Fee Management stories (Track, Pay) +- [x] Step 7: Generate Reporting stories (Overdue, Collection Summary) +- [x] Step 8: Generate Health Check stories +- [x] Step 9: Map personas to stories +- [x] Step 10: Verify INVEST criteria compliance diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/user-stories-assessment.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/user-stories-assessment.md new file mode 100644 index 00000000..e312f55e --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/plans/user-stories-assessment.md @@ -0,0 +1,24 @@ +# User Stories Assessment + +## Request Analysis +- **Original Request**: Build BookShelf Community Library API with two services (Catalog + Lending) +- **User Impact**: Direct — three user roles interact with API endpoints +- **Complexity Level**: Complex — multi-service, RBAC, lending policies, holds, fees +- **Stakeholders**: Librarian, Member, Admin + +## Assessment Criteria Met +- [x] High Priority: New user-facing functionality (API endpoints for all three roles) +- [x] High Priority: Multi-persona system (Admin, Librarian, Member roles) +- [x] High Priority: Customer-facing API consumed by frontend/mobile/Slack integrations +- [x] High Priority: Complex business logic (checkout limits, hold queues, fee calculation, renewal rules) +- [x] Medium Priority: Security enhancements (RBAC, JWT auth) + +## Decision +**Execute User Stories**: Yes +**Reasoning**: Three distinct user personas with different permissions, complex lending business rules with multiple edge cases, and acceptance criteria needed for comprehensive test coverage. Stories will clarify the exact behavior expected at each endpoint and role boundary. 
+ +## Expected Outcomes +- Clear acceptance criteria for each API endpoint mapped to personas +- Testable specifications that drive unit and integration test design +- Edge case documentation (fee thresholds, hold queue behavior, renewal limits) +- RBAC boundary clarification per story diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/requirements/requirement-verification-questions.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/requirements/requirement-verification-questions.md new file mode 100644 index 00000000..d84928b9 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/requirements/requirement-verification-questions.md @@ -0,0 +1,117 @@ +# Requirements Verification Questions — ANSWERED + +Please answer the following questions to help clarify the requirements for the BookShelf Community Library API. Fill in the letter choice after each `[Answer]:` tag. + +--- + +## Question 1 +The vision document lists open questions about late fee behavior. When a member has outstanding fees and tries to check out a book, should the system block the checkout or allow it with a warning? + +A) Block checkout — members with any outstanding fees cannot borrow until fees are paid +B) Block checkout only when outstanding fees exceed a configurable threshold (e.g., $10.00) +C) Allow checkout but include a warning in the API response indicating outstanding fees +D) Other (please describe after [Answer]: tag below) + +[Answer]: B + +## Question 2 +When a book is returned and a hold exists, should there be a grace period before the held book is made available to the next person in the queue? 
+ +A) No grace period — immediately fulfill the next hold in the queue upon return +B) Short grace period (e.g., 24 hours) allowing the returning member to check the book out again before the hold is fulfilled +C) Configurable grace period set by the library admin +D) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 3 +What should happen to a member's active holds and checkouts if an Admin deactivates their account? + +A) Cancel all active holds immediately; keep checkouts active so books are still tracked until returned +B) Cancel all active holds and mark all checkouts as requiring immediate return +C) Keep everything active — deactivation only prevents new actions (no new checkouts, holds, or renewals) +D) Other (please describe after [Answer]: tag below) + +[Answer]: C + +## Question 4 +Should late fees continue accruing indefinitely, or should they cap at a specific amount? + +A) Cap at a fixed dollar amount per checkout (e.g., $10.00 as mentioned in the vision) +B) Cap at the book's replacement value (requires a replacement_value field on books) +C) Cap at a configurable maximum set by the library admin +D) No cap — fees accrue indefinitely until the book is returned +E) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 5 +For the MVP two-service architecture, how should the Catalog Service and Lending Service be deployed? + +A) Two separate FastAPI applications, each with its own AWS Lambda function behind API Gateway +B) Two separate FastAPI applications, each deployed as an ECS Fargate container behind API Gateway +C) Let the Infrastructure Design stage determine the optimal compute choice based on NFR analysis +D) Other (please describe after [Answer]: tag below) + +[Answer]: C + +## Question 6 +Where should member authentication and management live in the two-service architecture? 
+ +A) In the Lending Service — since members are primarily lending-related entities +B) In a shared authentication layer (e.g., middleware) used by both services, with member data in the Lending Service +C) As part of both services — each service validates JWTs independently, and the Lending Service owns member data +D) Other (please describe after [Answer]: tag below) + +[Answer]: C + +## Question 7 +For the MVP asynchronous hold fulfillment, what level of implementation is expected? + +A) Full AWS messaging (SQS/SNS/EventBridge) with actual async event processing between services +B) In-process async event handling within the Lending Service (simulated event bus using Python async) +C) Simple synchronous hold check on return — update hold status directly during the return operation, defer true async to Phase 2 +D) Other (please describe after [Answer]: tag below) + +[Answer]: C + +## Question 8 +How should the two services communicate for book existence/availability verification during checkout? + +A) Direct HTTP call from Lending Service to Catalog Service's internal API +B) Shared database view (violates data isolation — not recommended) +C) Cached book data in Lending Service, synchronized via events from Catalog Service +D) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 9 +For the MVP, should the project include AWS CDK infrastructure-as-code for actual deployment, or focus on the application code with deployment deferred? + +A) Include complete CDK infrastructure code for both services (Lambda/Fargate, API Gateway, DynamoDB/RDS, SQS, etc.) +B) Include CDK infrastructure code as stubs/templates showing the architecture but not production-ready +C) Focus on application code only — provide deployment documentation but no CDK code in the MVP +D) Other (please describe after [Answer]: tag below) + +[Answer]: C + +## Question 10 +What database should be used for each service? 
+ +A) DynamoDB for both services (key-value access patterns, serverless scaling) +B) DynamoDB for Catalog Service, RDS PostgreSQL for Lending Service (lending has more relational queries) +C) RDS PostgreSQL for both services (relational queries, familiar SQL) +D) Let the NFR Requirements and Infrastructure Design stages determine the optimal choice +E) Other (please describe after [Answer]: tag below) + +[Answer]: D + +## Question 11: Security Extensions +Should security extension rules (SECURITY-01 through SECURITY-15) be enforced for this project? + +A) Yes — enforce all SECURITY rules as blocking constraints (recommended for production-grade applications) +B) No — skip all SECURITY rules (suitable for PoCs, prototypes, and experimental projects) +C) Other (please describe after [Answer]: tag below) + +[Answer]: A + diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/requirements/requirements.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/requirements/requirements.md new file mode 100644 index 00000000..5c0806da --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/requirements/requirements.md @@ -0,0 +1,297 @@ +# BookShelf Community Library API — Requirements Document + +## Intent Analysis Summary + +| Attribute | Value | +|-----------|-------| +| **User Request** | Build a cloud-deployed BookShelf platform with two independently deployable services (Catalog Service and Lending Service) for community library management | +| **Request Type** | New Project (Greenfield) | +| **Scope Estimate** | System-wide — two microservices with shared auth, event-driven communication, and comprehensive business rules | +| **Complexity Estimate** | Complex — multiple services, RBAC, async events, lending policy enforcement, inter-service communication | +| **Requirements Depth** | Comprehensive | + +--- + +## 1. 
Functional Requirements + +### 1.1 Catalog Service + +#### FR-CAT-001: Book CRUD Operations +- Add a book with: title, author, ISBN (optional), category, total_copies +- Update book metadata (title, author, ISBN, category, total_copies) +- Retrieve a single book by ID +- List all books +- Delete a book (soft-delete or hard-delete — librarian/admin only) +- All string fields have maximum length constraints + +#### FR-CAT-002: Book Search +- Full-text search across title and author fields (substring match) +- Filter by category +- Filter by availability (available copies > 0) +- Combined search + filter supported + +#### FR-CAT-003: Book Availability Tracking +- Each book tracks `total_copies` and `available_copies` +- `available_copies` decremented on checkout (via event or API call from Lending Service) +- `available_copies` incremented on return (via event or API call from Lending Service) +- `available_copies` must never go below 0 or above `total_copies` + +#### FR-CAT-004: Book Availability API +- Internal API endpoint for Lending Service to verify book existence and available copies +- Returns book ID, title, total_copies, available_copies + +#### FR-CAT-005: Health Check +- `GET /api/v1/catalog/health` returns service status and version + +### 1.2 Lending Service + +#### FR-LND-001: Member Registration +- Register with: name, email, password +- Email must be unique +- Password stored using adaptive hashing (bcrypt) +- Auto-assigned "member" role on registration +- Returns member ID and profile (excluding password) + +#### FR-LND-002: Member Authentication +- Login with email and password +- Returns JWT token with 24-hour expiry +- JWT contains: member_id, email, role +- Both services validate JWTs independently + +#### FR-LND-003: Member Profile Management +- Members can view and update their own profile (name, email) +- Admins can view any member's profile +- Librarians can view any member's profile + +#### FR-LND-004: Role-Based Access Control +- Three 
roles: Admin, Librarian, Member +- **Admin**: Full access to all endpoints in both services +- **Librarian**: Catalog management (CRUD, search), lending operations (process returns, manage holds), view reports +- **Member**: Self-service (own checkouts, own holds, own fees, own profile, search catalog) +- **Public**: Registration, login, health check — no JWT required + +#### FR-LND-005: Checkout +- Member checks out a book by book ID +- Validations before checkout: + - Book exists (verified via HTTP call to Catalog Service) + - Available copies > 0 + - Member active checkout count < 5 (configurable max) + - Member outstanding fees ≤ $10.00 threshold (configurable); block if exceeded +- On successful checkout: + - Record checkout with: member_id, book_id, checkout_date (UTC), due_date (checkout_date + 14 days), status=active, renewal_count=0 + - Decrement available_copies in Catalog Service +- Due date: 14 days from checkout date + +#### FR-LND-006: Return +- Member or Librarian/Admin returns a book by checkout ID +- On return: + - Calculate late fee if overdue: $0.25/day, capped at $10.00 per checkout + - If late fee > 0, create fee record for the member + - Update checkout status to "returned", set return_date + - Increment available_copies in Catalog Service + - Check hold queue synchronously: if holds exist for this book, fulfill the next hold (update hold status to "ready") +- All timestamps in UTC + +#### FR-LND-007: Renewal +- Member renews an active checkout by checkout ID +- Validations: + - Checkout is active (not returned) + - Checkout belongs to requesting member + - Renewal count < 2 (max 2 renewals per checkout) + - No active holds exist for the book +- On renewal: extend due_date by 14 days from current due_date, increment renewal_count + +#### FR-LND-008: Active Checkouts +- List active checkouts for the requesting member +- Each record includes: checkout_id, book_id, book_title, checkout_date, due_date, renewal_count +- Admins/Librarians can list 
active checkouts for any member + +#### FR-LND-009: Hold Placement +- Member places a hold on a book by book ID +- Validations: + - Book exists (verified via Catalog Service) + - No available copies (available_copies == 0) + - Member active hold count < 3 (configurable max) + - Member does not already have an active hold on this book +- On placement: record hold with member_id, book_id, hold_date, status=waiting, queue_position (FIFO) + +#### FR-LND-010: Hold Cancellation +- Member cancels their own hold by hold ID +- Librarian/Admin can cancel any hold +- On cancellation: update status to "cancelled", re-order queue positions + +#### FR-LND-011: Hold Queue Status +- Get hold queue for a specific book: list of holds with position and status +- Member can see their own position in the queue + +#### FR-LND-012: Hold Fulfillment (Synchronous MVP) +- When a book is returned and holds exist in "waiting" status for that book: + - Fulfill the first hold in FIFO order + - Update hold status from "waiting" to "ready" + - No grace period — immediate fulfillment +- True asynchronous processing (SQS/EventBridge) deferred to Phase 2 + +#### FR-LND-013: Fee Tracking +- Track outstanding fees per member +- Fee record includes: fee_id, member_id, checkout_id, amount, created_date, status (outstanding/paid/partial) +- List all fees for the requesting member +- Admins/Librarians can view fees for any member + +#### FR-LND-014: Fee Payment +- Record a payment against a member's outstanding fees +- Partial payments allowed +- Admins and Librarians can process payments +- Payment record includes: payment_id, member_id, amount, payment_date + +#### FR-LND-015: Overdue Report +- List all currently overdue checkouts (due_date < now and status=active) +- Include: member name, member email, book title, checkout_date, due_date, days_overdue +- Accessible by Librarian and Admin roles only + +#### FR-LND-016: Collection Summary +- Total books, total members, books checked out, books available, 
total outstanding fees +- Accessible by Admin role only + +#### FR-LND-017: Health Check +- `GET /api/v1/lending/health` returns service status and version + +### 1.3 Inter-Service Communication + +#### FR-ISC-001: Book Verification +- Lending Service calls Catalog Service via HTTP to verify book existence and availability before checkout/hold placement +- Direct HTTP call (not event-driven) for synchronous validation + +#### FR-ISC-002: Availability Updates +- Lending Service calls Catalog Service via HTTP to decrement/increment available_copies on checkout/return +- Atomic operation — if update fails, the checkout/return operation must also fail + +### 1.4 Account Deactivation Behavior + +#### FR-ACC-001: Account Deactivation +- Admin can deactivate a member account +- Deactivated accounts: existing holds and checkouts remain active, but no new checkouts, holds, or renewals allowed +- Account suspension is out of MVP scope (automated suspension deferred to Phase 2) + +--- + +## 2. Non-Functional Requirements + +### 2.1 Performance +| Metric | Target | +|--------|--------| +| API response time (p95) | < 100ms | +| Full-text search latency | < 200ms | +| Concurrent users | 100 simultaneous | +| Inter-service call overhead | < 50ms added latency | + +### 2.2 Reliability +| Metric | Target | +|--------|--------| +| API uptime | 99.9% | +| Data durability | Multi-AZ replication | + +### 2.3 Security +- RBAC with three roles enforced on all endpoints +- JWT authentication with 24-hour expiry +- Password hashing with bcrypt +- Input validation via Pydantic on all endpoints +- No PII in logs +- Encryption at rest and in transit +- Security extension rules (SECURITY-01 through SECURITY-15) enforced + +### 2.4 Scalability +- Each service scales independently +- Catalog Service optimized for read-heavy workload +- Lending Service optimized for write-heavy workload +- Single-library deployment for MVP (multi-tenant in Phase 2) + +### 2.5 Testing +| Test Type | Target | 
+|-----------|--------| +| Unit test coverage | ≥ 90% line coverage | +| Integration tests | All API endpoints tested | +| Contract tests | All endpoints match OpenAPI spec | + +### 2.6 Maintainability +- Solo developer maintainable +- Infrastructure cost < $50/month +- Clean separation between services +- Consistent API envelope format + +--- + +## 3. API Design Standards + +- **Style**: REST with JSON +- **Versioning**: URL path prefix `/api/v1/` +- **Field naming**: snake_case +- **Success envelope**: `{ "status": "ok", "data": { ... } }` +- **Error envelope**: `{ "status": "error", "error": { "code": "ERROR_CODE", "message": "..." } }` +- **Error codes**: VALIDATION_ERROR (422), NOT_FOUND (404), UNAUTHORIZED (401), FORBIDDEN (403), CONFLICT (409), INTERNAL_ERROR (500) + +--- + +## 4. Technology Stack + +- **Language**: Python 3.13+ +- **Framework**: FastAPI 0.115+ with Pydantic 2.x +- **Server**: uvicorn 0.34+ +- **Testing**: pytest 8.x, pytest-asyncio, httpx, pytest-cov +- **Linting**: ruff 0.9+ +- **Package manager**: uv +- **Infrastructure**: AWS CDK 2.x (deferred — application code only for MVP) +- **Cloud**: AWS us-east-1 +- **Database**: To be determined during NFR Requirements / Infrastructure Design stages + +--- + +## 5. 
Architectural Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| Service architecture | Two independent services (Catalog + Lending) | Vision requirement; independent scaling | +| Authentication | Application-level JWT (PyJWT + bcrypt); both services validate JWTs | Simpler than Cognito for MVP; member data owned by Lending Service | +| Inter-service communication (sync) | Direct HTTP calls via httpx | Simple, testable; Lending → Catalog for book verification | +| Hold fulfillment | Synchronous on return | Simpler for MVP; true async deferred to Phase 2 | +| Fee threshold for checkout | Block checkout if outstanding fees > $10.00 | Configurable threshold; balances enforcement with usability | +| Late fee cap | $10.00 per checkout | Fixed cap as stated in vision | +| CDK infrastructure | Deferred — app code only for MVP | Focus on working application; deployment docs provided | +| Database | Deferred to NFR stages | Tech-env lists DynamoDB and RDS as options; choice depends on access patterns | + +--- + +## 6. MVP Scope Boundaries + +### In Scope +- Book CRUD, search, availability tracking +- Member registration, login, JWT auth, profile management +- RBAC (Admin, Librarian, Member) +- Checkout, return, renewal with full policy enforcement +- Hold placement, cancellation, queue status, synchronous fulfillment +- Fee tracking and payment +- Overdue report and collection summary +- Health checks for both services +- Unit and integration tests (≥90% coverage) +- OpenAPI specification + +### Out of Scope (Deferred) +- Email notifications (Phase 2) +- Multi-tenant support (Phase 2) +- Barcode/ISBN scanning (Phase 2) +- Advanced analytics (Phase 2) +- Recommendation engine (Phase 3) +- Account suspension automation (Phase 2) +- Password reset (Phase 2) +- Pagination (Phase 2) +- CDK infrastructure code (deployment docs only) + +--- + +## 7. 
Open Decisions for Later Stages + +| Decision | Stage | +|----------|-------| +| Database choice (DynamoDB vs RDS PostgreSQL) | NFR Requirements / Infrastructure Design | +| Compute choice (Lambda vs Fargate) | NFR Requirements / Infrastructure Design | +| API Gateway configuration | Infrastructure Design | +| Messaging infrastructure for Phase 2 | Infrastructure Design | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/user-stories/personas.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/user-stories/personas.md new file mode 100644 index 00000000..95d5d3ed --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/user-stories/personas.md @@ -0,0 +1,40 @@ +# User Personas + +## Persona 1: Librarian (Volunteer Library Manager) + +| Attribute | Detail | +|-----------|--------| +| **Name** | Pat the Librarian | +| **Role** | Librarian | +| **Description** | Volunteer who manages the physical collection at a community library. Adds new books, processes returns, resolves lending disputes, and monitors overdue items. | +| **Goals** | Efficiently manage the catalog, see who has overdue books at a glance, process returns quickly, maintain accurate inventory | +| **Frustrations** | Manual spreadsheet tracking, chasing down overdue books via email, lost books with no accountability | +| **Technical Comfort** | Moderate — comfortable using web interfaces and APIs but not a developer | +| **Usage Pattern** | Uses the system during library open hours (evenings/weekends). Burst usage when processing donations or returns. 
| +| **Key Endpoints** | Book CRUD, book search, return processing, overdue report, hold management, fee payment processing | + +## Persona 2: Member (Library Borrower) + +| Attribute | Detail | +|-----------|--------| +| **Name** | Alex the Member | +| **Role** | Member | +| **Description** | Community member who borrows and returns books. Browses the catalog, places holds on popular books, and manages their own account. | +| **Goals** | Find available books easily, check out quickly, know when holds become available, track due dates | +| **Frustrations** | Not knowing which books are available, unfair access to popular books, forgetting due dates | +| **Technical Comfort** | Varies — the API will be consumed by a frontend app on their behalf | +| **Usage Pattern** | Sporadic — browses catalog weekly, checks out 1-3 books per month, returns within lending period | +| **Key Endpoints** | Search, checkout, return, renewal, hold placement/cancellation, active checkouts, fee viewing, profile management | + +## Persona 3: Admin (Library Operations Manager) + +| Attribute | Detail | +|-----------|--------| +| **Name** | Sam the Admin | +| **Role** | Admin | +| **Description** | Person responsible for the library's overall operations. Manages member accounts, configures lending policies, views operational reports. | +| **Goals** | Full oversight of library operations, manage member issues, review usage reports, ensure fair policy enforcement | +| **Frustrations** | No visibility into library usage, inability to manage problem members, lack of operational metrics | +| **Technical Comfort** | High — comfortable with admin dashboards, API tools, and data analysis | +| **Usage Pattern** | Daily monitoring. Occasional member management actions. Weekly report reviews. 
| +| **Key Endpoints** | All endpoints (full access), collection summary, overdue report, member management, fee payment, account deactivation | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/user-stories/stories.md b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/user-stories/stories.md new file mode 100644 index 00000000..8e03d58b --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden-aidlc-docs/aidlc-docs/inception/user-stories/stories.md @@ -0,0 +1,315 @@ +# User Stories + +## Epic 1: Catalog Management + +### US-CAT-001: Add a Book +**As a** Librarian, **I want to** add a new book to the catalog with title, author, ISBN, category, and copy count, **so that** members can discover and borrow it. + +**Acceptance Criteria:** +- Given I am authenticated as a Librarian or Admin +- When I POST to /api/v1/books with valid book data +- Then the book is created with a unique ID, available_copies = total_copies, and returned in the response +- And if any required field is missing or invalid, I receive a 422 VALIDATION_ERROR +- And if I am a Member, I receive a 403 FORBIDDEN + +### US-CAT-002: Update a Book +**As a** Librarian, **I want to** update book metadata (title, author, ISBN, category, total_copies), **so that** the catalog stays accurate. + +**Acceptance Criteria:** +- Given I am authenticated as a Librarian or Admin +- When I PUT to /api/v1/books/{book_id} with updated fields +- Then the book is updated and the updated record is returned +- And if the book does not exist, I receive a 404 NOT_FOUND +- And if total_copies is reduced below currently checked-out copies, I receive a 409 CONFLICT + +### US-CAT-003: Get a Book +**As a** Member, **I want to** view details of a specific book, **so that** I can decide whether to borrow it. 
+ +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/books/{book_id} +- Then I receive the book details including available_copies +- And if the book does not exist, I receive a 404 NOT_FOUND + +### US-CAT-004: List All Books +**As a** Member, **I want to** browse the complete catalog, **so that** I can discover books available in the library. + +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/books +- Then I receive a list of all books with their availability + +### US-CAT-005: Delete a Book +**As a** Librarian, **I want to** remove a book from the catalog, **so that** decommissioned books no longer appear. + +**Acceptance Criteria:** +- Given I am authenticated as a Librarian or Admin +- When I DELETE /api/v1/books/{book_id} +- Then the book is removed from the catalog +- And if the book has active checkouts, I receive a 409 CONFLICT +- And if the book does not exist, I receive a 404 NOT_FOUND + +### US-CAT-006: Search Books +**As a** Member, **I want to** search for books by title or author and filter by category and availability, **so that** I can find specific books quickly. + +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/books/search?q=python&category=programming&available=true +- Then I receive books matching the search term in title or author, filtered by category and availability +- And if no books match, I receive an empty list (not an error) + +--- + +## Epic 2: Member Management and Authentication + +### US-AUTH-001: Register as a Member +**As a** new user, **I want to** register with my name, email, and password, **so that** I can start borrowing books. 
+ +**Acceptance Criteria:** +- Given I am not authenticated (public endpoint) +- When I POST to /api/v1/members/register with name, email, password +- Then my account is created with the "member" role and I receive my profile (without password) +- And if the email is already registered, I receive a 409 CONFLICT +- And if the password is too short (< 8 chars), I receive a 422 VALIDATION_ERROR + +### US-AUTH-002: Login +**As a** registered user, **I want to** log in with my email and password, **so that** I receive a JWT token for API access. + +**Acceptance Criteria:** +- Given I am not authenticated (public endpoint) +- When I POST to /api/v1/members/login with valid credentials +- Then I receive a JWT token with 24-hour expiry containing my member_id, email, and role +- And if credentials are invalid, I receive a 401 UNAUTHORIZED + +### US-AUTH-003: View My Profile +**As a** Member, **I want to** view my profile information, **so that** I can verify my account details. + +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/members/me +- Then I receive my profile (name, email, role, active status) + +### US-AUTH-004: Update My Profile +**As a** Member, **I want to** update my name and email, **so that** I can keep my information current. + +**Acceptance Criteria:** +- Given I am authenticated +- When I PUT /api/v1/members/me with updated name or email +- Then my profile is updated and the new profile is returned +- And if the new email is already in use, I receive a 409 CONFLICT + +### US-AUTH-005: View Any Member Profile (Admin/Librarian) +**As an** Admin or Librarian, **I want to** view any member's profile, **so that** I can manage member accounts. 
+ +**Acceptance Criteria:** +- Given I am authenticated as Admin or Librarian +- When I GET /api/v1/members/{member_id} +- Then I receive that member's profile +- And if the member does not exist, I receive a 404 NOT_FOUND +- And if I am a Member, I receive a 403 FORBIDDEN + +### US-AUTH-006: Deactivate Member Account +**As an** Admin, **I want to** deactivate a member account, **so that** they cannot perform new actions while their existing obligations remain tracked. + +**Acceptance Criteria:** +- Given I am authenticated as Admin +- When I PUT /api/v1/members/{member_id}/deactivate +- Then the member's account is marked inactive +- And the member's existing checkouts and holds remain active +- And the member cannot create new checkouts, holds, or renewals +- And if I am not Admin, I receive a 403 FORBIDDEN + +--- + +## Epic 3: Lending Operations + +### US-LND-001: Checkout a Book +**As a** Member, **I want to** check out a book, **so that** I can borrow it for 14 days. + +**Acceptance Criteria:** +- Given I am authenticated as a Member +- When I POST to /api/v1/checkouts with book_id +- Then a checkout record is created with due_date = now + 14 days and status = active +- And available_copies is decremented in the Catalog Service +- And if the book does not exist, I receive a 404 NOT_FOUND +- And if no copies are available, I receive a 409 CONFLICT +- And if I have 5 active checkouts already, I receive a 409 CONFLICT with "checkout limit exceeded" +- And if my outstanding fees exceed $10.00, I receive a 409 CONFLICT with "outstanding fees exceed threshold" + +### US-LND-002: Return a Book +**As a** Member or Librarian, **I want to** return a checked-out book, **so that** it becomes available for others. 
+ +**Acceptance Criteria:** +- Given I am authenticated +- When I POST to /api/v1/checkouts/{checkout_id}/return +- Then the checkout status is updated to "returned" with return_date +- And available_copies is incremented in the Catalog Service +- And if the book is overdue, a late fee is calculated ($0.25/day, capped at $10.00) and a fee record is created +- And if there are active holds for this book, the first hold in FIFO order is updated to "ready" +- And Members can only return their own checkouts; Librarians/Admins can return any + +### US-LND-003: Renew a Checkout +**As a** Member, **I want to** renew my checkout, **so that** I can keep the book for another 14 days. + +**Acceptance Criteria:** +- Given I am authenticated as the checkout owner +- When I POST to /api/v1/checkouts/{checkout_id}/renew +- Then the due_date is extended by 14 days and renewal_count is incremented +- And if renewal_count is already 2, I receive a 409 CONFLICT with "renewal limit exceeded" +- And if there are active holds for this book, I receive a 409 CONFLICT with "book has active holds" +- And if the checkout is not active, I receive a 409 CONFLICT + +### US-LND-004: View My Active Checkouts +**As a** Member, **I want to** see my active checkouts, **so that** I know what books I have and when they are due. + +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/checkouts?status=active +- Then I receive my active checkouts with book_id, book_title, checkout_date, due_date, renewal_count +- And Admins/Librarians can specify a member_id query parameter to view any member's checkouts + +--- + +## Epic 4: Hold Management + +### US-HLD-001: Place a Hold +**As a** Member, **I want to** place a hold on an unavailable book, **so that** I can get fair access when it is returned. 
+ +**Acceptance Criteria:** +- Given I am authenticated as a Member +- When I POST to /api/v1/holds with book_id +- Then a hold record is created with status=waiting and a FIFO queue position +- And if the book has available copies, I receive a 409 CONFLICT with "book is available for checkout" +- And if I already have a hold on this book, I receive a 409 CONFLICT with "duplicate hold" +- And if I have 3 active holds already, I receive a 409 CONFLICT with "hold limit exceeded" + +### US-HLD-002: Cancel a Hold +**As a** Member, **I want to** cancel my hold, **so that** others can move up in the queue. + +**Acceptance Criteria:** +- Given I am authenticated +- When I DELETE /api/v1/holds/{hold_id} +- Then the hold is cancelled and remaining queue positions are re-ordered +- And Members can only cancel their own holds; Librarians/Admins can cancel any hold + +### US-HLD-003: View Hold Queue +**As a** Member, **I want to** see my position in the hold queue for a book, **so that** I know how long I might wait. + +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/holds?book_id={book_id} +- Then I receive the hold queue with positions and statuses +- And I can identify my own position in the queue + +### US-HLD-004: View My Holds +**As a** Member, **I want to** see all my active holds, **so that** I can manage my hold queue. + +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/holds/me +- Then I receive all my holds with book_id, status (waiting/ready/cancelled), queue_position, hold_date + +--- + +## Epic 5: Fee Management + +### US-FEE-001: View My Fees +**As a** Member, **I want to** see my outstanding fees, **so that** I know what I owe. 
+ +**Acceptance Criteria:** +- Given I am authenticated +- When I GET /api/v1/fees/me +- Then I receive a list of my fees with fee_id, amount, status, created_date, checkout_id +- And I can see a total outstanding balance + +### US-FEE-002: View Any Member's Fees +**As a** Librarian or Admin, **I want to** view any member's fees, **so that** I can assist with fee inquiries. + +**Acceptance Criteria:** +- Given I am authenticated as Librarian or Admin +- When I GET /api/v1/fees?member_id={member_id} +- Then I receive that member's fee records + +### US-FEE-003: Process Fee Payment +**As a** Librarian or Admin, **I want to** record a fee payment for a member, **so that** their balance is updated. + +**Acceptance Criteria:** +- Given I am authenticated as Librarian or Admin +- When I POST to /api/v1/fees/payments with member_id and amount +- Then the payment is recorded and applied to outstanding fees +- And partial payments are allowed (outstanding fees reduced by payment amount) +- And the response includes the new outstanding balance + +--- + +## Epic 6: Reporting + +### US-RPT-001: View Overdue Report +**As a** Librarian, **I want to** see all overdue checkouts, **so that** I can follow up with members. + +**Acceptance Criteria:** +- Given I am authenticated as Librarian or Admin +- When I GET /api/v1/reports/overdue +- Then I receive a list of overdue checkouts with member_name, member_email, book_title, checkout_date, due_date, days_overdue +- And if I am a Member, I receive a 403 FORBIDDEN + +### US-RPT-002: View Collection Summary +**As an** Admin, **I want to** see a summary of library operations, **so that** I have a dashboard view. 
+ +**Acceptance Criteria:** +- Given I am authenticated as Admin +- When I GET /api/v1/reports/summary +- Then I receive: total_books, total_members, books_checked_out, books_available, total_outstanding_fees +- And if I am not Admin, I receive a 403 FORBIDDEN + +--- + +## Epic 7: System Operations + +### US-SYS-001: Catalog Health Check +**As a** monitoring system, **I want to** check the Catalog Service health, **so that** I can detect outages. + +**Acceptance Criteria:** +- When I GET /api/v1/catalog/health (public endpoint) +- Then I receive status and version information + +### US-SYS-002: Lending Health Check +**As a** monitoring system, **I want to** check the Lending Service health, **so that** I can detect outages. + +**Acceptance Criteria:** +- When I GET /api/v1/lending/health (public endpoint) +- Then I receive status and version information + +--- + +## Story-Persona Mapping + +| Story | Admin | Librarian | Member | Public | +|-------|-------|-----------|--------|--------| +| US-CAT-001 Add Book | ✅ | ✅ | ❌ | ❌ | +| US-CAT-002 Update Book | ✅ | ✅ | ❌ | ❌ | +| US-CAT-003 Get Book | ✅ | ✅ | ✅ | ❌ | +| US-CAT-004 List Books | ✅ | ✅ | ✅ | ❌ | +| US-CAT-005 Delete Book | ✅ | ✅ | ❌ | ❌ | +| US-CAT-006 Search Books | ✅ | ✅ | ✅ | ❌ | +| US-AUTH-001 Register | ❌ | ❌ | ❌ | ✅ | +| US-AUTH-002 Login | ❌ | ❌ | ❌ | ✅ | +| US-AUTH-003 View My Profile | ✅ | ✅ | ✅ | ❌ | +| US-AUTH-004 Update Profile | ✅ | ✅ | ✅ | ❌ | +| US-AUTH-005 View Any Profile | ✅ | ✅ | ❌ | ❌ | +| US-AUTH-006 Deactivate | ✅ | ❌ | ❌ | ❌ | +| US-LND-001 Checkout | ✅ | ✅ | ✅ | ❌ | +| US-LND-002 Return | ✅ | ✅ | ✅(own) | ❌ | +| US-LND-003 Renew | ✅ | ✅ | ✅(own) | ❌ | +| US-LND-004 Active Checkouts | ✅(any) | ✅(any) | ✅(own) | ❌ | +| US-HLD-001 Place Hold | ✅ | ✅ | ✅ | ❌ | +| US-HLD-002 Cancel Hold | ✅(any) | ✅(any) | ✅(own) | ❌ | +| US-HLD-003 Hold Queue | ✅ | ✅ | ✅ | ❌ | +| US-HLD-004 My Holds | ✅ | ✅ | ✅ | ❌ | +| US-FEE-001 My Fees | ✅ | ✅ | ✅ | ❌ | +| US-FEE-002 Any Fees | ✅ | ✅ | ❌ | ❌ | +| 
US-FEE-003 Payment | ✅ | ✅ | ❌ | ❌ | +| US-RPT-001 Overdue | ✅ | ✅ | ❌ | ❌ | +| US-RPT-002 Summary | ✅ | ❌ | ❌ | ❌ | +| US-SYS-001 Catalog Health | ✅ | ✅ | ✅ | ✅ | +| US-SYS-002 Lending Health | ✅ | ✅ | ✅ | ✅ | diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/golden.yaml b/scripts/aidlc-evaluator/test_cases/all-stages/golden.yaml new file mode 100644 index 00000000..269251aa --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/golden.yaml @@ -0,0 +1,60 @@ +run_folder: runs/20260227T004722-aidlc-workflows_main +promoted_at: '2026-02-27T12:00:00+00:00' +executor_model: global.anthropic.claude-opus-4-6-v1 +simulator_model: global.anthropic.claude-opus-4-6-v1 +execution: + wall_clock_ms: 1615170 + total_tokens: 28262477 + input_tokens: 28091382 + output_tokens: 171095 + handoffs: 3 + executor: + input_tokens: 14043505 + output_tokens: 85683 + total_tokens: 14129188 + simulator: + input_tokens: 40914 + output_tokens: 916 + total_tokens: 41830 +context_size: + max_tokens: 170689 + avg_tokens: 85880 + median_tokens: 92422 +artifacts: + source_files: 1822 + test_files: 28 + total_files: 3168 + lines_of_code: 718697 + doc_files: 24 +unit_tests: + passed: 101 + failed: 0 + total: 101 + pass_pct: 100.0 + coverage_pct: null +contract_tests: + passed: 0 + failed: 0 + total: 50 + notes: "Server module path mismatch — openapi.yaml x-app.module needs updating for two-service architecture" +code_quality: + lint_errors: 3 + lint_warnings: 6 + lint_total: 9 + security_total: 1 + security_high: 0 +stages_executed: + - Workspace Detection + - Requirements Analysis + - User Stories + - Workflow Planning + - Application Design + - Units Generation + - Functional Design + - NFR Requirements + - Code Generation + - Build and Test +stages_skipped: + - Reverse Engineering (greenfield) + - NFR Design + - Infrastructure Design diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/openapi.yaml b/scripts/aidlc-evaluator/test_cases/all-stages/openapi.yaml new file mode 
100644 index 00000000..918b939a --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/openapi.yaml @@ -0,0 +1,614 @@ +# BookShelf Community Library API - Contract Test Specification +# +# This is a first-class project input, supplied alongside vision.md and +# tech-env.md. The contract-test harness reads this spec, discovers every +# operation, and runs the embedded x-test-cases against the live server. +# +# NOTE: Tests run against an in-memory or mocked DynamoDB backend. +# The app must be importable as bookshelf.app:app and respond on /health. +# Auth endpoints use a test JWT setup; protected endpoints include a +# valid Bearer token obtained from /api/v1/auth/login. +# +# Security note: This is a test specification with mocked authentication. +# Global security fields are omitted as auth is handled per-endpoint in tests. +# checkov:skip=CKV_OPENAPI_4:Test spec - security handled per-endpoint +# checkov:skip=CKV_OPENAPI_5:Test spec - security handled per-endpoint + +openapi: "3.1.0" + +info: + title: BookShelf Community Library API + version: "0.1.0" + description: | + REST API for community library book catalog, lending, and fee management. + +x-app: + module: "bookshelf.app:app" + framework: fastapi + startup_timeout: 15 + +# --------------------------------------------------------------------------- +# Test setup helpers +# --------------------------------------------------------------------------- +# The contract test harness will call setup endpoints (register + login) before +# running the main test suite. x-test-setup defines the bootstrapping sequence. + +x-test-setup: + - name: "Register admin user" + method: POST + path: /api/v1/auth/register + body: + name: "Test Admin" + email: "admin@test.com" + password: "AdminPass123!" + role: "admin" + expected_status: 201 + + - name: "Register librarian user" + method: POST + path: /api/v1/auth/register + body: + name: "Test Librarian" + email: "librarian@test.com" + password: "LibPass123!" 
+ role: "librarian" + expected_status: 201 + + - name: "Register member user" + method: POST + path: /api/v1/auth/register + body: + name: "Test Member" + email: "member@test.com" + password: "MemPass123!" + role: "member" + expected_status: 201 + + - name: "Login as admin" + method: POST + path: /api/v1/auth/login + body: + email: "admin@test.com" + password: "AdminPass123!" + expected_status: 200 + save_token_as: "admin_token" + + - name: "Login as librarian" + method: POST + path: /api/v1/auth/login + body: + email: "librarian@test.com" + password: "LibPass123!" + expected_status: 200 + save_token_as: "librarian_token" + + - name: "Login as member" + method: POST + path: /api/v1/auth/login + body: + email: "member@test.com" + password: "MemPass123!" + expected_status: 200 + save_token_as: "member_token" + + - name: "Add test book 1 (available)" + method: POST + path: /api/v1/books + auth: "librarian_token" + body: + title: "Fluent Python" + author: "Luciano Ramalho" + isbn: "9781492056355" + category: "Programming" + total_copies: 2 + expected_status: 201 + save_id_as: "book1_id" + + - name: "Add test book 2 (single copy)" + method: POST + path: /api/v1/books + auth: "librarian_token" + body: + title: "Designing Data-Intensive Applications" + author: "Martin Kleppmann" + isbn: "9781449373320" + category: "Engineering" + total_copies: 1 + expected_status: 201 + save_id_as: "book2_id" + + - name: "Add test book 3 (for deletion test)" + method: POST + path: /api/v1/books + auth: "librarian_token" + body: + title: "Temporary Book" + author: "Test Author" + isbn: "9780000000001" + category: "Other" + total_copies: 1 + expected_status: 201 + save_id_as: "book3_id" + + # Check out book 2's only copy so the hold tests ("Place hold on + # unavailable book") actually target an unavailable book; per US-HLD-001 + # a hold on an available book must return 409 CONFLICT. + - name: "Checkout book 2 (make it unavailable for hold tests)" + method: POST + path: /api/v1/checkouts + auth: "admin_token" + body: + book_id: "{book2_id}" + expected_status: 201 + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +paths: + + # ── Health ─────────────────────────────────────────────────────────────── + + /health: + get: + operationId: healthCheck + 
summary: Health check + x-test-cases: + - name: "Health check returns ok" + expected_status: 200 + expected_body: + status: "ok" + + # ── Auth ───────────────────────────────────────────────────────────────── + + /api/v1/auth/register: + post: + operationId: registerMember + summary: Register a new member + x-test-cases: + - name: "Register new member" + body: + name: "New User" + email: "newuser@test.com" + password: "NewPass123!" + expected_status: 201 + expected_body: + status: "ok" + + - name: "Register with duplicate email" + body: + name: "Duplicate" + email: "member@test.com" + password: "DupPass123!" + expected_status: 409 + expected_body: + status: "error" + + - name: "Register with missing fields" + body: + name: "No Email" + expected_status: 422 + + /api/v1/auth/login: + post: + operationId: loginMember + summary: Login and receive JWT token + x-test-cases: + - name: "Login with valid credentials" + body: + email: "member@test.com" + password: "MemPass123!" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Login with wrong password" + body: + email: "member@test.com" + password: "WrongPass!" + expected_status: 401 + expected_body: + status: "error" + + - name: "Login with nonexistent email" + body: + email: "nobody@test.com" + password: "Whatever123!" + expected_status: 401 + + # ── Books (Catalog) ───────────────────────────────────────────────────── + + /api/v1/books: + get: + operationId: listBooks + summary: List all books + x-test-cases: + - name: "List books returns all books" + auth: "member_token" + expected_status: 200 + expected_body: + status: "ok" + + - name: "List books without auth" + expected_status: 401 + + post: + operationId: addBook + summary: Add a new book (Librarian/Admin only) + x-test-cases: + - name: "Add book as librarian" + auth: "librarian_token" + body: + title: "Clean Code" + author: "Robert C. 
Martin" + isbn: "9780132350884" + category: "Programming" + total_copies: 3 + expected_status: 201 + expected_body: + status: "ok" + + - name: "Add book as member (forbidden)" + auth: "member_token" + body: + title: "Unauthorized Book" + author: "Nobody" + isbn: "9780000000099" + category: "Other" + total_copies: 1 + expected_status: 403 + + - name: "Add book with missing title" + auth: "librarian_token" + body: + author: "No Title Author" + isbn: "9780000000098" + category: "Other" + total_copies: 1 + expected_status: 422 + + - name: "Add book with zero copies" + auth: "librarian_token" + body: + title: "Zero Copies" + author: "Author" + isbn: "9780000000097" + category: "Other" + total_copies: 0 + expected_status: 422 + + /api/v1/books/search: + get: + operationId: searchBooks + summary: Search books by title or author + parameters: + - name: q + in: query + required: true + schema: + type: string + - name: category + in: query + required: false + schema: + type: string + x-test-cases: + - name: "Search by title" + auth: "member_token" + query: + q: "Fluent" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Search by author" + auth: "member_token" + query: + q: "Kleppmann" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Search with no results" + auth: "member_token" + query: + q: "NonexistentBookTitle12345" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Search by category filter" + auth: "member_token" + query: + q: "" + category: "Programming" + expected_status: 200 + expected_body: + status: "ok" + + /api/v1/books/{book_id}: + get: + operationId: getBook + summary: Get book details + x-test-cases: + - name: "Get existing book" + auth: "member_token" + path_params: + book_id: "{book1_id}" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Get nonexistent book" + auth: "member_token" + path_params: + book_id: "00000000-0000-0000-0000-000000000000" + expected_status: 404 + + put: + 
operationId: updateBook + summary: Update a book (Librarian/Admin only) + x-test-cases: + - name: "Update book copies as librarian" + auth: "librarian_token" + path_params: + book_id: "{book1_id}" + body: + total_copies: 3 + expected_status: 200 + expected_body: + status: "ok" + + - name: "Update book as member (forbidden)" + auth: "member_token" + path_params: + book_id: "{book1_id}" + body: + total_copies: 5 + expected_status: 403 + + delete: + operationId: deleteBook + summary: Delete a book (Admin only) + x-test-cases: + - name: "Delete book as admin" + auth: "admin_token" + path_params: + book_id: "{book3_id}" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Delete book as librarian (forbidden)" + auth: "librarian_token" + path_params: + book_id: "{book1_id}" + expected_status: 403 + + - name: "Delete nonexistent book" + auth: "admin_token" + path_params: + book_id: "00000000-0000-0000-0000-000000000000" + expected_status: 404 + + # ── Checkouts (Lending) ───────────────────────────────────────────────── + + /api/v1/checkouts: + post: + operationId: checkoutBook + summary: Check out a book + x-test-cases: + - name: "Checkout available book" + auth: "member_token" + body: + book_id: "{book1_id}" + expected_status: 201 + expected_body: + status: "ok" + + - name: "Checkout same book again (still has copies)" + auth: "member_token" + body: + book_id: "{book1_id}" + expected_status: 201 + + - name: "Checkout without auth" + body: + book_id: "{book1_id}" + expected_status: 401 + + - name: "Checkout nonexistent book" + auth: "member_token" + body: + book_id: "00000000-0000-0000-0000-000000000000" + expected_status: 404 + + get: + operationId: listCheckouts + summary: List active checkouts for current member + x-test-cases: + - name: "List own checkouts" + auth: "member_token" + expected_status: 200 + expected_body: + status: "ok" + + - name: "List checkouts without auth" + expected_status: 401 + + /api/v1/checkouts/{checkout_id}/return: + post: + 
operationId: returnBook + summary: Return a checked-out book + x-test-cases: + - name: "Return without auth" + path_params: + checkout_id: "00000000-0000-0000-0000-000000000000" + expected_status: 401 + + /api/v1/checkouts/{checkout_id}/renew: + post: + operationId: renewCheckout + summary: Renew a checkout (extends due date) + x-test-cases: + - name: "Renew without auth" + path_params: + checkout_id: "00000000-0000-0000-0000-000000000000" + expected_status: 401 + + # ── Holds ──────────────────────────────────────────────────────────────── + + /api/v1/holds: + post: + operationId: placeHold + summary: Place a hold on a book + x-test-cases: + - name: "Place hold on unavailable book" + auth: "member_token" + body: + book_id: "{book2_id}" + expected_status: 201 + expected_body: + status: "ok" + + - name: "Place duplicate hold (conflict)" + auth: "member_token" + body: + book_id: "{book2_id}" + expected_status: 409 + + - name: "Place hold without auth" + body: + book_id: "{book2_id}" + expected_status: 401 + + get: + operationId: listHolds + summary: List active holds for current member + x-test-cases: + - name: "List own holds" + auth: "member_token" + expected_status: 200 + expected_body: + status: "ok" + + /api/v1/holds/{hold_id}: + delete: + operationId: cancelHold + summary: Cancel a hold + x-test-cases: + - name: "Cancel nonexistent hold" + auth: "member_token" + path_params: + hold_id: "00000000-0000-0000-0000-000000000000" + expected_status: 404 + + /api/v1/holds/queue/{book_id}: + get: + operationId: getHoldQueue + summary: Get hold queue position for a book + x-test-cases: + - name: "Get hold queue for book" + auth: "member_token" + path_params: + book_id: "{book2_id}" + expected_status: 200 + expected_body: + status: "ok" + + # ── Fees ───────────────────────────────────────────────────────────────── + + /api/v1/members/{member_id}/fees: + get: + operationId: getMemberFees + summary: Get outstanding fees for a member + x-test-cases: + - name: "Get own fees 
(no outstanding)" + auth: "member_token" + path_params: + member_id: "me" + expected_status: 200 + expected_body: + status: "ok" + + /api/v1/members/{member_id}/fees/pay: + post: + operationId: payFee + summary: Record a fee payment + x-test-cases: + - name: "Pay fee without auth" + path_params: + member_id: "me" + body: + amount: 1.00 + expected_status: 401 + + # ── Members ────────────────────────────────────────────────────────────── + + /api/v1/members: + get: + operationId: listMembers + summary: List all members (Admin only) + x-test-cases: + - name: "List members as admin" + auth: "admin_token" + expected_status: 200 + expected_body: + status: "ok" + + - name: "List members as member (forbidden)" + auth: "member_token" + expected_status: 403 + + /api/v1/members/{member_id}: + get: + operationId: getMember + summary: Get member profile + x-test-cases: + - name: "Get own profile" + auth: "member_token" + path_params: + member_id: "me" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Get member profile as admin" + auth: "admin_token" + path_params: + member_id: "me" + expected_status: 200 + + # ── Reports ────────────────────────────────────────────────────────────── + + /api/v1/reports/overdue: + get: + operationId: overdueReport + summary: List all overdue checkouts (Librarian/Admin only) + x-test-cases: + - name: "Get overdue report as librarian" + auth: "librarian_token" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Get overdue report as member (forbidden)" + auth: "member_token" + expected_status: 403 + + /api/v1/reports/summary: + get: + operationId: collectionSummary + summary: Get collection summary statistics (Admin only) + x-test-cases: + - name: "Get summary as admin" + auth: "admin_token" + expected_status: 200 + expected_body: + status: "ok" + + - name: "Get summary as member (forbidden)" + auth: "member_token" + expected_status: 403 + + - name: "Get summary without auth" + expected_status: 401 + + # ── 404 
Catch-all ─────────────────────────────────────────────────────── + + /api/v1/nonexistent: + get: + operationId: notFound + summary: Unknown endpoint returns 404 + x-test-cases: + - name: "Unknown endpoint returns 404" + expected_status: 404 diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/scenario.yaml b/scripts/aidlc-evaluator/test_cases/all-stages/scenario.yaml new file mode 100644 index 00000000..9ef5e940 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/scenario.yaml @@ -0,0 +1,8 @@ +name: all-stages +description: "Full AIDLC pipeline — forces every conditional stage" +vision: vision.md +tech_env: tech-env.md +openapi: openapi.yaml +golden_baseline: golden.yaml +golden_aidlc_docs: golden-aidlc-docs/ +tags: [standard, comprehensive] diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/tech-env.md b/scripts/aidlc-evaluator/test_cases/all-stages/tech-env.md new file mode 100644 index 00000000..9adb4d97 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/tech-env.md @@ -0,0 +1,213 @@ +# Technical Environment: BookShelf Community Library API + +## Project Technical Summary + +- **Project Name**: BookShelf +- **Project Type**: Greenfield +- **Primary Runtime Environment**: Cloud (AWS) +- **Cloud Provider**: AWS +- **Package Manager**: uv +- **Team Size**: 1 (solo developer) +- **Team Experience**: Strong Python backend experience. Familiar with FastAPI and pytest. Moderate AWS experience. Limited CDK experience. + +--- + +## Programming Languages + +### Required Languages + +| Language | Version | Purpose | Rationale | +|----------|---------|---------|-----------| +| Python | 3.13+ | API services, business logic, infrastructure as code | Primary language. Rich ecosystem. Fast development. 
| + +### Permitted Languages + +| Language | Conditions for Use | +|----------|-------------------| +| TypeScript | Approved for CDK infrastructure only, if developer prefers CDK in TypeScript over Python CDK | + +### Prohibited Languages + +| Language | Reason | +|----------|--------| +| Java | Excessive for this project scope | +| Go | No team expertise | +| Ruby | No team expertise | + +--- + +## Frameworks and Libraries + +### Required Frameworks + +| Framework/Library | Version | Domain | Rationale | +|-------------------|---------|--------|-----------| +| FastAPI | 0.115+ | REST API framework | Automatic validation, OpenAPI generation, async support | +| Pydantic | 2.x | Request/response models | Type-safe data validation, JSON serialization | +| uvicorn | 0.34+ | ASGI server (local dev) | Standard FastAPI server | +| pytest | 8.x | Unit testing | Test runner | +| pytest-asyncio | 0.24+ | Async test support | FastAPI test client requires async | +| httpx | 0.28+ | Test HTTP client | Async test client for FastAPI | +| pytest-cov | 6.x | Coverage reporting | Enforce coverage minimum | +| ruff | 0.9+ | Linting and formatting | Single tool for lint + format | +| AWS CDK | 2.x | Infrastructure as Code | All infrastructure defined in code | + +### Prohibited Libraries + +| Library | Reason | Use Instead | +|---------|--------|-------------| +| Flask, Django | Project uses FastAPI | FastAPI | +| requests | Blocks async event loop | httpx | +| pandas, numpy | Not needed for this project | Standard Python | +| pip, poetry, pipenv | Project uses uv exclusively | uv | +| black, flake8, isort | Replaced by ruff | ruff | + +--- + +## Cloud Environment + +### Cloud Provider + +- **Primary Provider**: AWS +- **Region**: us-east-1 + +### Approved Service Categories + +The following categories of AWS services are approved for use. 
The specific +service choices within each category should be determined during the NFR +Requirements and Infrastructure Design stages based on the system's actual +performance, scalability, and cost requirements. + +| Category | Guidance | +|----------|----------| +| Compute | Serverless preferred (Lambda) but containers (ECS Fargate) acceptable if cold-start latency or execution duration is a concern. Decision should be justified during Infrastructure Design. | +| API Layer | API Gateway (HTTP API preferred over REST API for cost and simplicity) | +| Data Storage | Choose based on access patterns determined during Functional Design. Options: DynamoDB (key-value/document), RDS PostgreSQL (relational queries). Each service must own its data. | +| Messaging | Asynchronous inter-service communication required. Evaluate SQS, SNS, or EventBridge during Infrastructure Design based on messaging patterns. | +| Authentication | AWS Cognito or application-level JWT — decide during NFR Requirements | +| Monitoring | CloudWatch for logs, metrics, and alarms | +| Secrets | AWS Secrets Manager for sensitive configuration | +| IaC | AWS CDK (Python preferred) | + +### Service Disallow List + +| Service | Reason | +|---------|--------| +| Amazon EC2 (direct) | Prefer managed compute (Lambda or Fargate) | +| AWS Elastic Beanstalk | Does not fit IaC workflow | + +--- + +## API Design Standards + +- **Style**: REST with JSON +- **Versioning**: URL path prefix (`/api/v1/`) +- **Naming Convention**: snake_case for JSON fields + +### Response Envelope + +**Success:** +```json +{ "status": "ok", "data": { ... 
} } +``` + +**Error:** +```json +{ "status": "error", "error": { "code": "ERROR_CODE", "message": "Human-readable message" } } +``` + +| Error Code | HTTP Status | Meaning | +|---|---|---| +| `VALIDATION_ERROR` | 422 | Request body fails Pydantic validation | +| `NOT_FOUND` | 404 | Resource does not exist | +| `UNAUTHORIZED` | 401 | Missing or invalid JWT token | +| `FORBIDDEN` | 403 | Valid token but insufficient role | +| `CONFLICT` | 409 | Business rule violation (e.g., checkout limit exceeded, duplicate hold) | +| `INTERNAL_ERROR` | 500 | Unexpected server error | + +--- + +## Security Requirements + +### Authentication and Authorization + +- **Authorization Model**: Role-based access control (RBAC) with three roles: + - **Admin**: Full access to all endpoints + - **Librarian**: Catalog management, lending operations, reports + - **Member**: Self-service borrowing (own checkouts, holds, fees) +- **Public Endpoints**: Registration, login, health check +- **Authentication mechanism**: To be determined during NFR Requirements stage. Options include Cognito user pools or application-level JWT with PyJWT + passlib/bcrypt. + +### Data Protection + +- **Encryption at Rest**: Required for all data stores +- **Encryption in Transit**: TLS 1.2+ required for all communications +- **Password Storage**: Must use adaptive hashing (bcrypt or argon2). Never store plaintext. +- **PII**: Member email and name are PII. Must not appear in log output. 
+ +### Input Validation + +- **All inputs validated by Pydantic models** before reaching business logic +- **String length limits**: enforce reasonable maximums on all string fields +- **Numeric bounds**: enforce non-negative constraints on counts and amounts + +### Secrets Management + +- **No secrets in source code or environment variables** +- **Use AWS Secrets Manager** for signing keys and sensitive config + +--- + +## Testing Requirements + +### Test Strategy Overview + +| Test Type | Required | Coverage Target | Tooling | +|-----------|----------|----------------|---------| +| Unit Tests | Yes | 90% line coverage minimum | pytest | +| Integration Tests | Yes | All API endpoints per service | pytest + httpx AsyncClient | +| Contract Tests | Yes | All endpoints in openapi.yaml | contracttest runner | +| Load Tests | Recommended | Validate performance targets | k6 | + +### Unit Testing Standards + +- **Coverage Minimum**: 90% line coverage +- **Mocking Policy**: Mock external dependencies (databases, other services). Do not mock business logic. +- **Naming Convention**: `test_{module}_{scenario}` (e.g., `test_checkout_exceeds_limit`) +- **Test Location**: `tests/` directory at project root + +### Integration Testing Standards + +- **Scope**: Test all API endpoints via httpx AsyncClient +- **Data Management**: Fresh isolated data store per test function +- **Auth Testing**: Test both authorized and unauthorized access for each endpoint + +--- + +## Non-Functional Requirements + +These are business-level targets. The specific technical patterns and +infrastructure choices to meet these targets should be determined during the +NFR Requirements, NFR Design, and Infrastructure Design stages. 
+ +| Requirement | Target | Notes | +|---|---|---| +| Response latency (p95) | < 100ms | Applies to both services under normal load | +| Concurrent users | 100 simultaneous | Both services must handle this without degradation | +| API uptime | 99.9% | Requires redundancy and health monitoring | +| Test coverage | >= 90% line coverage | Per service | +| Catalog search latency | < 200ms for full-text search | May require specific data store indexing strategy | +| Inter-service event processing | < 5 seconds end-to-end | From book return to hold status update | +| Cold start tolerance | Acceptable if < 3 seconds | Influences compute choice (Lambda vs containers) | +| Data isolation | Each service owns its data store | No shared databases between services | +| Python version | 3.13.x | Enforced via `requires-python = ">=3.13"` | + +--- + +## Development Workflow + +```bash +uv sync +uv run pytest +uv run ruff check . && uv run ruff format . +``` diff --git a/scripts/aidlc-evaluator/test_cases/all-stages/vision.md b/scripts/aidlc-evaluator/test_cases/all-stages/vision.md new file mode 100644 index 00000000..4361b3ad --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/all-stages/vision.md @@ -0,0 +1,292 @@ +# Vision Document: BookShelf Community Library API + +## Executive Summary + +BookShelf is a cloud-deployed platform consisting of two independently deployable services — a Catalog Service and a Lending Service — that together enable community libraries to manage their book inventory and member lending operations. It replaces the spreadsheet-and-email workflow that small libraries use today with a structured system that tracks who has what, enforces lending policies automatically, and processes hold fulfillment asynchronously when books are returned. The expected outcome is a deployable system that a library with 10,000 books and 2,000 members can run on AWS. 
+ +--- + +## Business Context + +### Problem Statement + +Small community libraries (neighborhood lending libraries, office book-shares, school libraries without integrated library systems) manage lending with spreadsheets, honor systems, or sticky notes. This causes: + +- **Lost books**: No reliable record of who borrowed what. Libraries lose 15-20% of inventory annually. +- **Unfair access**: Popular books sit on one person's shelf for months. There is no hold or waitlist system. +- **No accountability**: Late returns go untracked. Members who abuse the system face no consequences. +- **Manual overhead**: A volunteer librarian spends 5-10 hours/week on email reminders, manual tracking, and reconciliation. + +### Business Drivers + +- Community libraries are growing (Little Free Library alone has 175,000+ registered locations). +- Existing integrated library systems (Koha, Evergreen) are built for municipal libraries and are far too complex and expensive for a 500-book neighborhood collection. +- The API-first approach allows any frontend (mobile app, Slack bot, web portal) to integrate without coupling to a specific UI. + +### Target Users and Stakeholders + +| User Type | Description | Primary Need | +|-----------|-------------|--------------| +| Librarian | Volunteer who manages the physical collection. Adds books, processes returns, resolves disputes. | Efficient tools to manage the catalog and see who has overdue books at a glance. | +| Member | Person who borrows and returns books. Browses the catalog, places holds, checks their account. | Easy way to find available books, check out, and know when holds become available. | +| Admin | Person responsible for the library's operations. Manages member accounts, configures lending policies, views reports. | Oversight of the entire system: usage reports, policy configuration, member management. | + +### Business Constraints + +- Budget: Open-source project. 
Infrastructure cost must stay under $50/month for a typical deployment. +- Team: Solo developer building the MVP. Must be maintainable by one person. +- Timeline: MVP within 3 months. +- No UI: API only. Frontend is out of scope. + +### Success Metrics + +| Metric | Target | Measurement Method | +|--------|--------|-------------------| +| API uptime | 99.9% | CloudWatch availability monitoring | +| Response time (p95) | < 100ms for all endpoints | CloudWatch latency metrics | +| Concurrent users supported | 100 simultaneous | Load test with k6 | +| Book loss rate (user-reported) | < 5% annually | Overdue tracking reports | +| Late return rate | < 20% of checkouts | Checkout duration analysis | + +--- + +## Full Scope Vision + +### Product Vision Statement + +BookShelf becomes the standard backend for any small-to-medium lending library, the way WordPress became the standard for small websites: simple to deploy, easy to extend, and free to use. + +### Feature Areas + +#### Feature Area 1: Catalog Management + +- **Description**: An independently deployable Catalog Service responsible for the book inventory. This service owns all book data and exposes search capabilities. The Lending Service queries the Catalog Service to verify book existence and availability before processing checkouts. +- **Key Capabilities**: + - Add, update, and remove books with title, author, ISBN, category, and copy count + - Full-text search across title and author + - Category-based browsing and filtering + - Multi-copy tracking (a library may own 3 copies of the same title) + - Book condition tracking and decommissioning + - Barcode/ISBN scanning integration + - Availability tracking API consumed by the Lending Service +- **User Value**: Librarians manage inventory from any device. Members find books without walking to the shelf. +- **Scaling Profile**: Read-heavy (search and browse are the most frequent operations). Must support fast search responses under concurrent load. 
+ +#### Feature Area 2: Lending and Circulation + +- **Description**: A separate Lending Service handling checkout, return, renewal, and hold management with policy enforcement. The Lending Service operates independently from the Catalog Service and communicates via asynchronous events. +- **Key Capabilities**: + - Checkout with automatic due-date calculation + - Return processing with condition check + - Renewal (up to N times, configurable) + - Hold queue with FIFO ordering and automatic notification when a book is returned + - Asynchronous hold fulfillment: when a book is returned, the Lending Service publishes a "book returned" event; a hold processor consumes this event and notifies the next member in the queue + - Lending policy enforcement (max active checkouts, max active holds) + - Late fee calculation and payment tracking + - Overdue notifications (email, webhook) + - Lending history per member +- **User Value**: Members borrow and return without librarian involvement for routine transactions. Policies are enforced automatically. +- **Scaling Profile**: Write-heavy (every checkout/return/renewal is a write). Must handle bursts during library open hours. + +#### Feature Area 3: Member and Access Management + +- **Description**: Member accounts, roles, and authentication. +- **Key Capabilities**: + - Member registration and profile management + - Role-based access control (Admin, Librarian, Member) + - Authentication via email/password with JWT tokens + - Account suspension for policy violations + - Member lending history and current account status +- **User Value**: Each person has their own account with appropriate permissions. + +#### Feature Area 4: Reporting and Analytics + +- **Description**: Operational reports for library administrators. 
+- **Key Capabilities**: + - Most borrowed books report + - Overdue books report + - Member activity report + - Collection utilization (what percentage of books are currently checked out) + - Fee collection summary +- **User Value**: Admins understand how the library is being used and where problems exist. + +#### Feature Area 5: Notifications + +- **Description**: Automated notifications for lending events. +- **Key Capabilities**: + - Due date reminders (3 days before, day of, 1 day after) + - Hold available notification + - Account suspension notification + - Overdue escalation notifications +- **User Value**: Members never forget a due date. Hold availability is communicated instantly. + +### Inter-Service Communication + +- **Catalog Service → Lending Service**: The Lending Service calls the Catalog Service's internal API to verify book existence and available copies before checkout. +- **Lending Service → Catalog Service**: On checkout, the Lending Service publishes a "copies decremented" event. On return, it publishes a "copies incremented" event. The Catalog Service consumes these to update availability counts. +- **Lending Service → Hold Processor**: On return, the Lending Service publishes a "book returned" event. An asynchronous hold processor checks the hold queue and notifies the next member. +- **Event bus**: Services communicate asynchronously via a message queue for eventual consistency. + +### External Integration Points + +- **Email service** (SES or SNS) - Notification delivery +- **Authentication provider** - Member authentication and JWT issuance +- **Monitoring** - Operational metrics, structured logging, and alerting per service + +### Scalability and Growth + +- The two-service architecture allows each service to scale independently: Catalog Service scales for read throughput, Lending Service scales for write throughput. +- Start with single-library deployment. Expand to multi-tenant (one API serving many libraries) in Phase 2. 
+- Support up to 100,000 books and 20,000 members per tenant in Phase 2. + +### Long-Term Roadmap + +| Phase | Focus | Timeframe | +|-------|-------|-----------| +| MVP | Catalog CRUD, lending with policy enforcement, member management with roles, basic reports, hold queue, late fees | Months 1-3 | +| Phase 2 | Notifications (email), multi-tenant support, barcode scanning API, advanced analytics | Months 4-8 | +| Phase 3 | Recommendation engine, inter-library loan, mobile push notifications | Months 9-14 | + +--- + +## MVP Scope + +### MVP Objective + +Deliver two independently deployable services (Catalog Service and Lending Service) that a community library can deploy to manage its catalog and lending operations, replacing manual tracking with automated policy enforcement and asynchronous hold fulfillment. + +### MVP Success Criteria + +- [ ] All MVP endpoints implemented and tested +- [ ] 90%+ line coverage on unit tests +- [ ] All contract tests pass against the OpenAPI specification +- [ ] Lending policy rules (checkout limits, hold limits, late fees, renewals) enforced correctly +- [ ] Role-based access control working for all three roles +- [ ] API responds within 100ms (p95) under load test +- [ ] Deployable to AWS with infrastructure-as-code + +### Features In Scope (MVP) + +| Feature | Description | Priority | Rationale for Inclusion | +|---------|-------------|----------|------------------------| +| Book CRUD | Add, update, get, list, delete books with title, author, ISBN, category, total_copies | Must Have | Core catalog functionality. Cannot lend books without a catalog. | +| Book search | Search books by title or author substring, filter by category and availability | Must Have | Members need to find books. Librarians need to look up specific titles. | +| Book availability | Track available_copies vs total_copies. Decrement on checkout, increment on return. | Must Have | Prevents double-lending of the same physical copy. 
| +| Member registration | Register with name, email, password. Auto-assigned Member role. | Must Have | Users need accounts to borrow books. | +| Member profile | Get and update own profile. Admins can get any member profile. | Must Have | Basic account management. | +| Role-based access | Three roles: Admin (full access), Librarian (catalog + lending management), Member (self-service borrowing) | Must Have | Different users have different permissions. Core security requirement. | +| JWT authentication | Login returns JWT. All protected endpoints require valid JWT. Token expiry at 24 hours. | Must Have | Stateless auth for API. Required for role enforcement. | +| Checkout | Member checks out a book. System validates: book exists, copies available, member under checkout limit (max 5). Records due date (14 days from checkout). | Must Have | Core lending operation. | +| Return | Member or librarian returns a book. System calculates late fee if overdue ($0.25/day, capped at $10.00). Updates available copies. | Must Have | Core lending operation. | +| Renewal | Member renews an active checkout. Extends due date by 14 days. Max 2 renewals per checkout. Cannot renew if book has active holds. | Must Have | Common member need. Reduces overdue returns. | +| Active checkouts | List active checkouts for a member. Include book details, checkout date, due date, renewal count. | Must Have | Members need to see what they have checked out. | +| Hold placement | Member places a hold on an unavailable book. System validates: book exists, no available copies, member under hold limit (max 3), member does not already hold this book. Hold queue is FIFO. | Must Have | Fair access to popular books. | +| Hold cancellation | Member cancels own hold. Librarian or Admin can cancel any hold. | Must Have | Members change their minds. Librarians manage the queue. | +| Hold queue status | Get position in hold queue for a specific book. | Must Have | Members want to know how long they will wait. 
| +| Fee tracking | Track outstanding fees per member. Fee generated automatically on late return. | Must Have | Accountability for late returns. | +| Fee payment | Record fee payment (partial or full). Admins and librarians can process payments. | Must Have | Members need a way to clear their balance. | +| Overdue report | List all currently overdue checkouts with member info and days overdue. Librarian and Admin access. | Must Have | Librarians need to follow up on overdue books. | +| Collection summary | Total books, total members, books checked out, books available, total outstanding fees. Admin access. | Must Have | Admins need a dashboard view of library status. | +| Health check | GET /health returns status and version. | Must Have | Operational monitoring. | + +### Features Explicitly Out of Scope (MVP) + +| Feature | Reason for Deferral | Target Phase | +|---------|-------------------|--------------| +| Email notifications | Adds SES/SNS dependency and async processing complexity. Manual check of overdue report is sufficient for MVP. | Phase 2 | +| Multi-tenant support | Single-library deployment is sufficient to validate the product. Multi-tenancy adds significant data isolation complexity. | Phase 2 | +| Barcode/ISBN scanning | Requires external ISBN lookup API integration. Manual entry is acceptable for MVP. | Phase 2 | +| Advanced analytics | Basic overdue report and collection summary cover MVP needs. | Phase 2 | +| Recommendation engine | Requires usage history analysis. Not needed until library has enough data. | Phase 3 | +| Inter-library loan | Multi-tenant prerequisite. | Phase 3 | +| Account suspension | Manual process via Admin is sufficient for MVP. Automated suspension adds business rule complexity. | Phase 2 | +| Password reset | Out of scope for MVP. Admin can create new accounts. | Phase 2 | +| Pagination | All list endpoints return full results in MVP. Acceptable for libraries under 10,000 books. 
| Phase 2 | + +### MVP User Journeys + +#### Journey 1: Librarian Adds Books and Manages the Catalog + +1. Librarian authenticates with email/password, receives JWT token. +2. Librarian adds 5 new books via POST /api/v1/books with title, author, ISBN, category, and copy count. +3. Librarian searches for a book by title to verify it was added correctly. +4. Librarian updates a book's copy count when a donated copy arrives. +5. Librarian views the collection summary to see total inventory. + +**Outcome**: Library catalog is up to date and searchable. + +#### Journey 2: Member Borrows, Renews, and Returns a Book + +1. Member registers via POST /api/v1/members/register, then logs in. +2. Member searches for "Python" and finds 3 matching books. +3. Member checks out "Fluent Python" via POST /api/v1/checkouts. +4. After 10 days, member renews the checkout for another 14 days. +5. Member returns the book on time. No fee is charged. + +**Outcome**: Member borrows and returns a book through the full lifecycle. + +#### Journey 3: Member Places a Hold on a Popular Book + +1. Member searches for "Designing Data-Intensive Applications" and sees 0 available copies. +2. Member places a hold via POST /api/v1/holds. +3. Member checks hold queue position: 2nd in line. +4. Another member returns the book. The first hold in queue is fulfilled. +5. When the first-in-line member returns, our member's hold advances to position 1. +6. The book becomes available for our member to check out. + +**Outcome**: Fair, automated access to popular books. + +#### Journey 4: Late Return with Fee + +1. Member checks out a book and does not return it by the due date. +2. Librarian views the overdue report and sees the member's checkout is 5 days late. +3. Member returns the book. System calculates a $1.25 late fee (5 days x $0.25). +4. Member views their outstanding fees. +5. Librarian processes a fee payment. + +**Outcome**: Late fees are calculated automatically and tracked until paid. 
+ +### MVP Constraints and Assumptions + +- **Assumption**: Each service owns its own data store. The Catalog Service and Lending Service do not share a database. +- **Assumption**: 24-hour JWT expiry without refresh tokens is acceptable for MVP. +- **Assumption**: Asynchronous hold fulfillment via message queue is required in MVP — a book return must trigger hold notification without blocking the return response. +- **Accepted Limitation**: No email notifications. Hold notification means updating the hold record status, not sending email. +- **Accepted Limitation**: No pagination. List endpoints return all results. +- **Accepted Limitation**: Single-library deployment only. + +### MVP Definition of Done + +- [ ] All MVP endpoints implemented and passing contract tests +- [ ] Unit tests with 90%+ line coverage +- [ ] Business rules verified: checkout limits, hold limits, late fees, renewal limits +- [ ] Role-based access enforced on all protected endpoints +- [ ] Load test: 100 concurrent users, p95 < 100ms +- [ ] Both services deployable to AWS with infrastructure-as-code +- [ ] Inter-service communication working (event-driven hold fulfillment) +- [ ] OpenAPI 3.x specification matches implementation + +--- + +## Risks and Dependencies + +### Key Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| Business rules have edge cases not covered in vision (e.g., what happens when a held book is decommissioned?) | Medium | Medium | Define edge cases during Requirements Analysis and Functional Design. Use clarifying questions. | +| DynamoDB access patterns don't fit relational lending queries (e.g., "all overdue checkouts sorted by days overdue") | Medium | High | Design DynamoDB table and GSI structure carefully during Infrastructure Design. Consider single-table design. | +| JWT auth adds complexity for a solo developer | Low | Medium | Use well-tested library (python-jose or PyJWT). 
Keep auth simple: no refresh tokens, no OAuth flows. |
+| Late fee calculation edge cases (timezone, partial days, fee cap) | Medium | Medium | Define precise rules in Functional Design. Use UTC for all timestamps. |
+
+### External Dependencies
+
+- **AWS Account** - Required for deployment. Developer must have account access.
+- **Python 3.13** - Required runtime. Available on AWS Lambda.
+
+### Open Questions
+
+- [ ] Should late fees continue accruing after a configurable maximum number of days, or should they cap at the book's replacement value?
+- [ ] When a member with outstanding fees tries to check out a book, should the system block the checkout or just warn?
+- [ ] Should the hold queue notify the next member immediately when a book is returned, or should there be a grace period for the returner to re-check-out?
+- [ ] What happens to a member's active holds and checkouts if an Admin deactivates their account?
diff --git a/scripts/aidlc-evaluator/test_cases/instructions.md b/scripts/aidlc-evaluator/test_cases/instructions.md
new file mode 100644
index 00000000..f0a1ba64
--- /dev/null
+++ b/scripts/aidlc-evaluator/test_cases/instructions.md
@@ -0,0 +1 @@
+Create a subfolder containing your tests, the full AIDLC docs folder, and any code output.
\ No newline at end of file diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/aidlc-state.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/aidlc-state.md new file mode 100644 index 00000000..fbd65ec9 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/aidlc-state.md @@ -0,0 +1,45 @@ +# AI-DLC State Tracking + +## Project Information +- **Project Type**: Greenfield +- **Start Date**: 2025-01-27T12:00:00Z +- **Current Stage**: CONSTRUCTION — Build and Test ✅ COMPLETE + +## Workspace State +- **Existing Code**: Yes — full application generated and tested +- **Workspace Root**: workspace/ + +## Code Location Rules +- **Application Code**: workspace/ (NEVER in aidlc-docs/) +- **Documentation**: aidlc-docs/ only + +## Stage Progress + +### INCEPTION PHASE ✅ COMPLETE +- [x] Workspace Detection — COMPLETED (Greenfield) +- [x] Requirements Analysis — COMPLETED (10 FR groups, 5 NFR groups) +- [x] User Stories — SKIPPED (single-purpose API) +- [x] Workflow Planning — COMPLETED (3 stages planned) +- [x] Application Design — COMPLETED (4 artifacts generated) +- [x] Units Generation — SKIPPED (single unit) + +### CONSTRUCTION PHASE ✅ COMPLETE +- [x] Functional Design — SKIPPED (math logic fully specified) +- [x] NFR Requirements — SKIPPED (fully specified in tech-env) +- [x] NFR Design — SKIPPED +- [x] Infrastructure Design — SKIPPED (no cloud infra) +- [x] Code Generation — COMPLETED (20+ source files, 7 test files) +- [x] Build and Test — ✅ COMPLETED (192/192 tests PASS, 0 failures) + +## Current Status +- **Lifecycle Phase**: CONSTRUCTION (COMPLETE) +- **Current Stage**: Build and Test — ✅ DONE +- **Next Stage**: Operations (PLACEHOLDER — outside workflow scope) +- **Status**: ✅ ALL AIDLC STAGES COMPLETE. Workflow finished. 
+ +## Test Results +- **Engine Unit Tests**: 129/129 PASS +- **API Integration Tests**: 63/63 PASS +- **Total**: 192/192 PASS +- **Bugs Fixed**: 1 (NaN validator string handling) +- **Execution Time**: 0.35s diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/audit.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/audit.md new file mode 100644 index 00000000..a27e5c67 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/audit.md @@ -0,0 +1,118 @@ +# AI-DLC Audit Log + +## Workflow Start +**Timestamp**: 2025-01-27T12:00:00Z +**User Input**: "Begin the AIDLC workflow and execute it TO COMPLETION through ALL phases..." +**AI Response**: Loaded core-workflow, process-overview, vision.md, and tech-env.md. Beginning AIDLC workflow. +**Context**: Workflow initialization + +--- + +## INCEPTION - Workspace Detection +**Timestamp**: 2025-01-27T12:00:01Z +**User Input**: N/A (automated stage) +**AI Response**: Scanned workspace/ directory — empty. Classified as Greenfield project. Proceeding to Requirements Analysis. +**Context**: Stage complete. + +--- + +## INCEPTION - Requirements Analysis — Clarifying Questions +**Timestamp**: 2025-01-27T12:01:00Z +**User Input**: N/A +**AI Response**: Created 6 clarifying questions. Handed off to simulator. +**Context**: Requirements Analysis Step 6 + +--- + +## INCEPTION - Requirements Analysis — Answers & Document +**Timestamp**: 2025-01-27T12:02:00Z +**User Input**: "Q1: A, Q2: A, Q3: A, Q4: A, Q5: A, Q6: B" +**AI Response**: Generated requirements.md with 10 FR groups, 5 NFR groups. +**Context**: Requirements Analysis Step 7 + +--- + +## INCEPTION - Requirements Analysis — Approval +**Timestamp**: 2025-01-27T12:02:30Z +**User Input**: "Approve & Continue" +**AI Response**: Requirements approved. Proceeding to Workflow Planning. +**Context**: Requirements Analysis complete. 
+ +--- + +## INCEPTION - Workflow Planning — Approval +**Timestamp**: 2025-01-27T12:03:30Z +**User Input**: "Approve & Continue" +**AI Response**: Plan approved. 3 stages to execute (App Design, Code Gen, Build & Test). +**Context**: Workflow Planning complete. + +--- + +## INCEPTION - Application Design — Artifacts Generated +**Timestamp**: 2025-01-27T12:04:00Z +**User Input**: N/A +**AI Response**: Created components.md, component-methods.md, services.md, component-dependency.md. +**Context**: Application Design complete. + +--- + +## INCEPTION - Application Design — Approval +**Timestamp**: 2025-01-27T12:04:30Z +**User Input**: "Approve & Continue" +**AI Response**: Design approved. Proceeding to Code Generation. +**Context**: Application Design complete. + +--- + +## CONSTRUCTION - Code Generation Part 1 — Plan Approval +**Timestamp**: 2025-01-27T12:05:00Z +**User Input**: "Approve & Continue" +**AI Response**: Code generation plan approved. 21 steps, ~20 source files, 7 test files. +**Context**: Code Generation planning complete. + +--- + +## CONSTRUCTION - Code Generation Part 2 — Code Generated +**Timestamp**: 2025-01-27T12:06:00Z +**User Input**: N/A +**AI Response**: Generated all source code (13 source files), test files (7 + conftest), pyproject.toml. +**Context**: Code Generation execution complete. + +--- + +## CONSTRUCTION - Code Generation — Code Review Approved +**Timestamp**: 2025-01-27T12:06:30Z +**User Input**: "Code review COMPLETE and APPROVED" +**AI Response**: Code approved. 42+ operations, all endpoints, error handling, clean architecture verified. +**Context**: Code Generation complete. Proceeding to Build & Test. + +--- + +## CONSTRUCTION - Build and Test — Full Execution +**Timestamp**: 2025-01-27T12:07:00Z +**User Input**: N/A (automated) +**AI Response**: +1. **Dependency Install**: Network unavailable (offline environment). System Python has all runtime deps pre-installed. +2. 
**PYTHONPATH Fix**: Set `PYTHONPATH=src` to override stale system site-packages sci-calc install. +3. **asyncio Issue**: `_overlapped` DLL broken on Windows (WinError 10106). Starlette TestClient cannot start. +4. **Workaround**: Built custom `SyncTestClient` in conftest.py that drives async handlers synchronously without asyncio. +5. **Bug Found**: NaN validator `_reject_nan()` didn't catch string "NaN" in `mode="before"` (only checked `isinstance(v, float)`). +6. **Bug Fixed**: Added string "NaN" detection to `_reject_nan()` in `requests.py`. +7. **Final Run**: 192/192 tests PASS (129 engine + 63 API integration) in 0.35s. +**Files Generated**: +- aidlc-docs/construction/build-and-test/build-instructions.md +- aidlc-docs/construction/build-and-test/unit-test-instructions.md +- aidlc-docs/construction/build-and-test/integration-test-instructions.md +- aidlc-docs/construction/build-and-test/build-and-test-summary.md +**Context**: Build and Test stage complete. All tests pass. + +--- + +## AIDLC Workflow Complete +**Timestamp**: 2025-01-27T12:08:00Z +**Build Status**: ✅ SUCCESS +**Test Status**: ✅ 192/192 PASS +**AI Response**: All AIDLC stages complete. INCEPTION and CONSTRUCTION phases finished. All application code generated, tested, and verified in workspace/. +**Context**: AIDLC workflow COMPLETE. 
+ +--- diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/build-and-test-summary.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/build-and-test-summary.md new file mode 100644 index 00000000..0d4d6599 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/build-and-test-summary.md @@ -0,0 +1,82 @@ +# Build and Test Summary + +## Build Status +- **Build Tool**: hatchling (PEP 517) +- **Build Status**: ✅ SUCCESS +- **Build Artifacts**: `src/sci_calc/` package (13 source files across 4 sub-packages) +- **Build Time**: < 1 second (pure Python, no compilation) + +## Test Execution Summary + +### Unit Tests (Engine Layer) +- **Total Tests**: 129 +- **Passed**: 129 +- **Failed**: 0 +- **Status**: ✅ PASS + +### API Integration Tests +- **Total Tests**: 63 +- **Passed**: 63 +- **Failed**: 0 +- **Status**: ✅ PASS + +### Combined Results +- **Total Tests**: 192 +- **Passed**: 192 +- **Failed**: 0 +- **Execution Time**: 0.35s +- **Status**: ✅ ALL PASS + +## Test Breakdown by Module + +| Module | Unit | Integration | Total | Status | +|---------------------|------|-------------|-------|--------| +| test_arithmetic.py | 20 | 12 | 32 | ✅ | +| test_constants.py | 12 | 4 | 16 | ✅ | +| test_conversions.py | 21 | 7 | 28 | ✅ | +| test_logarithmic.py | 17 | 9 | 26 | ✅ | +| test_powers.py | 16 | 9 | 25 | ✅ | +| test_statistics.py | 18 | 14 | 32 | ✅ | +| test_trigonometry.py| 25 | 8 | 33 | ✅ | +| **TOTAL** |**129**|**63** |**192**| ✅ | + +## Bug Found and Fixed During Testing +- **Issue**: NaN rejection validator in `requests.py` used `isinstance(v, float)` in + `mode="before"`, which didn't catch string `"NaN"` before Pydantic type coercion +- **Fix**: Added `isinstance(v, str) and v.strip().lower() == "nan"` check to `_reject_nan()` +- **Verified**: NaN rejection now works for both float NaN and string "NaN" inputs + +## Environment 
Notes +- **Platform**: Windows (Python 3.13.7) +- **asyncio Status**: Broken — `_overlapped` DLL fails to load (WinError 10106) +- **Workaround**: Custom synchronous test client (`SyncTestClient` in conftest.py) that + drives async FastAPI handlers without importing `asyncio` +- **Impact**: Zero — all 192 tests pass including 63 integration tests that exercise + the full HTTP request→response pipeline + +## Coverage +- **Target**: ≥90% (configured in pyproject.toml) +- **Note**: `pytest-cov` unavailable in offline environment; coverage measurement + deferred to CI pipeline. All 42+ math operations and all API endpoints are + explicitly tested with both happy-path and error-path test cases. + +## Additional Tests +- **Contract Tests**: N/A (single-service API) +- **Security Tests**: NaN input rejection verified; Pydantic validation tested +- **E2E Tests**: N/A (stateless API; integration tests cover full request cycle) +- **Performance Tests**: N/A (deferred to load testing stage) + +## Overall Status +- **Build**: ✅ SUCCESS +- **All Tests**: ✅ 192/192 PASS +- **Ready for Deployment**: YES +- **Code Quality**: Bug found and fixed during testing (NaN validator) + +## Artifacts +- `workspace/src/sci_calc/` — Application source code (13 files) +- `workspace/tests/` — Test suite (7 test files + conftest.py) +- `workspace/pyproject.toml` — Project configuration +- `aidlc-docs/construction/build-and-test/build-instructions.md` +- `aidlc-docs/construction/build-and-test/unit-test-instructions.md` +- `aidlc-docs/construction/build-and-test/integration-test-instructions.md` +- `aidlc-docs/construction/build-and-test/build-and-test-summary.md` (this file) diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/build-instructions.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/build-instructions.md new file mode 100644 index 00000000..fe7cf5a9 --- /dev/null +++ 
b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/build-instructions.md @@ -0,0 +1,66 @@ +# Build Instructions + +## Prerequisites +- **Build Tool**: `hatchling` (PEP 517 build backend) +- **Runtime**: Python 3.13+ +- **Package Manager**: `uv` (recommended) or `pip` +- **Dependencies**: + - `fastapi>=0.115.0` + - `uvicorn[standard]>=0.34.0` +- **Dev Dependencies**: + - `httpx>=0.28.0` + - `pytest>=8.3.0` + - `pytest-asyncio>=0.25.0` + - `pytest-cov>=6.0.0` + - `ruff>=0.9.0` + +## Build Steps + +### 1. Install Dependencies +```bash +# Using uv (recommended) +uv sync --all-extras + +# Or using pip +pip install -e ".[dev]" +``` + +### 2. Configure Environment +```bash +# Set PYTHONPATH if running from source without install +export PYTHONPATH=src # Linux/macOS +set PYTHONPATH=src # Windows +``` + +### 3. Build the Package +```bash +# Build wheel and sdist +uv build +# Or: python -m build +``` + +### 4. Verify Build Success +- **Expected Output**: `dist/sci_calc-0.1.0-py3-none-any.whl` +- **Package Structure**: `src/sci_calc/` with engine, models, routes sub-packages +- **Entry Point**: `sci_calc.app:app` (ASGI application) + +## Run the Server +```bash +# Development server +uvicorn sci_calc.app:app --reload --host 0.0.0.0 --port 8000 +``` + +## Troubleshooting + +### Build Fails with Dependency Errors +- **Cause**: Network unavailable or PyPI unreachable +- **Solution**: Use `--find-links` with a local package cache or pre-install deps + +### Import Errors (stale site-packages) +- **Cause**: Old version of sci-calc installed globally +- **Solution**: Set `PYTHONPATH=src` or use `pip install -e .` to overwrite + +### asyncio Broken on Windows (WinError 10106) +- **Cause**: Corrupted Windows Winsock provider (`_overlapped` DLL) +- **Solution**: Run `netsh winsock reset` as admin, or use WSL2 +- **Workaround**: Use `-p no:anyio -p no:asyncio` pytest flags and the sync test client diff --git 
a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/integration-test-instructions.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/integration-test-instructions.md new file mode 100644 index 00000000..90f6e18a --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/integration-test-instructions.md @@ -0,0 +1,54 @@ +# Integration Test Instructions + +## Purpose +Test the full request→route→engine→response pipeline across all 7 API domains. + +## Test Scenarios + +### Scenario 1: Arithmetic Operations +- **Endpoints**: POST `/api/v1/arithmetic/{add,subtract,multiply,divide,modulo,abs,negate}` +- **Tests**: 12 integration tests +- **Covers**: binary ops, unary ops, division-by-zero, invalid input, NaN rejection, 404 + +### Scenario 2: Powers & Roots +- **Endpoints**: POST `/api/v1/powers/{power,sqrt,cbrt,square,nth_root}` +- **Tests**: 9 integration tests +- **Covers**: all operations, domain errors, overflow, 404 + +### Scenario 3: Trigonometry +- **Endpoints**: POST `/api/v1/trigonometry/{sin,cos,tan,asin,acos,atan,atan2,sinh,cosh,tanh,asinh,acosh,atanh}` +- **Tests**: 8 integration tests +- **Covers**: radians/degrees, domain errors, hyperbolic functions, 404 + +### Scenario 4: Logarithmic +- **Endpoints**: POST `/api/v1/logarithmic/{ln,log10,log2,log,exp}` +- **Tests**: 9 integration tests +- **Covers**: all operations, domain errors, overflow, 404 + +### Scenario 5: Statistics +- **Endpoints**: POST `/api/v1/statistics/{mean,median,mode,stdev,variance,pstdev,pvariance,min,max,sum,count}` +- **Tests**: 14 integration tests +- **Covers**: all operations, domain errors, empty input validation, 404 + +### Scenario 6: Constants +- **Endpoints**: GET `/api/v1/constants/` and GET `/api/v1/constants/{name}` +- **Tests**: 4 integration tests +- **Covers**: single constant retrieval, listing all, unknown constant 404 + +### Scenario 7: 
Conversions & Health +- **Endpoints**: POST `/api/v1/conversions/{angle,temperature,length,weight}`, GET `/health` +- **Tests**: 7 integration tests +- **Covers**: all conversion types, unknown category, unknown unit, health check + +## Run Integration Tests +```bash +# All integration tests are in the same test files alongside unit tests +# They use the `client` fixture from conftest.py +set PYTHONPATH=src +python -m pytest tests/ -v -k "API" -p no:anyio -p no:asyncio +``` + +## Expected Results +- **Total Integration Tests**: 63 +- **All passing**: ✅ +- **Response format validated**: status, operation, inputs, result (or error) diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/unit-test-instructions.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/unit-test-instructions.md new file mode 100644 index 00000000..73f99ac9 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/build-and-test/unit-test-instructions.md @@ -0,0 +1,61 @@ +# Unit Test Execution + +## Run Unit Tests + +### 1. Execute All Tests (standard environment) +```bash +cd workspace +uv run pytest tests/ -v --cov=sci_calc --cov-report=term-missing --cov-fail-under=90 +``` + +### 2. Execute All Tests (Windows asyncio-broken environment) +```bash +cd workspace +set PYTHONPATH=src +python -m pytest tests/ -v -p no:anyio -p no:asyncio +``` + +### 3. 
Review Test Results + +#### Expected: 192 tests pass, 0 failures + +| Test Module | Engine Tests | API Tests | Total | +|---------------------|-------------|-----------|-------| +| test_arithmetic.py | 20 | 12 | 32 | +| test_constants.py | 12 | 4 | 16 | +| test_conversions.py | 21 | 7 | 28 | +| test_logarithmic.py | 17 | 9 | 26 | +| test_powers.py | 16 | 9 | 25 | +| test_statistics.py | 18 | 14 | 32 | +| test_trigonometry.py| 25 | 8 | 33 | +| **TOTAL** | **129** | **63** | **192** | + +- **Test Coverage Target**: ≥90% +- **Test Report Location**: stdout (via `--cov-report=term-missing`) + +### 4. Fix Failing Tests +If tests fail: +1. Review the verbose output showing which test failed and why +2. Check the error traceback +3. Fix code issues in `src/sci_calc/` or `tests/` +4. Rerun tests until all 192 pass + +## Test Architecture + +### Engine Unit Tests (129) +- Direct imports from `sci_calc.engine.math_engine` +- Test every math function with valid, edge-case, and error inputs +- Verify custom exceptions (`MathDomainError`, `MathDivisionByZeroError`, `MathOverflowError`) +- No HTTP or framework dependency + +### API Integration Tests (63) +- Use test client to call full HTTP endpoint paths +- Validate status codes, response structure, and error envelopes +- Test happy path, domain errors, validation errors, and 404s +- Exercise Pydantic model validation (NaN rejection, type coercion) + +### Test Client +- **Standard**: `starlette.testclient.TestClient` (requires working asyncio) +- **Fallback**: Custom `SyncTestClient` in `conftest.py` that drives async handlers + synchronously without importing `asyncio` — used in environments where + Windows `_overlapped` DLL is broken diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/plans/sci-calc-code-generation-plan.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/plans/sci-calc-code-generation-plan.md new file mode 100644 index 00000000..a3391da2 --- 
/dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/construction/plans/sci-calc-code-generation-plan.md @@ -0,0 +1,98 @@ +# Code Generation Plan — sci-calc + +## Unit Context +- **Unit**: sci-calc (single unit — entire application) +- **Project Type**: Greenfield, Python 3.13, FastAPI +- **Workspace Root**: workspace/ +- **Code Location**: workspace/ (per tech-env structure: pyproject.toml, src/sci_calc/, tests/) + +## Step Sequence + +### Step 1: Project Structure Setup +- [ ] Create `workspace/pyproject.toml` with hatchling build backend, all dependencies (fastapi, uvicorn, httpx, pytest, pytest-asyncio, pytest-cov, ruff), Python 3.13 requirement +- [ ] Create `workspace/src/sci_calc/__init__.py` with version +- [ ] Create `workspace/src/sci_calc/routes/__init__.py` +- [ ] Create `workspace/src/sci_calc/models/__init__.py` +- [ ] Create `workspace/src/sci_calc/engine/__init__.py` +- [ ] Create `workspace/tests/__init__.py` +- [ ] Create `workspace/tests/conftest.py` with async test client fixture + +### Step 2: Engine — Custom Exceptions +- [ ] Create `workspace/src/sci_calc/engine/math_engine.py` — define custom exceptions: `MathDomainError`, `MathDivisionByZeroError`, `MathOverflowError` + +### Step 3: Engine — Arithmetic Operations +- [ ] Add arithmetic functions to `math_engine.py`: `add`, `subtract`, `multiply`, `divide`, `modulo`, `absolute`, `negate` +- [ ] Implement overflow detection (result is inf/-inf → raise OverflowError) + +### Step 4: Engine — Powers and Roots +- [ ] Add power functions to `math_engine.py`: `power`, `sqrt_op`, `cbrt`, `square`, `nth_root` +- [ ] Implement domain validation (sqrt of negative, nth_root constraints) + +### Step 5: Engine — Trigonometry +- [ ] Add trig functions to `math_engine.py`: `sin_op`, `cos_op`, `tan_op`, `asin_op`, `acos_op`, `atan_op`, `atan2_op`, `sinh_op`, `cosh_op`, `tanh_op`, `asinh_op`, `acosh_op`, `atanh_op` +- [ ] Implement angle unit conversion (degrees ↔ radians) +- [ ] 
Implement domain validation for inverse trig functions + +### Step 6: Engine — Logarithmic Operations +- [ ] Add log functions to `math_engine.py`: `ln`, `log10_op`, `log2_op`, `log_op`, `exp_op` +- [ ] Implement domain validation (a <= 0, base constraints) + +### Step 7: Engine — Statistics +- [ ] Add statistics functions to `math_engine.py`: `mean_op`, `median_op`, `mode_op`, `stdev_op`, `variance_op`, `pstdev_op`, `pvariance_op`, `min_op`, `max_op`, `sum_op`, `count_op` +- [ ] Implement minimum element count validation + +### Step 8: Engine — Constants +- [ ] Add constants functions to `math_engine.py`: `get_constant`, `get_all_constants` +- [ ] Define constant map: pi, e, tau, inf, nan, golden_ratio, sqrt2, ln2, ln10 + +### Step 9: Engine — Unit Conversions +- [ ] Add conversion functions to `math_engine.py`: `convert_angle`, `convert_temperature`, `convert_length`, `convert_weight` +- [ ] Define conversion factor tables for all supported units + +### Step 10: Models — Request Models +- [ ] Create `workspace/src/sci_calc/models/requests.py` — all Pydantic v2 request models with NaN validation + +### Step 11: Models — Response Models +- [ ] Create `workspace/src/sci_calc/models/responses.py` — SuccessResponse, ErrorDetail, ErrorResponse models + +### Step 12: Routes — Arithmetic +- [ ] Create `workspace/src/sci_calc/routes/arithmetic.py` — APIRouter with POST endpoints for all arithmetic ops + +### Step 13: Routes — Powers +- [ ] Create `workspace/src/sci_calc/routes/powers.py` — APIRouter with POST endpoints for all power/root ops + +### Step 14: Routes — Trigonometry +- [ ] Create `workspace/src/sci_calc/routes/trigonometry.py` — APIRouter with POST endpoints for all trig ops + +### Step 15: Routes — Logarithmic +- [ ] Create `workspace/src/sci_calc/routes/logarithmic.py` — APIRouter with POST endpoints for all log ops + +### Step 16: Routes — Statistics +- [ ] Create `workspace/src/sci_calc/routes/statistics.py` — APIRouter with POST endpoints for all stats 
ops + +### Step 17: Routes — Constants +- [ ] Create `workspace/src/sci_calc/routes/constants.py` — APIRouter with GET endpoints for constants + +### Step 18: Routes — Conversions +- [ ] Create `workspace/src/sci_calc/routes/conversions.py` — APIRouter with POST endpoints for all conversions + +### Step 19: Application Entry Point +- [ ] Create `workspace/src/sci_calc/app.py` — FastAPI app, register all routers, custom error handlers (422 override, catch-all), health check endpoint + +### Step 20: Tests — Engine Unit Tests +- [ ] Create `workspace/tests/test_arithmetic.py` — unit tests for arithmetic engine functions + boundary tests +- [ ] Create `workspace/tests/test_powers.py` — unit tests for power engine functions + domain error tests +- [ ] Create `workspace/tests/test_trigonometry.py` — unit tests for trig engine functions + domain error tests +- [ ] Create `workspace/tests/test_logarithmic.py` — unit tests for log engine functions + domain error tests +- [ ] Create `workspace/tests/test_statistics.py` — unit tests for statistics engine functions + edge cases +- [ ] Create `workspace/tests/test_constants.py` — unit tests for constants + API integration tests +- [ ] Create `workspace/tests/test_conversions.py` — unit tests for conversion functions + API integration tests + +### Step 21: Tests — API Integration Tests +- [ ] Add integration tests within each test file using httpx.AsyncClient +- [ ] Test success responses match envelope structure +- [ ] Test error responses match error envelope structure +- [ ] Test 404 for unknown endpoints +- [ ] Test custom 422 handler wraps Pydantic errors + +## Total: 21 steps, ~20 source files, ~7 test files diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/component-dependency.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/component-dependency.md new file mode 100644 index 00000000..6958b88e --- /dev/null +++ 
b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/component-dependency.md @@ -0,0 +1,50 @@ +# Component Dependencies + +## Dependency Matrix + +| Component | Depends On | Depended On By | +|---|---|---| +| `app.py` | All routes, responses (models) | — (entry point) | +| `routes/arithmetic.py` | `models/requests.py`, `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `routes/powers.py` | `models/requests.py`, `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `routes/trigonometry.py` | `models/requests.py`, `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `routes/logarithmic.py` | `models/requests.py`, `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `routes/statistics.py` | `models/requests.py`, `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `routes/constants.py` | `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `routes/conversions.py` | `models/requests.py`, `models/responses.py`, `engine/math_engine.py` | `app.py` | +| `models/requests.py` | — (standalone Pydantic models) | All routes | +| `models/responses.py` | — (standalone Pydantic models) | All routes, `app.py` | +| `engine/math_engine.py` | Python `math` stdlib, `statistics` stdlib | All routes | + +## Data Flow + +``` +HTTP Request + | + v +app.py (FastAPI) --> route handler (validates via Pydantic model) + | | + | v + | math_engine.py (pure computation) + | | + | v + | result or exception + | | + v v +SuccessResponse or ErrorResponse (Pydantic model) + | + v +HTTP Response (JSON) +``` + +## Communication Pattern +- **Synchronous function calls** — no async engine calls needed (CPU-bound math is fast) +- **No message queues, events, or external services** +- **No database connections** +- **Exception-based error signaling** from engine to routes + +## Key Design Decisions +1. Engine is a **pure module** with standalone functions — no class instantiation, no state +2. 
Routes **directly import** engine functions — no dependency injection needed for this scope +3. Custom exceptions (`DomainError`, `DivisionByZeroError`) are defined in the engine module +4. The engine has **zero HTTP/framework dependencies** — testable in isolation diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/component-methods.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/component-methods.md new file mode 100644 index 00000000..b9c2bd07 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/component-methods.md @@ -0,0 +1,89 @@ +# Component Methods + +## 1. Engine — `math_engine.py` + +### Arithmetic +- `add(a: float, b: float) -> float` +- `subtract(a: float, b: float) -> float` +- `multiply(a: float, b: float) -> float` +- `divide(a: float, b: float) -> float` — raises DivisionByZeroError +- `modulo(a: float, b: float) -> float` — raises DivisionByZeroError +- `absolute(a: float) -> float` +- `negate(a: float) -> float` + +### Powers +- `power(base: float, exponent: float) -> float` — raises OverflowError +- `sqrt(a: float) -> float` — raises DomainError if a < 0 +- `cbrt(a: float) -> float` +- `square(a: float) -> float` +- `nth_root(a: float, n: int) -> float` — raises DomainError if a < 0 and n even + +### Trigonometry +- `sin(a: float, angle_unit: str) -> float` +- `cos(a: float, angle_unit: str) -> float` +- `tan(a: float, angle_unit: str) -> float` +- `asin(a: float, angle_unit: str) -> float` — raises DomainError if |a| > 1 +- `acos(a: float, angle_unit: str) -> float` — raises DomainError if |a| > 1 +- `atan(a: float, angle_unit: str) -> float` +- `atan2(y: float, x: float, angle_unit: str) -> float` +- `sinh(a: float) -> float` +- `cosh(a: float) -> float` +- `tanh(a: float) -> float` +- `asinh(a: float) -> float` +- `acosh(a: float) -> float` — raises DomainError if a < 1 +- `atanh(a: 
float) -> float` — raises DomainError if |a| >= 1 + +### Logarithmic +- `ln(a: float) -> float` — raises DomainError if a <= 0 +- `log10(a: float) -> float` — raises DomainError if a <= 0 +- `log2(a: float) -> float` — raises DomainError if a <= 0 +- `log(a: float, base: float) -> float` — raises DomainError +- `exp(a: float) -> float` — raises OverflowError + +### Statistics +- `mean(values: list[float]) -> float` +- `median(values: list[float]) -> float` +- `mode(values: list[float]) -> float` — returns smallest on ties +- `stdev(values: list[float]) -> float` — requires len >= 2 +- `variance(values: list[float]) -> float` — requires len >= 2 +- `pstdev(values: list[float]) -> float` +- `pvariance(values: list[float]) -> float` +- `min_val(values: list[float]) -> float` +- `max_val(values: list[float]) -> float` +- `sum_val(values: list[float]) -> float` +- `count(values: list[float]) -> int` + +### Constants +- `get_constant(name: str) -> float` +- `get_all_constants() -> dict[str, float]` + +### Conversions +- `convert_angle(value: float, from_unit: str, to_unit: str) -> float` +- `convert_temperature(value: float, from_unit: str, to_unit: str) -> float` +- `convert_length(value: float, from_unit: str, to_unit: str) -> float` +- `convert_weight(value: float, from_unit: str, to_unit: str) -> float` + +## 2. 
Models — `requests.py` + +### Request Models +- `BinaryOperationRequest(a: float, b: float)` — with NaN validator +- `UnaryOperationRequest(a: float)` — with NaN validator +- `PowerRequest(base: float, exponent: float)` — with NaN validator +- `NthRootRequest(a: float, n: int)` — with NaN validator +- `TrigRequest(a: float, angle_unit: str = "radians")` — with NaN validator +- `Atan2Request(y: float, x: float, angle_unit: str = "radians")` — with NaN validator +- `LogRequest(a: float, base: float)` — with NaN validator +- `StatisticsRequest(values: list[float])` — with NaN validator +- `ConversionRequest(value: float, from_unit: str, to_unit: str)` — with NaN validator + +## 3. Models — `responses.py` + +### Response Models +- `SuccessResponse(status: str, operation: str, inputs: dict, result: Any)` +- `ErrorDetail(code: str, message: str)` +- `ErrorResponse(status: str, operation: str, inputs: dict, error: ErrorDetail)` + +## 4. Routes — Each route module +- One `APIRouter` per module with URL prefix +- Route functions: parse request → call engine → return SuccessResponse or ErrorResponse +- Exception handlers catch engine exceptions and map to error codes diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/components.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/components.md new file mode 100644 index 00000000..6a11096d --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/components.md @@ -0,0 +1,64 @@ +# Components + +## 1. Application Entry Point — `sci_calc/app.py` +**Purpose**: FastAPI application factory, middleware, error handlers, router registration. 
+**Responsibilities**: +- Create FastAPI app instance with metadata (title, version) +- Register all route modules +- Override default validation error handler to use standard error envelope +- Register global exception handlers (catch-all for unexpected errors) +- Health check endpoint + +## 2. Routes Layer — `sci_calc/routes/` + +### 2.1 `arithmetic.py` +**Purpose**: Handle arithmetic operation requests. +**Responsibilities**: Parse input, delegate to engine, wrap results in success/error envelope. + +### 2.2 `powers.py` +**Purpose**: Handle power and root operation requests. +**Responsibilities**: Parse input, validate domain constraints, delegate to engine. + +### 2.3 `trigonometry.py` +**Purpose**: Handle trigonometric operation requests. +**Responsibilities**: Parse input, handle angle_unit conversion, delegate to engine. + +### 2.4 `logarithmic.py` +**Purpose**: Handle logarithmic operation requests. +**Responsibilities**: Parse input, validate domain constraints, delegate to engine. + +### 2.5 `statistics.py` +**Purpose**: Handle statistical operation requests. +**Responsibilities**: Parse input, validate list size constraints, delegate to engine. + +### 2.6 `constants.py` +**Purpose**: Serve mathematical constants. +**Responsibilities**: Return individual or all constants. + +### 2.7 `conversions.py` +**Purpose**: Handle unit conversion requests. +**Responsibilities**: Parse input, validate units, delegate to engine. + +## 3. Models Layer — `sci_calc/models/` + +### 3.1 `requests.py` +**Purpose**: Pydantic v2 request models for all operations. +**Responsibilities**: Input validation via Pydantic, type coercion, NaN rejection. + +### 3.2 `responses.py` +**Purpose**: Pydantic v2 response models (success and error envelopes). +**Responsibilities**: Define standard response structure, error codes enum. + +## 4. Engine Layer — `sci_calc/engine/` + +### 4.1 `math_engine.py` +**Purpose**: Pure computation logic — no HTTP/FastAPI dependencies. 
+**Responsibilities**: +- Arithmetic operations (add, subtract, multiply, divide, modulo, abs, negate) +- Power operations (power, sqrt, cbrt, square, nth_root) +- Trigonometric operations (all 14 trig functions with angle unit support) +- Logarithmic operations (ln, log10, log2, log, exp) +- Statistical operations (mean, median, mode, stdev, variance, etc.) +- Constants retrieval +- Unit conversions (angle, temperature, length, weight) +- Raise domain-specific exceptions (DomainError, DivisionByZeroError, OverflowError) diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/services.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/services.md new file mode 100644 index 00000000..57f54add --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/application-design/services.md @@ -0,0 +1,30 @@ +# Services + +## Service Layer + +The application uses a **thin service layer** pattern where `app.py` acts as the orchestrator: + +### `app.py` — Application Orchestrator +**Pattern**: FastAPI application factory +**Responsibilities**: +1. **Router Registration**: Mount all route modules with their URL prefixes +2. **Error Handler Registration**: Override Pydantic 422 handler, add catch-all exception handler +3. **Health Check**: Serve `GET /health` directly +4. **Middleware**: None for MVP (no auth, no rate-limiting) + +### Route-to-Engine Delegation +Each route handler follows a consistent pattern: +1. Receive validated Pydantic request model +2. Call the appropriate `math_engine` function +3. Wrap result in `SuccessResponse` envelope +4. On exception, catch and return `ErrorResponse` envelope + +There is **no separate service class** — routes call engine functions directly. This is appropriate for the project's scope: stateless computation with no persistence, no user sessions, and no cross-cutting business logic. + +### Error Handling Flow +1. 
Pydantic validation errors → custom 422 handler → `ErrorResponse(code="INVALID_INPUT")` +2. `DivisionByZeroError` → route handler → `ErrorResponse(code="DIVISION_BY_ZERO", status=400)` +3. `DomainError` → route handler → `ErrorResponse(code="DOMAIN_ERROR", status=400)` +4. `OverflowError` → route handler → `ErrorResponse(code="OVERFLOW", status=400)` +5. Unknown endpoint → FastAPI 404 → custom handler → `ErrorResponse(code="NOT_FOUND", status=404)` +6. Unexpected exception → catch-all handler → log at ERROR → `ErrorResponse(code="INTERNAL_ERROR", status=500)` diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/plans/application-design-plan.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/plans/application-design-plan.md new file mode 100644 index 00000000..f8de5697 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/plans/application-design-plan.md @@ -0,0 +1,17 @@ +# Application Design Plan + +## Context +The Scientific Calculator API has a clear three-layer architecture prescribed by the tech-env: +- **Routes Layer**: FastAPI route handlers for each operation category +- **Models Layer**: Pydantic v2 request/response models +- **Engine Layer**: Pure math computation logic + +No design questions are needed — the vision and tech-env fully specify the component boundaries. 
+ +## Plan Checkboxes + +- [x] Generate components.md — define all components and their responsibilities +- [x] Generate component-methods.md — define method signatures for each component +- [x] Generate services.md — define service orchestration (app.py acts as the service layer) +- [x] Generate component-dependency.md — define dependency relationships +- [x] Validate design completeness and consistency diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/plans/execution-plan.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/plans/execution-plan.md new file mode 100644 index 00000000..624f1d79 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/plans/execution-plan.md @@ -0,0 +1,72 @@ +# Execution Plan + +## Detailed Analysis Summary + +### Change Impact Assessment +- **User-facing changes**: Yes — entirely new API, all endpoints are user-facing +- **Structural changes**: Yes — new project from scratch with routes, models, and engine layers +- **Data model changes**: Yes — Pydantic request/response models for all operations +- **API changes**: Yes — full REST API with 7 route groups + health check +- **NFR impact**: Yes — performance, correctness, and test coverage requirements + +### Risk Assessment +- **Risk Level**: Low — well-defined scope, single application, no external dependencies beyond Python stdlib +- **Rollback Complexity**: Easy — greenfield, no existing system to break +- **Testing Complexity**: Moderate — many operations with domain constraints and edge cases + +## Workflow Visualization + +``` +Phase 1: INCEPTION + [x] Workspace Detection ......... COMPLETED + [ ] Reverse Engineering ......... SKIPPED (greenfield) + [x] Requirements Analysis ....... COMPLETED + [ ] User Stories ................ SKIPPED + [x] Workflow Planning ........... IN PROGRESS + [ ] Application Design ......... EXECUTE + [ ] Units Generation ........... 
SKIPPED + +Phase 2: CONSTRUCTION + [ ] Functional Design ........... SKIPPED + [ ] NFR Requirements ............ SKIPPED + [ ] NFR Design .................. SKIPPED + [ ] Infrastructure Design ....... SKIPPED + [ ] Code Generation ............. EXECUTE + [ ] Build and Test .............. EXECUTE +``` + +## Phases to Execute + +### INCEPTION PHASE +- [x] Workspace Detection (COMPLETED) — Greenfield identified +- [x] Requirements Analysis (COMPLETED) — 10 FR groups, 5 NFR groups +- [x] Workflow Planning (IN PROGRESS) +- [ ] Application Design — **EXECUTE** + - **Rationale**: New project with multiple components (routes, models, engine). Need to define component boundaries, service layer, and dependencies before code generation. +- [ ] User Stories — **SKIP** + - **Rationale**: Single-purpose API with no distinct user personas. API surface fully defined in vision. +- [ ] Units Generation — **SKIP** + - **Rationale**: Single deployable unit. The project is small enough to implement as one unit of work. The tech-env already defines the exact project structure. + +### CONSTRUCTION PHASE +- [ ] Functional Design — **SKIP** + - **Rationale**: Business logic (math operations) is fully specified in the vision. Domain constraints are clear. No additional functional design needed. +- [ ] NFR Requirements — **SKIP** + - **Rationale**: NFRs are fully specified in tech-env.md and captured in requirements. No further NFR elaboration needed. +- [ ] NFR Design — **SKIP** + - **Rationale**: NFR Requirements skipped; no NFR patterns to integrate. +- [ ] Infrastructure Design — **SKIP** + - **Rationale**: No cloud infrastructure. This is a local FastAPI application with uvicorn. No deployment architecture needed for MVP. +- [ ] Code Generation — **EXECUTE** (ALWAYS) + - **Rationale**: Must generate all source code, tests, and configuration files. +- [ ] Build and Test — **EXECUTE** (ALWAYS) + - **Rationale**: Must install dependencies, run tests, and verify 90% coverage. 
+ +## Estimated Timeline +- **Total Stages to Execute**: 3 remaining (Application Design, Code Generation, Build and Test) +- **Total Stages Skipped**: 7 (Reverse Engineering, User Stories, Units Generation, Functional Design, NFR Requirements, NFR Design, Infrastructure Design) + +## Success Criteria +- **Primary Goal**: Working Scientific Calculator API with all endpoints functional +- **Key Deliverables**: Complete source code, test suite, pyproject.toml +- **Quality Gates**: All tests pass, ≥90% line coverage, ruff clean diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/requirements/requirement-verification-questions.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/requirements/requirement-verification-questions.md new file mode 100644 index 00000000..7d8c4c08 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/requirements/requirement-verification-questions.md @@ -0,0 +1,62 @@ +# Requirements Verification Questions + +The vision document and technical environment document are remarkably thorough. The following questions address remaining ambiguities to ensure completeness before generating the formal requirements. + +Please answer each question by filling in the letter choice after the `[Answer]:` tag. + +--- + +## Question 1 +The vision specifies structured error responses with specific error codes. Should the custom 422 handler also wrap Pydantic validation errors in the same envelope structure (`{"status": "error", "operation": "...", "inputs": {...}, "error": {"code": "INVALID_INPUT", "message": "..."}}`), or is a simpler format acceptable for validation errors? 
+ +A) Use the exact same envelope structure for all errors including Pydantic validation errors +B) Use a simplified envelope for Pydantic validation errors (omit `operation` and `inputs` fields) +C) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 2 +For the statistics `mode` operation, the vision states "returns smallest mode on ties." Should `mode` return a single numeric value, or a list of values (with only the smallest returned on ties)? + +A) Always return a single numeric value (the smallest mode if there are ties) +B) Return a list of all modes, sorted ascending +C) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 3 +The vision mentions `OVERFLOW` as an error code. Python's `math` module returns `inf` for overflow cases (e.g., `math.exp(1000)`). Should the API return `inf` in the result field, or should it return an OVERFLOW error response? + +A) Return an OVERFLOW error response whenever the result would be `inf` or `-inf` +B) Return `inf`/`-inf` as valid results (only error on truly unrepresentable values) +C) Return `inf`/`-inf` as valid results for `exp` but OVERFLOW error for other operations +D) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 4 +For unit conversions, the vision lists specific units for each category. Should unknown `from_unit`/`to_unit` values return `INVALID_INPUT` (422) or a more specific error? + +A) Return `INVALID_INPUT` (422) for unknown units — consistent with other validation errors +B) Return a specific `UNKNOWN_UNIT` error code with HTTP 400 +C) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 5 +The tech-env specifies `uv run pytest` and `pytest-cov` with 90% coverage minimum. Should the `pyproject.toml` configure `pytest-cov` to enforce the 90% threshold (fail the test run if below 90%), or just report coverage? 
+ +A) Enforce 90% minimum — tests fail if coverage drops below 90% +B) Report coverage only — do not fail the test run based on coverage +C) Other (please describe after [Answer]: tag below) + +[Answer]: A + +## Question 6 +The vision lists `nan` as a constant. Should operations that receive `NaN` as input (e.g., `add(NaN, 5)`) return `NaN` following IEEE 754 propagation, or return an `INVALID_INPUT` error? + +A) Propagate NaN following IEEE 754 rules (return NaN in result) +B) Reject NaN inputs with `INVALID_INPUT` error +C) Other (please describe after [Answer]: tag below) + +[Answer]: B diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/requirements/requirements.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/requirements/requirements.md new file mode 100644 index 00000000..7bf6e1c3 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-aidlc-docs/inception/requirements/requirements.md @@ -0,0 +1,134 @@ +# Requirements: Scientific Calculator API + +## Intent Analysis + +| Attribute | Value | +|---|---| +| **User Request** | Build a stateless HTTP API for scientific math operations | +| **Request Type** | New Project (greenfield) | +| **Scope Estimate** | Single Application — multiple components (routes, models, engine) | +| **Complexity Estimate** | Moderate — well-defined API surface, many operations, thorough error handling | +| **Requirements Depth** | Standard | + +--- + +## 1. Functional Requirements + +### FR-1: Health Check +- **FR-1.1**: `GET /health` returns `{"status": "ok", "version": "0.1.0"}` with HTTP 200. + +### FR-2: Arithmetic Operations +- **FR-2.1**: `POST /api/v1/arithmetic/{operation}` supports: `add`, `subtract`, `multiply`, `divide`, `modulo`, `abs`, `negate`. +- **FR-2.2**: Binary operations (`add`, `subtract`, `multiply`, `divide`, `modulo`) accept `{"a": N, "b": N}`. +- **FR-2.3**: Unary operations (`abs`, `negate`) accept `{"a": N}`. 
+- **FR-2.4**: `divide` and `modulo` return `DIVISION_BY_ZERO` error when `b == 0`. + +### FR-3: Powers and Roots +- **FR-3.1**: `POST /api/v1/powers/{operation}` supports: `power`, `sqrt`, `cbrt`, `square`, `nth_root`. +- **FR-3.2**: `power` accepts `{"base": N, "exponent": N}`. +- **FR-3.3**: `sqrt`, `cbrt`, `square` accept `{"a": N}`. +- **FR-3.4**: `nth_root` accepts `{"a": N, "n": int}`. +- **FR-3.5**: `sqrt` returns `DOMAIN_ERROR` when `a < 0`. +- **FR-3.6**: `nth_root` returns `DOMAIN_ERROR` when `a < 0` and `n` is even. + +### FR-4: Trigonometric Operations +- **FR-4.1**: `POST /api/v1/trigonometry/{operation}` supports: `sin`, `cos`, `tan`, `asin`, `acos`, `atan`, `atan2`, `sinh`, `cosh`, `tanh`, `asinh`, `acosh`, `atanh`. +- **FR-4.2**: Most operations accept `{"a": N, "angle_unit": "radians"|"degrees"}` with default `"radians"`. +- **FR-4.3**: `atan2` accepts `{"y": N, "x": N, "angle_unit": "radians"|"degrees"}`. +- **FR-4.4**: Domain constraints enforced: `asin`/`acos` require `-1 <= a <= 1`, `acosh` requires `a >= 1`, `atanh` requires `-1 < a < 1`. +- **FR-4.5**: Domain violations return `DOMAIN_ERROR`. + +### FR-5: Logarithmic Operations +- **FR-5.1**: `POST /api/v1/logarithmic/{operation}` supports: `ln`, `log10`, `log2`, `log`, `exp`. +- **FR-5.2**: `ln`, `log10`, `log2` accept `{"a": N}` — `DOMAIN_ERROR` if `a <= 0`. +- **FR-5.3**: `log` accepts `{"a": N, "base": N}` — `DOMAIN_ERROR` if `a <= 0`, `base <= 0`, or `base == 1`. +- **FR-5.4**: `exp` accepts `{"a": N}`. + +### FR-6: Statistical Operations +- **FR-6.1**: `POST /api/v1/statistics/{operation}` supports: `mean`, `median`, `mode`, `stdev`, `variance`, `pstdev`, `pvariance`, `min`, `max`, `sum`, `count`. +- **FR-6.2**: All accept `{"values": [N, ...]}` with at least 1 element required. +- **FR-6.3**: `stdev`/`variance` require at least 2 elements. +- **FR-6.4**: `pstdev`/`pvariance` require at least 1 element. 
+- **FR-6.5**: `mode` returns a single numeric value; on ties, returns the smallest mode. + +### FR-7: Mathematical Constants +- **FR-7.1**: `GET /api/v1/constants/{name}` returns a named constant. +- **FR-7.2**: `GET /api/v1/constants` returns all constants as a map. +- **FR-7.3**: Supported constants: `pi`, `e`, `tau`, `inf`, `nan`, `golden_ratio`, `sqrt2`, `ln2`, `ln10`. + +### FR-8: Unit Conversions +- **FR-8.1**: `POST /api/v1/conversions/{category}` accepts `{"value": N, "from_unit": "...", "to_unit": "..."}`. +- **FR-8.2**: Angle: `degrees`, `radians`, `gradians`. +- **FR-8.3**: Temperature: `celsius`, `fahrenheit`, `kelvin`. +- **FR-8.4**: Length: `meters`, `feet`, `inches`, `centimeters`, `millimeters`, `kilometers`, `miles`, `yards`. +- **FR-8.5**: Weight: `kilograms`, `pounds`, `ounces`, `grams`, `milligrams`, `tonnes`, `stones`. +- **FR-8.6**: Unknown units return `INVALID_INPUT` (422). + +### FR-9: Response Envelope +- **FR-9.1**: Success responses: `{"status": "ok", "operation": "...", "inputs": {...}, "result": ...}`. +- **FR-9.2**: Error responses: `{"status": "error", "operation": "...", "inputs": {...}, "error": {"code": "...", "message": "..."}}`. +- **FR-9.3**: All endpoints accept and return `application/json`. + +### FR-10: Error Handling +- **FR-10.1**: Error codes: `INVALID_INPUT` (422), `DIVISION_BY_ZERO` (400), `DOMAIN_ERROR` (400), `OVERFLOW` (400), `NOT_FOUND` (404), `INTERNAL_ERROR` (500). +- **FR-10.2**: Pydantic validation errors are wrapped in the same error envelope with `INVALID_INPUT` code. +- **FR-10.3**: Results that would be `inf` or `-inf` return `OVERFLOW` error. +- **FR-10.4**: `NaN` inputs are rejected with `INVALID_INPUT` error. +- **FR-10.5**: Unknown endpoints return `NOT_FOUND`. +- **FR-10.6**: Unexpected exceptions are logged at ERROR level and return a generic `INTERNAL_ERROR` response; never return bare 500. + +--- + +## 2. Non-Functional Requirements + +### NFR-1: Performance +- **NFR-1.1**: Startup time < 2 seconds. 
+- **NFR-1.2**: Response latency p95 < 50ms for any single operation. + +### NFR-2: Correctness +- **NFR-2.1**: Results match Python `math` stdlib to ≤ 1 ULP for standard operations. + +### NFR-3: Testing +- **NFR-3.1**: All tests pass with ≥ 90% line coverage (enforced — test run fails if below 90%). +- **NFR-3.2**: Unit tests exercise `math_engine.py` directly with known-value tables. +- **NFR-3.3**: Integration tests use `httpx.AsyncClient` with FastAPI TestClient. +- **NFR-3.4**: Boundary tests verify every domain constraint produces the correct error code. + +### NFR-4: Security +- **NFR-4.1**: Max request body size 1 MB. +- **NFR-4.2**: No authentication, rate-limiting, or production hardening in MVP. + +### NFR-5: Maintainability +- **NFR-5.1**: Code linted and formatted with `ruff` (line-length 100, target py313). +- **NFR-5.2**: Clear separation of concerns: routes, models, engine. + +--- + +## 3. Technical Constraints + +| Constraint | Value | +|---|---| +| Language | Python 3.13 | +| Package Manager | uv (no pip, poetry, conda) | +| Framework | FastAPI + Pydantic v2 | +| ASGI Server | uvicorn | +| Build Backend | hatchling | +| Test Framework | pytest + pytest-asyncio + httpx + pytest-cov | +| Linter/Formatter | ruff | +| Prohibited | Flask, Django, requests, sympy, pandas, numpy, pip, poetry, pipenv, black, flake8, isort | + +--- + +## 4. API Versioning +- URL prefix: `/api/v1/...` +- Initial release: v0.1.0 +- Semver applies. + +--- + +## 5. 
Out of Scope (MVP) +- Persistent storage or user accounts +- Graphical or terminal UI +- Symbolic / CAS capabilities +- Arbitrary-precision beyond Python `decimal` module +- Expression evaluation from string input diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-report.html b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-report.html new file mode 100644 index 00000000..9cd55eda --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-report.html @@ -0,0 +1,465 @@ + + + +AIDLC Report — 20260218T125810-b84d042dff254a72b4ffec926fe5ea99 + + + + +

AIDLC Evaluation Report

+
20260218T125810-b84d042dff254a72b4ffec926fe5ea99 · 2026-02-18T13:45:16+00:00
+
+
Unit Tests
+
192/192
+
Coverage: 91.3%
+
+
Contract Tests
+
88/88
+
API endpoints validated
+
+
Code Quality
+
18 findings
+
5 errors, 13 warnings
+
+
Qualitative Score
+
+
+ + + + +
89%
+
+
+
Execution Time
+
24.1m
+
3 handoffs
+
+
Total Tokens
+
9.8M
+
in: 9.7M / out: 140K
+
+
+

Run Overview

+ + + + + + +
StatusStatus.COMPLETED
Executorglobal.anthropic.claude-opus-4-6-v1
Simulatorus.anthropic.claude-sonnet-4-5-20250929-v1:0
Regionus-west-2
Handoffs3 (executor → simulator → executor)
+

Handoff Timeline

+
+
E1
+
S2
+
E3
+
+ + + + +
#AgentDuration% of Total
1executor16.3m67.5%
2simulator1.1m4.7%
3executor6.7m27.8%
+

Token Usage

+ + + + +
AgentInputOutputTotal
Executor5.7M77K5.7M
Simulator180K2K182K
Total9.7M140K9.8M
+

Unit Tests

+
+192/192 passed +91.3% coverage +
+
+

Contract Tests

+
+88/88 passed +
+
+

Health 1/1

+ + +
TestMethodPathStatusLatency
health checkGET/health20014ms
+

Arithmetic 15/15

+ + + + + + + + + + + + + + + + +
TestMethodPathStatusLatency
add positive integersPOST/api/v1/arithmetic/add2004ms
add negative numbersPOST/api/v1/arithmetic/add2002ms
add floatsPOST/api/v1/arithmetic/add2002ms
add missing field → 422POST/api/v1/arithmetic/add4222ms
subtractPOST/api/v1/arithmetic/subtract2002ms
multiplyPOST/api/v1/arithmetic/multiply2002ms
multiply by zeroPOST/api/v1/arithmetic/multiply2002ms
dividePOST/api/v1/arithmetic/divide2003ms
divide by zero → errorPOST/api/v1/arithmetic/divide4002ms
moduloPOST/api/v1/arithmetic/modulo2002ms
modulo by zero → errorPOST/api/v1/arithmetic/modulo4002ms
abs negativePOST/api/v1/arithmetic/abs2002ms
abs positivePOST/api/v1/arithmetic/abs2001ms
negate positivePOST/api/v1/arithmetic/negate2001ms
negate negativePOST/api/v1/arithmetic/negate2002ms
+

Powers 11/11

+ + + + + + + + + + + + +
TestMethodPathStatusLatency
2^10POST/api/v1/powers/power2003ms
5^0POST/api/v1/powers/power2001ms
sqrt(16)POST/api/v1/powers/sqrt2001ms
sqrt(0)POST/api/v1/powers/sqrt2001ms
sqrt(-1) → domain errorPOST/api/v1/powers/sqrt4002ms
cbrt(27)POST/api/v1/powers/cbrt2002ms
cbrt(-8)POST/api/v1/powers/cbrt2002ms
square(5)POST/api/v1/powers/square2002ms
square(-3)POST/api/v1/powers/square2001ms
4th root of 16POST/api/v1/powers/nth_root2002ms
nth_root negative even → domain errorPOST/api/v1/powers/nth_root4001ms
+

Trigonometry 20/20

+ + + + + + + + + + + + + + + + + + + + + +
TestMethodPathStatusLatency
sin(0)POST/api/v1/trigonometry/sin2004ms
sin(90 deg)POST/api/v1/trigonometry/sin2002ms
cos(0)POST/api/v1/trigonometry/cos2002ms
tan(0)POST/api/v1/trigonometry/tan2002ms
asin(0)POST/api/v1/trigonometry/asin2002ms
asin(1)POST/api/v1/trigonometry/asin2001ms
asin(2) → domain errorPOST/api/v1/trigonometry/asin4001ms
acos(1)POST/api/v1/trigonometry/acos2002ms
acos(2) → domain errorPOST/api/v1/trigonometry/acos4002ms
atan(0)POST/api/v1/trigonometry/atan2002ms
atan2(0, 1)POST/api/v1/trigonometry/atan22002ms
atan2(1, 0)POST/api/v1/trigonometry/atan22001ms
sinh(0)POST/api/v1/trigonometry/sinh2002ms
cosh(0)POST/api/v1/trigonometry/cosh2002ms
tanh(0)POST/api/v1/trigonometry/tanh2002ms
asinh(0)POST/api/v1/trigonometry/asinh2002ms
acosh(1)POST/api/v1/trigonometry/acosh2001ms
acosh(0.5) → domain errorPOST/api/v1/trigonometry/acosh4001ms
atanh(0)POST/api/v1/trigonometry/atanh2002ms
atanh(1) → domain errorPOST/api/v1/trigonometry/atanh4001ms
+

Logarithmic 11/11

+ + + + + + + + + + + + +
TestMethodPathStatusLatency
ln(1)POST/api/v1/logarithmic/ln2003ms
ln(e)POST/api/v1/logarithmic/ln2002ms
ln(0) → domain errorPOST/api/v1/logarithmic/ln4002ms
ln(-1) → domain errorPOST/api/v1/logarithmic/ln4001ms
log10(100)POST/api/v1/logarithmic/log102001ms
log10(1)POST/api/v1/logarithmic/log102002ms
log2(8)POST/api/v1/logarithmic/log22002ms
log(8, base=2)POST/api/v1/logarithmic/log2002ms
log base 1 → domain errorPOST/api/v1/logarithmic/log4002ms
exp(0)POST/api/v1/logarithmic/exp2002ms
exp(1)POST/api/v1/logarithmic/exp2001ms
+

Statistics 12/12

+ + + + + + + + + + + + + +
TestMethodPathStatusLatency
meanPOST/api/v1/statistics/mean2004ms
median odd countPOST/api/v1/statistics/median2002ms
median even countPOST/api/v1/statistics/median2002ms
modePOST/api/v1/statistics/mode2002ms
stdevPOST/api/v1/statistics/stdev2002ms
variancePOST/api/v1/statistics/variance2002ms
pstdevPOST/api/v1/statistics/pstdev2002ms
pvariancePOST/api/v1/statistics/pvariance2002ms
minPOST/api/v1/statistics/min2002ms
maxPOST/api/v1/statistics/max2002ms
sumPOST/api/v1/statistics/sum2001ms
countPOST/api/v1/statistics/count2001ms
+

Constants 10/10

+ + + + + + + + + + + +
TestMethodPathStatusLatency
get all constantsGET/api/v1/constants2003ms
get piGET/api/v1/constants/pi2002ms
get eGET/api/v1/constants/e2001ms
get tauGET/api/v1/constants/tau2002ms
get golden_ratioGET/api/v1/constants/golden_ratio2003ms
get sqrt2GET/api/v1/constants/sqrt22002ms
get ln2GET/api/v1/constants/ln22002ms
get ln10GET/api/v1/constants/ln102002ms
get infGET/api/v1/constants/inf2001ms
get nanGET/api/v1/constants/nan2002ms
+

Conversions 7/7

+ + + + + + + + +
TestMethodPathStatusLatency
180 degrees to radiansPOST/api/v1/conversions/angle2003ms
boiling point C to FPOST/api/v1/conversions/temperature2002ms
freezing point C to KPOST/api/v1/conversions/temperature2002ms
1 meter to feetPOST/api/v1/conversions/length2002ms
1 mile to kilometersPOST/api/v1/conversions/length2002ms
1 kg to poundsPOST/api/v1/conversions/weight2001ms
1 stone to kilogramsPOST/api/v1/conversions/weight2001ms
+

Nonexistent 1/1

+ + +
TestMethodPathStatusLatency
unknown endpoint → 404GET/api/v1/nonexistent4041ms
+

Code Quality

+
+5 errors +13 warnings +ruff 0.15.1 +
+ + + + + + + + + + + + + + + + + + + +
FileLineCodeMessageSeverity
app.py3I001Import block is un-sorted or un-formattedwarning
math_engine.py7I001Import block is un-sorted or un-formattedwarning
math_engine.py12F401`typing.Any` imported but unusedwarning
arithmetic.py65E501Line too long (101 > 100)error
arithmetic.py78E501Line too long (107 > 100)error
logarithmic.py3I001Import block is un-sorted or un-formattedwarning
logarithmic.py72E501Line too long (108 > 100)error
powers.py74E501Line too long (103 > 100)error
trigonometry.py75E501Line too long (109 > 100)error
conftest.py8I001Import block is un-sorted or un-formattedwarning
test_arithmetic.py3I001Import block is un-sorted or un-formattedwarning
test_arithmetic.py9F401`sci_calc.engine.math_engine.MathOverflowError` imported but unusedwarning
test_constants.py3I001Import block is un-sorted or un-formattedwarning
test_conversions.py3I001Import block is un-sorted or un-formattedwarning
test_logarithmic.py3I001Import block is un-sorted or un-formattedwarning
test_powers.py3I001Import block is un-sorted or un-formattedwarning
test_statistics.py3I001Import block is un-sorted or un-formattedwarning
test_trigonometry.py3I001Import block is un-sorted or un-formattedwarning
+

Qualitative Evaluation

+
+
+ + + + +
89%
+
+
Overall Score
Semantic similarity to golden baseline
+
+
+
inception
+
Intent
0.90
+
Design
0.89
+
Complete
0.88
+
Overall
0.89
+
+
construction
+
Intent
0.93
+
Design
0.85
+
Complete
0.90
+
Overall
0.89
+
+
+

Inception Phase — Documents

+ + + + + + + + + +
DocumentIntentDesignCompletenessOverall
component-dependency.md1.000.950.900.96
component-methods.md1.000.950.850.95
components.md1.001.001.001.00
services.md0.950.900.850.91
application-design-plan.md1.001.001.001.00
execution-plan.md1.000.950.950.97
requirement-verification-questions.md0.300.400.500.38
requirements.md0.950.950.950.95
+
component-dependency.md — 0.96 +
Both documents capture identical intent: documenting component dependencies for a FastAPI math service with clear separation of concerns. Design is nearly identical with same architecture (routes, models, engine), same dependency patterns, and same key constraints (engine has zero framework dependencies, routes are thin adapters). Minor differences: CANDIDATE uses file paths (.py extensions) vs module notation, includes data flow diagram instead of dependency flow diagram, and omits external dependencies table and exception handler registration details. CANDIDATE adds clarification on synchronous calls and no async/database/queues. Overall highly aligned with trivial presentation differences.
+
component-methods.md — 0.95 +
Intent is identical: both define the same mathematical operations, request/response models, and API structure. Design is nearly identical with same layered architecture (routes, models, engine), same function signatures, and same exception handling approach. Minor differences: CANDIDATE uses slightly different model names (BinaryOperationRequest vs TwoOperandRequest, UnaryOperationRequest vs SingleOperandRequest) and omits detailed route path/method tables. CANDIDATE lacks the detailed routing table with HTTP methods and paths, and doesn't explicitly document the create_app() function or custom exception classes as separate entities, though the functionality is implied. Overall very strong alignment with minor organizational differences.
+
components.md — 1.00 +
Both documents describe identical component architectures with the same four-layer structure (app entry point, routes, models, engine). All seven route modules are present and match in purpose. The models layer distinguishes requests and responses identically. The engine layer responsibilities are equivalent, including pure function design, stdlib-only dependencies, and domain-specific exceptions. Minor stylistic differences exist (formatting, level of detail in operation enumeration), but the architectural intent, design decisions, and topic coverage are functionally identical.
+
services.md — 0.91 +
Both documents describe the same thin service architecture with direct route-to-engine delegation and no separate service layer. Intent is nearly identical. Design is very similar with same error handling flow and patterns, though CANDIDATE adds 404 handling and omits CORS middleware details. CANDIDATE is slightly less complete as it doesn't mention CORS configuration but adds health check details not in REFERENCE.
+
application-design-plan.md — 1.00 +
Both documents capture identical intent: a three-layer architecture (Routes, Models, Engine) for a Scientific Calculator API with FastAPI and Pydantic v2. Both explicitly state no design questions are needed due to fully specified tech-env. Both include the same deliverables (components.md, component-methods.md, services.md, component-dependency.md) and validation steps. The candidate provides slightly more context detail but maintains complete alignment with the reference.
+
execution-plan.md — 0.97 +
Both documents have identical intent and goals, capturing the same requirements and execution strategy. Design approaches are nearly identical with same component structure and skip/execute decisions. Minor differences: REFERENCE includes more detailed success criteria (1 ULP precision, HTTP status codes, structured envelope) and slightly different workflow visualization format. CANDIDATE is slightly more concise but covers all major topics. Overall extremely high alignment.
+
requirement-verification-questions.md — 0.38 +
Both documents aim to clarify ambiguities before requirements finalization, but they address almost entirely different concerns. REFERENCE focuses on floating-point handling, array limits, CORS, NaN serialization, precision, and API docs. CANDIDATE focuses on error envelope structure, mode return format, overflow handling, unknown units, coverage enforcement, and NaN input handling. Only Questions 1 (floating-point/overflow) and 4 (NaN handling) have thematic overlap, but ask different specific questions. Both documents have 6 questions and similar structure (partial completeness), but the substantive content differs significantly, indicating different areas of uncertainty were identified in each inception run.
+
requirements.md — 0.95 +
Both documents capture nearly identical intent, requirements, and technical approach for a scientific calculator API. Minor differences: REFERENCE has FR-011 (NaN/Infinity serialization as strings) and FR-013 (explicit CORS requirement) which CANDIDATE omits. CANDIDATE has FR-10.3/10.4 (overflow/NaN input handling) more explicitly stated. CANDIDATE uses sub-numbered FR format (FR-1.1, FR-2.1) vs REFERENCE's FR-001 style, but content is equivalent. Both specify same operations, error codes, tech stack, and constraints. CANDIDATE omits explicit mention of CORS and special NaN/Infinity serialization format, which are minor but notable gaps.
+

Construction Phase — Documents

+ + + + + + +
DocumentIntentDesignCompletenessOverall
build-and-test-summary.md0.950.900.950.93
build-instructions.md0.850.750.800.80
integration-test-instructions.md0.850.750.900.82
unit-test-instructions.md1.000.900.950.95
sci-calc-code-generation-plan.md1.000.950.900.96
+
build-and-test-summary.md — 0.93 +
Both documents capture the same core intent: summarizing build and test results for the sci-calc project with all tests passing and ready for deployment. Design approaches are nearly identical (FastAPI, hatchling, pytest, same module structure). Minor differences: CANDIDATE has 192 tests vs REFERENCE 187 tests (likely test refinements), CANDIDATE includes detailed bug fix documentation (NaN validator), and CANDIDATE uses custom SyncTestClient workaround for Windows asyncio issue. CANDIDATE provides more granular test breakdown by module. Both meet quality gates and declare deployment readiness. Coverage reporting differs (REFERENCE: 95.20% measured, CANDIDATE: deferred to CI). File counts slightly differ (REFERENCE: 16+9 files, CANDIDATE: 13+7 files) but core structure is equivalent. Overall highly similar with minor implementation variations.
+
build-instructions.md — 0.80 +
Both documents share the core intent of providing build instructions for the sci-calc project using Python 3.13+ and uv. The candidate includes additional detail on build backends (hatchling), explicit dependency versions, package building steps, and troubleshooting sections not present in the reference. The reference focuses on simpler verification and development workflow. Design approaches are similar (uv-based, FastAPI/uvicorn stack) but candidate adds more build tooling detail. Candidate covers all major reference topics (prerequisites, install, verify, run server, linting) plus extras, though some reference elements like the health check curl command are missing.
+
integration-test-instructions.md — 0.82 +
Both documents describe integration testing for the same FastAPI calculator application with similar goals (testing HTTP request/response cycles, validation, error handling). The candidate provides more granular detail with 63 tests across 7 domains vs reference's 5 general scenarios. Design approach is similar (httpx.AsyncClient, ASGI transport, co-located tests) though candidate adds specific endpoint paths and test counts. Candidate covers all reference scenarios plus additional domains (constants, conversions, health). Minor differences in run commands but both use pytest. Overall strong alignment with enhanced detail in candidate.
+
unit-test-instructions.md — 0.95 +
Both documents share identical intent: providing unit test execution instructions for the sci_calc project with pytest and coverage targets ≥90%. Design is highly similar with pytest/coverage commands, though CANDIDATE adds Windows asyncio workaround and more detailed test architecture breakdown. CANDIDATE has 192 tests vs REFERENCE's 187 (minor evolution), and adds fallback test client documentation. REFERENCE includes detailed coverage breakdown table by module (95.20% achieved), while CANDIDATE focuses on test count breakdown by module. Both are complete construction phase test instructions with only minor structural differences.
+
sci-calc-code-generation-plan.md — 0.96 +
Both documents target the same scientific calculator API with identical goals and requirements. Design is nearly identical with same layered architecture (engine/models/routes), same FastAPI framework, and same component breakdown. Candidate provides more granular implementation details (e.g., breaking engine into sub-steps by operation type, explicit error handling steps) while reference uses broader steps. Candidate consolidates some files (conftest in step 1 vs separate step 7) and adds more explicit testing details. Minor structural differences in step organization but covers all reference topics with additional implementation specificity.
+

Generated Artifacts

+
+
Source Files
17
+
Test Files
18
+
Config Files
4
+
Total Files
72
+
Lines of Code
3,522
+
AIDLC Docs
15
+
+

Baseline Comparison

+
vs golden 20260218T125810-b84d042dff254a72b4ffec926fe5ea99 + · promoted 2026-02-18T13:45:06+00:00 +
+
+
0
Improved
+
0
Regressed
+
20
Unchanged
+
+

Unit Tests

+ + + + + +
MetricGoldenCurrentDeltaChange
Tests Passed1921920unchanged
Tests Failed000unchanged
Tests Total1921920unchanged
Coverage %91910unchanged
+

Contract Tests

+ + + + +
MetricGoldenCurrentDeltaChange
Contract Passed88880unchanged
Contract Failed000unchanged
Contract Total88880unchanged
+

Code Quality

+ + + + +
MetricGoldenCurrentDeltaChange
Lint Errors550unchanged
Lint Warnings13130unchanged
Lint Total18180unchanged
+

Qualitative

+ + + + +
MetricGoldenCurrentDeltaChange
Qualitative Score0.89100.89100unchanged
Inception Score0.89000.89000unchanged
Construction Score0.89200.89200unchanged
+

Artifacts

+ + + + + +
MetricGoldenCurrentDeltaChange
Source Files17170unchanged
Test Files18180unchanged
Lines of Code3,5223,5220unchanged
Doc Files15150unchanged
+

Execution

+ + + + +
MetricGoldenCurrentDeltaChange
Total Tokens9,835,9359,835,9350unchanged
Wall Clock (ms)1,445,4601,445,4600unchanged
Handoffs330unchanged
+ + \ No newline at end of file diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/golden-report.md b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-report.md new file mode 100644 index 00000000..f0f59bae --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden-report.md @@ -0,0 +1,419 @@ +# AIDLC Evaluation Report + +> **Run:** `20260218T125810-b84d042dff254a72b4ffec926fe5ea99` +> **Generated:** 2026-02-18T13:45:16+00:00 + +## Verdict + +| Dimension | Result | +|-----------|--------| +| Unit Tests | ✅ **192/192** passed | +| Contract Tests | ✅ **88/88** passed | +| Code Quality | ❌ 18 findings (5 errors) | +| Qualitative Score | 🟢 **0.89** | + +## Run Overview + +| Property | Value | +|----------|-------| +| Status | `Status.COMPLETED` | +| Executor Model | `global.anthropic.claude-opus-4-6-v1` | +| Simulator Model | `us.anthropic.claude-sonnet-4-5-20250929-v1:0` | +| Region | `us-west-2` | +| Wall Clock | 24.1m | +| Handoffs | 3 (executor → simulator → executor) | +| Started | 2026-02-18T12:58:13.159285+00:00 | +| Completed | 2026-02-18T13:22:44.249897+00:00 | + +## Token Usage + +| Agent | Input | Output | Total | +|-------|------:|-------:|------:| +| Executor | 5.7M | 77K | 5.7M | +| Simulator | 180K | 2K | 182K | +| **Total** | **9.7M** | **140K** | **9.8M** | + +## Handoff Timeline + +| # | Agent | Duration | +|--:|-------|----------| +| 1 | executor | 16.3m | +| 2 | simulator | 1.1m | +| 3 | executor | 6.7m | + +## Generated Artifacts + +| Category | Count | +|----------|------:| +| Source files | 17 | +| Test files | 18 | +| Config files | 4 | +| Total files | 72 | +| Lines of code | 3,522 | +| AIDLC docs (inception) | 8 | +| AIDLC docs (construction) | 5 | +| AIDLC docs total | 15 | + +## Unit Tests + +**✅ 192 passed** / 192 total + +**Coverage:** 91.3% + +## Contract Tests (API Specification) + +**✅ 88/88** endpoints validated + +### Health ✅ 1/1 + +| Test | Method | Path | Status | Latency | 
+|------|--------|------|:------:|--------:| +| ✅ health check | GET | `/health` | 200 | 14ms | + + +### Arithmetic ✅ 15/15 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ add positive integers | POST | `/api/v1/arithmetic/add` | 200 | 4ms | +| ✅ add negative numbers | POST | `/api/v1/arithmetic/add` | 200 | 2ms | +| ✅ add floats | POST | `/api/v1/arithmetic/add` | 200 | 2ms | +| ✅ add missing field → 422 | POST | `/api/v1/arithmetic/add` | 422 | 2ms | +| ✅ subtract | POST | `/api/v1/arithmetic/subtract` | 200 | 2ms | +| ✅ multiply | POST | `/api/v1/arithmetic/multiply` | 200 | 2ms | +| ✅ multiply by zero | POST | `/api/v1/arithmetic/multiply` | 200 | 2ms | +| ✅ divide | POST | `/api/v1/arithmetic/divide` | 200 | 3ms | +| ✅ divide by zero → error | POST | `/api/v1/arithmetic/divide` | 400 | 2ms | +| ✅ modulo | POST | `/api/v1/arithmetic/modulo` | 200 | 2ms | +| ✅ modulo by zero → error | POST | `/api/v1/arithmetic/modulo` | 400 | 2ms | +| ✅ abs negative | POST | `/api/v1/arithmetic/abs` | 200 | 2ms | +| ✅ abs positive | POST | `/api/v1/arithmetic/abs` | 200 | 1ms | +| ✅ negate positive | POST | `/api/v1/arithmetic/negate` | 200 | 1ms | +| ✅ negate negative | POST | `/api/v1/arithmetic/negate` | 200 | 2ms | + + +### Powers ✅ 11/11 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ 2^10 | POST | `/api/v1/powers/power` | 200 | 3ms | +| ✅ 5^0 | POST | `/api/v1/powers/power` | 200 | 1ms | +| ✅ sqrt(16) | POST | `/api/v1/powers/sqrt` | 200 | 1ms | +| ✅ sqrt(0) | POST | `/api/v1/powers/sqrt` | 200 | 1ms | +| ✅ sqrt(-1) → domain error | POST | `/api/v1/powers/sqrt` | 400 | 2ms | +| ✅ cbrt(27) | POST | `/api/v1/powers/cbrt` | 200 | 2ms | +| ✅ cbrt(-8) | POST | `/api/v1/powers/cbrt` | 200 | 2ms | +| ✅ square(5) | POST | `/api/v1/powers/square` | 200 | 2ms | +| ✅ square(-3) | POST | `/api/v1/powers/square` | 200 | 1ms | +| ✅ 4th root of 16 | POST | `/api/v1/powers/nth_root` | 200 | 
2ms | +| ✅ nth_root negative even → domain error | POST | `/api/v1/powers/nth_root` | 400 | 1ms | + + +### Trigonometry ✅ 20/20 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ sin(0) | POST | `/api/v1/trigonometry/sin` | 200 | 4ms | +| ✅ sin(90 deg) | POST | `/api/v1/trigonometry/sin` | 200 | 2ms | +| ✅ cos(0) | POST | `/api/v1/trigonometry/cos` | 200 | 2ms | +| ✅ tan(0) | POST | `/api/v1/trigonometry/tan` | 200 | 2ms | +| ✅ asin(0) | POST | `/api/v1/trigonometry/asin` | 200 | 2ms | +| ✅ asin(1) | POST | `/api/v1/trigonometry/asin` | 200 | 1ms | +| ✅ asin(2) → domain error | POST | `/api/v1/trigonometry/asin` | 400 | 1ms | +| ✅ acos(1) | POST | `/api/v1/trigonometry/acos` | 200 | 2ms | +| ✅ acos(2) → domain error | POST | `/api/v1/trigonometry/acos` | 400 | 2ms | +| ✅ atan(0) | POST | `/api/v1/trigonometry/atan` | 200 | 2ms | +| ✅ atan2(0, 1) | POST | `/api/v1/trigonometry/atan2` | 200 | 2ms | +| ✅ atan2(1, 0) | POST | `/api/v1/trigonometry/atan2` | 200 | 1ms | +| ✅ sinh(0) | POST | `/api/v1/trigonometry/sinh` | 200 | 2ms | +| ✅ cosh(0) | POST | `/api/v1/trigonometry/cosh` | 200 | 2ms | +| ✅ tanh(0) | POST | `/api/v1/trigonometry/tanh` | 200 | 2ms | +| ✅ asinh(0) | POST | `/api/v1/trigonometry/asinh` | 200 | 2ms | +| ✅ acosh(1) | POST | `/api/v1/trigonometry/acosh` | 200 | 1ms | +| ✅ acosh(0.5) → domain error | POST | `/api/v1/trigonometry/acosh` | 400 | 1ms | +| ✅ atanh(0) | POST | `/api/v1/trigonometry/atanh` | 200 | 2ms | +| ✅ atanh(1) → domain error | POST | `/api/v1/trigonometry/atanh` | 400 | 1ms | + + +### Logarithmic ✅ 11/11 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ ln(1) | POST | `/api/v1/logarithmic/ln` | 200 | 3ms | +| ✅ ln(e) | POST | `/api/v1/logarithmic/ln` | 200 | 2ms | +| ✅ ln(0) → domain error | POST | `/api/v1/logarithmic/ln` | 400 | 2ms | +| ✅ ln(-1) → domain error | POST | `/api/v1/logarithmic/ln` | 400 | 1ms | +| ✅ log10(100) | POST | 
`/api/v1/logarithmic/log10` | 200 | 1ms | +| ✅ log10(1) | POST | `/api/v1/logarithmic/log10` | 200 | 2ms | +| ✅ log2(8) | POST | `/api/v1/logarithmic/log2` | 200 | 2ms | +| ✅ log(8, base=2) | POST | `/api/v1/logarithmic/log` | 200 | 2ms | +| ✅ log base 1 → domain error | POST | `/api/v1/logarithmic/log` | 400 | 2ms | +| ✅ exp(0) | POST | `/api/v1/logarithmic/exp` | 200 | 2ms | +| ✅ exp(1) | POST | `/api/v1/logarithmic/exp` | 200 | 1ms | + + +### Statistics ✅ 12/12 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ mean | POST | `/api/v1/statistics/mean` | 200 | 4ms | +| ✅ median odd count | POST | `/api/v1/statistics/median` | 200 | 2ms | +| ✅ median even count | POST | `/api/v1/statistics/median` | 200 | 2ms | +| ✅ mode | POST | `/api/v1/statistics/mode` | 200 | 2ms | +| ✅ stdev | POST | `/api/v1/statistics/stdev` | 200 | 2ms | +| ✅ variance | POST | `/api/v1/statistics/variance` | 200 | 2ms | +| ✅ pstdev | POST | `/api/v1/statistics/pstdev` | 200 | 2ms | +| ✅ pvariance | POST | `/api/v1/statistics/pvariance` | 200 | 2ms | +| ✅ min | POST | `/api/v1/statistics/min` | 200 | 2ms | +| ✅ max | POST | `/api/v1/statistics/max` | 200 | 2ms | +| ✅ sum | POST | `/api/v1/statistics/sum` | 200 | 1ms | +| ✅ count | POST | `/api/v1/statistics/count` | 200 | 1ms | + + +### Constants ✅ 10/10 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ get all constants | GET | `/api/v1/constants` | 200 | 3ms | +| ✅ get pi | GET | `/api/v1/constants/pi` | 200 | 2ms | +| ✅ get e | GET | `/api/v1/constants/e` | 200 | 1ms | +| ✅ get tau | GET | `/api/v1/constants/tau` | 200 | 2ms | +| ✅ get golden_ratio | GET | `/api/v1/constants/golden_ratio` | 200 | 3ms | +| ✅ get sqrt2 | GET | `/api/v1/constants/sqrt2` | 200 | 2ms | +| ✅ get ln2 | GET | `/api/v1/constants/ln2` | 200 | 2ms | +| ✅ get ln10 | GET | `/api/v1/constants/ln10` | 200 | 2ms | +| ✅ get inf | GET | `/api/v1/constants/inf` | 200 | 1ms | +| ✅ 
get nan | GET | `/api/v1/constants/nan` | 200 | 2ms | + + +### Conversions ✅ 7/7 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ 180 degrees to radians | POST | `/api/v1/conversions/angle` | 200 | 3ms | +| ✅ boiling point C to F | POST | `/api/v1/conversions/temperature` | 200 | 2ms | +| ✅ freezing point C to K | POST | `/api/v1/conversions/temperature` | 200 | 2ms | +| ✅ 1 meter to feet | POST | `/api/v1/conversions/length` | 200 | 2ms | +| ✅ 1 mile to kilometers | POST | `/api/v1/conversions/length` | 200 | 2ms | +| ✅ 1 kg to pounds | POST | `/api/v1/conversions/weight` | 200 | 1ms | +| ✅ 1 stone to kilograms | POST | `/api/v1/conversions/weight` | 200 | 1ms | + + +### Nonexistent ✅ 1/1 + +| Test | Method | Path | Status | Latency | +|------|--------|------|:------:|--------:| +| ✅ unknown endpoint → 404 | GET | `/api/v1/nonexistent` | 404 | 1ms | + + +## Code Quality + +**❌ 18 findings** (5 errors, 13 warnings) + +**Linter:** ruff 0.15.1 + +| File | Line | Code | Message | Severity | +|------|-----:|------|---------|----------| +| `app.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `math_engine.py` | 7 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `math_engine.py` | 12 | `F401` | `typing.Any` imported but unused | 🟡 warning | +| `arithmetic.py` | 65 | `E501` | Line too long (101 > 100) | 🔴 error | +| `arithmetic.py` | 78 | `E501` | Line too long (107 > 100) | 🔴 error | +| `logarithmic.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `logarithmic.py` | 72 | `E501` | Line too long (108 > 100) | 🔴 error | +| `powers.py` | 74 | `E501` | Line too long (103 > 100) | 🔴 error | +| `trigonometry.py` | 75 | `E501` | Line too long (109 > 100) | 🔴 error | +| `conftest.py` | 8 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `test_arithmetic.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| 
`test_arithmetic.py` | 9 | `F401` | `sci_calc.engine.math_engine.MathOverflowError` imported but unused | 🟡 warning | +| `test_constants.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `test_conversions.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `test_logarithmic.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `test_powers.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `test_statistics.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | +| `test_trigonometry.py` | 3 | `I001` | Import block is un-sorted or un-formatted | 🟡 warning | + +*Security scanner (bandit) was not available.* + +## Qualitative Evaluation (Semantic Similarity) + +**Overall Score: 🟢 0.8910** + +### Inception Phase + +| Dimension | Score | +|-----------|------:| +| Intent | 0.90 | +| Design | 0.89 | +| Completeness | 0.88 | +| **Overall** | **0.89** | + +| Document | Intent | Design | Complete | Overall | +|----------|-------:|-------:|---------:|--------:| +| `component-dependency.md` | 1.00 | 0.95 | 0.90 | 0.96 | +| `component-methods.md` | 1.00 | 0.95 | 0.85 | 0.95 | +| `components.md` | 1.00 | 1.00 | 1.00 | 1.00 | +| `services.md` | 0.95 | 0.90 | 0.85 | 0.91 | +| `application-design-plan.md` | 1.00 | 1.00 | 1.00 | 1.00 | +| `execution-plan.md` | 1.00 | 0.95 | 0.95 | 0.97 | +| `requirement-verification-questions.md` | 0.30 | 0.40 | 0.50 | 0.38 | +| `requirements.md` | 0.95 | 0.95 | 0.95 | 0.95 | + +
component-dependency.md — 0.96 + +Both documents capture identical intent: documenting component dependencies for a FastAPI math service with clear separation of concerns. Design is nearly identical with same architecture (routes, models, engine), same dependency patterns, and same key constraints (engine has zero framework dependencies, routes are thin adapters). Minor differences: CANDIDATE uses file paths (.py extensions) vs module notation, includes data flow diagram instead of dependency flow diagram, and omits external dependencies table and exception handler registration details. CANDIDATE adds clarification on synchronous calls and no async/database/queues. Overall highly aligned with trivial presentation differences. + +
+ +
component-methods.md — 0.95 + +Intent is identical: both define the same mathematical operations, request/response models, and API structure. Design is nearly identical with same layered architecture (routes, models, engine), same function signatures, and same exception handling approach. Minor differences: CANDIDATE uses slightly different model names (BinaryOperationRequest vs TwoOperandRequest, UnaryOperationRequest vs SingleOperandRequest) and omits detailed route path/method tables. CANDIDATE lacks the detailed routing table with HTTP methods and paths, and doesn't explicitly document the create_app() function or custom exception classes as separate entities, though the functionality is implied. Overall very strong alignment with minor organizational differences. + +
+ +
components.md — 1.00 + +Both documents describe identical component architectures with the same four-layer structure (app entry point, routes, models, engine). All seven route modules are present and match in purpose. The models layer distinguishes requests and responses identically. The engine layer responsibilities are equivalent, including pure function design, stdlib-only dependencies, and domain-specific exceptions. Minor stylistic differences exist (formatting, level of detail in operation enumeration), but the architectural intent, design decisions, and topic coverage are functionally identical. + +
+ +
services.md — 0.91 + +Both documents describe the same thin service architecture with direct route-to-engine delegation and no separate service layer. Intent is nearly identical. Design is very similar with same error handling flow and patterns, though CANDIDATE adds 404 handling and omits CORS middleware details. CANDIDATE is slightly less complete as it doesn't mention CORS configuration but adds health check details not in REFERENCE. + +
+ +
application-design-plan.md — 1.00 + +Both documents capture identical intent: a three-layer architecture (Routes, Models, Engine) for a Scientific Calculator API with FastAPI and Pydantic v2. Both explicitly state no design questions are needed due to fully specified tech-env. Both include the same deliverables (components.md, component-methods.md, services.md, component-dependency.md) and validation steps. The candidate provides slightly more context detail but maintains complete alignment with the reference. + +
+ +
execution-plan.md — 0.97 + +Both documents have identical intent and goals, capturing the same requirements and execution strategy. Design approaches are nearly identical with same component structure and skip/execute decisions. Minor differences: REFERENCE includes more detailed success criteria (1 ULP precision, HTTP status codes, structured envelope) and slightly different workflow visualization format. CANDIDATE is slightly more concise but covers all major topics. Overall extremely high alignment. + +
+ +
requirement-verification-questions.md — 0.38 + +Both documents aim to clarify ambiguities before requirements finalization, but they address almost entirely different concerns. REFERENCE focuses on floating-point handling, array limits, CORS, NaN serialization, precision, and API docs. CANDIDATE focuses on error envelope structure, mode return format, overflow handling, unknown units, coverage enforcement, and NaN input handling. Only Questions 1 (floating-point/overflow) and 4 (NaN handling) have thematic overlap, but ask different specific questions. Both documents have 6 questions and similar structure (partial completeness), but the substantive content differs significantly, indicating different areas of uncertainty were identified in each inception run. + +
+ +
requirements.md — 0.95 + +Both documents capture nearly identical intent, requirements, and technical approach for a scientific calculator API. Minor differences: REFERENCE has FR-011 (NaN/Infinity serialization as strings) and FR-013 (explicit CORS requirement) which CANDIDATE omits. CANDIDATE has FR-10.3/10.4 (overflow/NaN input handling) more explicitly stated. CANDIDATE uses sub-numbered FR format (FR-1.1, FR-2.1) vs REFERENCE's FR-001 style, but content is equivalent. Both specify same operations, error codes, tech stack, and constraints. CANDIDATE omits explicit mention of CORS and special NaN/Infinity serialization format, which are minor but notable gaps. + +
+ +### Construction Phase + +| Dimension | Score | +|-----------|------:| +| Intent | 0.93 | +| Design | 0.85 | +| Completeness | 0.90 | +| **Overall** | **0.89** | + +| Document | Intent | Design | Complete | Overall | +|----------|-------:|-------:|---------:|--------:| +| `build-and-test-summary.md` | 0.95 | 0.90 | 0.95 | 0.93 | +| `build-instructions.md` | 0.85 | 0.75 | 0.80 | 0.80 | +| `integration-test-instructions.md` | 0.85 | 0.75 | 0.90 | 0.82 | +| `unit-test-instructions.md` | 1.00 | 0.90 | 0.95 | 0.95 | +| `sci-calc-code-generation-plan.md` | 1.00 | 0.95 | 0.90 | 0.96 | + +
build-and-test-summary.md — 0.93 + +Both documents capture the same core intent: summarizing build and test results for the sci-calc project with all tests passing and ready for deployment. Design approaches are nearly identical (FastAPI, hatchling, pytest, same module structure). Minor differences: CANDIDATE has 192 tests vs REFERENCE 187 tests (likely test refinements), CANDIDATE includes detailed bug fix documentation (NaN validator), and CANDIDATE uses custom SyncTestClient workaround for Windows asyncio issue. CANDIDATE provides more granular test breakdown by module. Both meet quality gates and declare deployment readiness. Coverage reporting differs (REFERENCE: 95.20% measured, CANDIDATE: deferred to CI). File counts slightly differ (REFERENCE: 16+9 files, CANDIDATE: 13+7 files) but core structure is equivalent. Overall highly similar with minor implementation variations. + +
+ +
build-instructions.md — 0.80 + +Both documents share the core intent of providing build instructions for the sci-calc project using Python 3.13+ and uv. The candidate includes additional detail on build backends (hatchling), explicit dependency versions, package building steps, and troubleshooting sections not present in the reference. The reference focuses on simpler verification and development workflow. Design approaches are similar (uv-based, FastAPI/uvicorn stack) but candidate adds more build tooling detail. Candidate covers all major reference topics (prerequisites, install, verify, run server, linting) plus extras, though some reference elements like the health check curl command are missing. + +
+ +
integration-test-instructions.md — 0.82 + +Both documents describe integration testing for the same FastAPI calculator application with similar goals (testing HTTP request/response cycles, validation, error handling). The candidate provides more granular detail with 63 tests across 7 domains vs reference's 5 general scenarios. Design approach is similar (httpx.AsyncClient, ASGI transport, co-located tests) though candidate adds specific endpoint paths and test counts. Candidate covers all reference scenarios plus additional domains (constants, conversions, health). Minor differences in run commands but both use pytest. Overall strong alignment with enhanced detail in candidate. + +
+ +
unit-test-instructions.md — 0.95 + +Both documents share identical intent: providing unit test execution instructions for the sci_calc project with pytest and coverage targets ≥90%. Design is highly similar with pytest/coverage commands, though CANDIDATE adds Windows asyncio workaround and more detailed test architecture breakdown. CANDIDATE has 192 tests vs REFERENCE's 187 (minor evolution), and adds fallback test client documentation. REFERENCE includes detailed coverage breakdown table by module (95.20% achieved), while CANDIDATE focuses on test count breakdown by module. Both are complete construction phase test instructions with only minor structural differences. + +
+ +
sci-calc-code-generation-plan.md — 0.96 + +Both documents target the same scientific calculator API with identical goals and requirements. Design is nearly identical with same layered architecture (engine/models/routes), same FastAPI framework, and same component breakdown. Candidate provides more granular implementation details (e.g., breaking engine into sub-steps by operation type, explicit error handling steps) while reference uses broader steps. Candidate consolidates some files (conftest in step 1 vs separate step 7) and adds more explicit testing details. Minor structural differences in step organization but covers all reference topics with additional implementation specificity. + +
+ +## Baseline Comparison + +> Compared against golden baseline: `20260218T125810-b84d042dff254a72b4ffec926fe5ea99` +> Promoted: 2026-02-18T13:45:06+00:00 + +| | Count | +|---|------:| +| 🟢 Improved | 0 | +| 🔴 Regressed | 0 | +| ⚪ Unchanged | 20 | + +### Unit Tests + +| Metric | Golden | Current | Delta | Change | +|--------|-------:|--------:|------:|--------| +| Tests Passed | 192 | 192 | ⚪ 0 | unchanged | +| Tests Failed | 0 | 0 | ⚪ 0 | unchanged | +| Tests Total | 192 | 192 | ⚪ 0 | unchanged | +| Coverage % | 91 | 91 | ⚪ 0 | unchanged | + +### Contract Tests + +| Metric | Golden | Current | Delta | Change | +|--------|-------:|--------:|------:|--------| +| Contract Passed | 88 | 88 | ⚪ 0 | unchanged | +| Contract Failed | 0 | 0 | ⚪ 0 | unchanged | +| Contract Total | 88 | 88 | ⚪ 0 | unchanged | + +### Code Quality + +| Metric | Golden | Current | Delta | Change | +|--------|-------:|--------:|------:|--------| +| Lint Errors | 5 | 5 | ⚪ 0 | unchanged | +| Lint Warnings | 13 | 13 | ⚪ 0 | unchanged | +| Lint Total | 18 | 18 | ⚪ 0 | unchanged | + +### Qualitative + +| Metric | Golden | Current | Delta | Change | +|--------|-------:|--------:|------:|--------| +| Qualitative Score | 0.8910 | 0.8910 | ⚪ 0 | unchanged | +| Inception Score | 0.8900 | 0.8900 | ⚪ 0 | unchanged | +| Construction Score | 0.8920 | 0.8920 | ⚪ 0 | unchanged | + +### Artifacts + +| Metric | Golden | Current | Delta | Change | +|--------|-------:|--------:|------:|--------| +| Source Files | 17 | 17 | ⚪ 0 | unchanged | +| Test Files | 18 | 18 | ⚪ 0 | unchanged | +| Lines of Code | 3,522 | 3,522 | ⚪ 0 | unchanged | +| Doc Files | 15 | 15 | ⚪ 0 | unchanged | + +### Execution + +| Metric | Golden | Current | Delta | Change | +|--------|-------:|--------:|------:|--------| +| Total Tokens | 9,835,935 | 9,835,935 | ⚪ 0 | unchanged | +| Wall Clock (ms) | 1,445,460 | 1,445,460 | ⚪ 0 | unchanged | +| Handoffs | 3 | 3 | ⚪ 0 | unchanged | + +--- +*Report generated by aidlc-reporting v0.1.0* diff --git 
a/scripts/aidlc-evaluator/test_cases/sci-calc/golden.yaml b/scripts/aidlc-evaluator/test_cases/sci-calc/golden.yaml new file mode 100644 index 00000000..7a44bbaf --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/golden.yaml @@ -0,0 +1,49 @@ +run_folder: runs\20260224T214917-d18513a006b44fdf98e54495becbb27a +promoted_at: '2026-02-24T22:19:08+00:00' +executor_model: global.anthropic.claude-opus-4-6-v1 +simulator_model: us.anthropic.claude-sonnet-4-5-20250929-v1:0 +execution: + wall_clock_ms: 1428774 + total_tokens: 18394585 + input_tokens: 18248064 + output_tokens: 146521 + handoffs: 3 + executor: + input_tokens: 9247186 + output_tokens: 77992 + total_tokens: 9325178 + simulator: + input_tokens: 336997 + output_tokens: 3057 + total_tokens: 340054 +context_size: + max_tokens: 141361 + avg_tokens: 54766 + median_tokens: 47320 +artifacts: + source_files: 18 + test_files: 20 + total_files: 80 + lines_of_code: 3290 + doc_files: 16 +unit_tests: + passed: 180 + failed: 0 + total: 180 + pass_pct: 100.0 + coverage_pct: null +contract_tests: + passed: 88 + failed: 0 + total: 88 +code_quality: + lint_errors: 0 + lint_warnings: 0 + lint_total: 0 + security_total: 0 + security_high: 0 + duplication_blocks: 0 +qualitative: + overall_score: 0.8544 + inception_score: 0.8788 + construction_score: 0.83 diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/openapi.yaml b/scripts/aidlc-evaluator/test_cases/sci-calc/openapi.yaml new file mode 100644 index 00000000..5680a35c --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/openapi.yaml @@ -0,0 +1,1236 @@ +# OpenAPI 3.1 Specification — Scientific Calculator API +# +# This is a first-class project input, supplied alongside vision.md and +# tech-env.md. The contract-test harness reads this spec, discovers every +# operation, and runs the embedded x-test-cases against the live server. +# +# Security note: This is a test specification for a stateless calculator API +# running on localhost. 
No authentication is required for test scenarios. +# checkov:skip=CKV_OPENAPI_4:Test spec - no security required for localhost +# checkov:skip=CKV_OPENAPI_5:Test spec - no security required for localhost +# checkov:skip=CKV_OPENAPI_21:Test spec - array validation not required + +openapi: "3.1.0" +info: + title: Scientific Calculator API + version: "0.1.0" + description: > + Stateless HTTP API for scientific math — arithmetic, trigonometry, + logarithms, powers, statistics, constants, and unit conversions. + +servers: + - url: http://127.0.0.1:{port} + variables: + port: + default: "8000" + +x-app: + module: "sci_calc.app:app" + framework: fastapi + startup_timeout: 15 + +# ─────────────────────────── paths ─────────────────────────────────── + +paths: + + # ── Health ───────────────────────────────────────────────────────── + /health: + get: + operationId: health + summary: Health check + responses: + "200": + description: OK + x-test-cases: + - name: health check + expected_status: 200 + expected_body: + status: "ok" + + # ── Arithmetic ───────────────────────────────────────────────────── + /api/v1/arithmetic/add: + post: + operationId: arithmetic_add + summary: Add two numbers + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + b: {type: number} + required: [a, b] + responses: + "200": {description: OK} + "422": {description: Invalid input} + x-test-cases: + - name: "add positive integers" + body: {"a": 3, "b": 4} + expected_status: 200 + expected_body: {status: "ok", operation: "add", result: 7} + - name: "add negative numbers" + body: {"a": -10, "b": -5} + expected_status: 200 + expected_body: {status: "ok", result: -15} + - name: "add floats" + body: {"a": 1.5, "b": 2.5} + expected_status: 200 + expected_body: {status: "ok", result: 4.0} + - name: "add missing field → 422" + body: {"a": 1} + expected_status: 422 + + /api/v1/arithmetic/subtract: + post: + operationId: arithmetic_subtract + summary: 
Subtract b from a + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + b: {type: number} + required: [a, b] + responses: + "200": {description: OK} + x-test-cases: + - name: "subtract" + body: {"a": 10, "b": 3} + expected_status: 200 + expected_body: {status: "ok", result: 7} + + /api/v1/arithmetic/multiply: + post: + operationId: arithmetic_multiply + summary: Multiply two numbers + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + b: {type: number} + required: [a, b] + responses: + "200": {description: OK} + x-test-cases: + - name: "multiply" + body: {"a": 6, "b": 7} + expected_status: 200 + expected_body: {status: "ok", result: 42} + - name: "multiply by zero" + body: {"a": 999, "b": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0} + + /api/v1/arithmetic/divide: + post: + operationId: arithmetic_divide + summary: Divide a by b + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + b: {type: number} + required: [a, b] + responses: + "200": {description: OK} + "400": {description: Division by zero} + x-test-cases: + - name: "divide" + body: {"a": 10, "b": 4} + expected_status: 200 + expected_body: {status: "ok", result: 2.5} + - name: "divide by zero → error" + body: {"a": 1, "b": 0} + expected_status: 400 + expected_body: {status: "error", error: {code: "DIVISION_BY_ZERO"}} + + /api/v1/arithmetic/modulo: + post: + operationId: arithmetic_modulo + summary: Modulo a % b + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + b: {type: number} + required: [a, b] + responses: + "200": {description: OK} + "400": {description: Division by zero} + x-test-cases: + - name: "modulo" + body: {"a": 10, "b": 3} + expected_status: 200 + expected_body: {status: "ok", result: 1} + - name: "modulo by zero → error" + body: {"a": 5, "b": 0} + 
expected_status: 400 + expected_body: {status: "error", error: {code: "DIVISION_BY_ZERO"}} + + /api/v1/arithmetic/abs: + post: + operationId: arithmetic_abs + summary: Absolute value + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "abs negative" + body: {"a": -5} + expected_status: 200 + expected_body: {status: "ok", result: 5} + - name: "abs positive" + body: {"a": 7} + expected_status: 200 + expected_body: {status: "ok", result: 7} + + /api/v1/arithmetic/negate: + post: + operationId: arithmetic_negate + summary: Negate a number + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "negate positive" + body: {"a": 7} + expected_status: 200 + expected_body: {status: "ok", result: -7} + - name: "negate negative" + body: {"a": -3} + expected_status: 200 + expected_body: {status: "ok", result: 3} + + # ── Powers & Roots ───────────────────────────────────────────────── + /api/v1/powers/power: + post: + operationId: powers_power + summary: Raise base to exponent + requestBody: + content: + application/json: + schema: + type: object + properties: + base: {type: number} + exponent: {type: number} + required: [base, exponent] + responses: + "200": {description: OK} + x-test-cases: + - name: "2^10" + body: {"base": 2, "exponent": 10} + expected_status: 200 + expected_body: {status: "ok", result: 1024} + - name: "5^0" + body: {"base": 5, "exponent": 0} + expected_status: 200 + expected_body: {status: "ok", result: 1} + + /api/v1/powers/sqrt: + post: + operationId: powers_sqrt + summary: Square root + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + 
x-test-cases: + - name: "sqrt(16)" + body: {"a": 16} + expected_status: 200 + expected_body: {status: "ok", result: 4.0} + - name: "sqrt(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "sqrt(-1) → domain error" + body: {"a": -1} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + /api/v1/powers/cbrt: + post: + operationId: powers_cbrt + summary: Cube root + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "cbrt(27)" + body: {"a": 27} + expected_status: 200 + expected_body: {status: "ok", result: 3.0} + - name: "cbrt(-8)" + body: {"a": -8} + expected_status: 200 + expected_body: {status: "ok", result: -2.0} + + /api/v1/powers/square: + post: + operationId: powers_square + summary: Square a number + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "square(5)" + body: {"a": 5} + expected_status: 200 + expected_body: {status: "ok", result: 25} + - name: "square(-3)" + body: {"a": -3} + expected_status: 200 + expected_body: {status: "ok", result: 9} + + /api/v1/powers/nth_root: + post: + operationId: powers_nth_root + summary: Nth root of a + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + n: {type: integer} + required: [a, n] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "4th root of 16" + body: {"a": 16, "n": 4} + expected_status: 200 + expected_body: {status: "ok", result: 2.0} + - name: "nth_root negative even → domain error" + body: {"a": -4, "n": 2} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + # ── Trigonometry 
─────────────────────────────────────────────────── + /api/v1/trigonometry/sin: + post: + operationId: trig_sin + summary: Sine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "sin(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "sin(90 deg)" + body: {"a": 90, "angle_unit": "degrees"} + expected_status: 200 + expected_body: {status: "ok", result: 1.0} + + /api/v1/trigonometry/cos: + post: + operationId: trig_cos + summary: Cosine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "cos(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 1.0} + + /api/v1/trigonometry/tan: + post: + operationId: trig_tan + summary: Tangent + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "tan(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + + /api/v1/trigonometry/asin: + post: + operationId: trig_asin + summary: Arc sine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "asin(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "asin(1)" + body: {"a": 1} 
+ expected_status: 200 + expected_body: {status: "ok"} + - name: "asin(2) → domain error" + body: {"a": 2} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + /api/v1/trigonometry/acos: + post: + operationId: trig_acos + summary: Arc cosine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "acos(1)" + body: {"a": 1} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "acos(2) → domain error" + body: {"a": 2} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + /api/v1/trigonometry/atan: + post: + operationId: trig_atan + summary: Arc tangent + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "atan(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + + /api/v1/trigonometry/atan2: + post: + operationId: trig_atan2 + summary: Two-argument arc tangent + requestBody: + content: + application/json: + schema: + type: object + properties: + y: {type: number} + x: {type: number} + angle_unit: {type: string, enum: [radians, degrees], default: radians} + required: [y, x] + responses: + "200": {description: OK} + x-test-cases: + - name: "atan2(0, 1)" + body: {"y": 0, "x": 1} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "atan2(1, 0)" + body: {"y": 1, "x": 0} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/trigonometry/sinh: + post: + operationId: trig_sinh + summary: Hyperbolic sine + requestBody: + content: + application/json: + schema: 
+ type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "sinh(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + + /api/v1/trigonometry/cosh: + post: + operationId: trig_cosh + summary: Hyperbolic cosine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "cosh(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 1.0} + + /api/v1/trigonometry/tanh: + post: + operationId: trig_tanh + summary: Hyperbolic tangent + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "tanh(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + + /api/v1/trigonometry/asinh: + post: + operationId: trig_asinh + summary: Inverse hyperbolic sine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "asinh(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + + /api/v1/trigonometry/acosh: + post: + operationId: trig_acosh + summary: Inverse hyperbolic cosine + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "acosh(1)" + body: {"a": 1} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "acosh(0.5) → domain error" + body: {"a": 0.5} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + /api/v1/trigonometry/atanh: + post: + operationId: trig_atanh + 
summary: Inverse hyperbolic tangent + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "atanh(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "atanh(1) → domain error" + body: {"a": 1} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + # ── Logarithmic ─────────────────────────────────────────────────── + /api/v1/logarithmic/ln: + post: + operationId: log_ln + summary: Natural logarithm + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "ln(1)" + body: {"a": 1} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + - name: "ln(e)" + body: {"a": 2.718281828459045} + expected_status: 200 + expected_body: {status: "ok", result: 1.0} + - name: "ln(0) → domain error" + body: {"a": 0} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + - name: "ln(-1) → domain error" + body: {"a": -1} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + /api/v1/logarithmic/log10: + post: + operationId: log_log10 + summary: Base-10 logarithm + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "log10(100)" + body: {"a": 100} + expected_status: 200 + expected_body: {status: "ok", result: 2.0} + - name: "log10(1)" + body: {"a": 1} + expected_status: 200 + expected_body: {status: "ok", result: 0.0} + + /api/v1/logarithmic/log2: + post: + operationId: log_log2 + summary: Base-2 logarithm + requestBody: 
+ content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "log2(8)" + body: {"a": 8} + expected_status: 200 + expected_body: {status: "ok", result: 3.0} + + /api/v1/logarithmic/log: + post: + operationId: log_log + summary: Logarithm with arbitrary base + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + base: {type: number} + required: [a, base] + responses: + "200": {description: OK} + "400": {description: Domain error} + x-test-cases: + - name: "log(8, base=2)" + body: {"a": 8, "base": 2} + expected_status: 200 + expected_body: {status: "ok", result: 3.0} + - name: "log base 1 → domain error" + body: {"a": 10, "base": 1} + expected_status: 400 + expected_body: {status: "error", error: {code: "DOMAIN_ERROR"}} + + /api/v1/logarithmic/exp: + post: + operationId: log_exp + summary: Exponential (e^a) + requestBody: + content: + application/json: + schema: + type: object + properties: + a: {type: number} + required: [a] + responses: + "200": {description: OK} + x-test-cases: + - name: "exp(0)" + body: {"a": 0} + expected_status: 200 + expected_body: {status: "ok", result: 1.0} + - name: "exp(1)" + body: {"a": 1} + expected_status: 200 + expected_body: {status: "ok"} + + # ── Statistics ───────────────────────────────────────────────────── + /api/v1/statistics/mean: + post: + operationId: stats_mean + summary: Arithmetic mean + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "mean" + body: {"values": [1, 2, 3, 4, 5]} + expected_status: 200 + expected_body: {status: "ok", result: 3.0} + + /api/v1/statistics/median: + post: + operationId: stats_median + summary: Median + requestBody: + 
content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "median odd count" + body: {"values": [3, 1, 2]} + expected_status: 200 + expected_body: {status: "ok", result: 2.0} + - name: "median even count" + body: {"values": [1, 2, 3, 4]} + expected_status: 200 + expected_body: {status: "ok", result: 2.5} + + /api/v1/statistics/mode: + post: + operationId: stats_mode + summary: Mode (smallest on ties) + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "mode" + body: {"values": [1, 2, 2, 3, 3]} + expected_status: 200 + expected_body: {status: "ok", result: 2} + + /api/v1/statistics/stdev: + post: + operationId: stats_stdev + summary: Sample standard deviation (requires >= 2 values) + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 2} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "stdev" + body: {"values": [2, 4, 4, 4, 5, 5, 7, 9]} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/statistics/variance: + post: + operationId: stats_variance + summary: Sample variance (requires >= 2 values) + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 2} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "variance" + body: {"values": [2, 4, 4, 4, 5, 5, 7, 9]} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/statistics/pstdev: + post: + operationId: stats_pstdev + summary: Population standard deviation + requestBody: + content: + application/json: + schema: + 
type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "pstdev" + body: {"values": [2, 4, 4, 4, 5, 5, 7, 9]} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/statistics/pvariance: + post: + operationId: stats_pvariance + summary: Population variance + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "pvariance" + body: {"values": [2, 4, 4, 4, 5, 5, 7, 9]} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/statistics/min: + post: + operationId: stats_min + summary: Minimum value + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "min" + body: {"values": [5, 2, 8, 1, 9]} + expected_status: 200 + expected_body: {status: "ok", result: 1} + + /api/v1/statistics/max: + post: + operationId: stats_max + summary: Maximum value + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "max" + body: {"values": [5, 2, 8, 1, 9]} + expected_status: 200 + expected_body: {status: "ok", result: 9} + + /api/v1/statistics/sum: + post: + operationId: stats_sum + summary: Sum all values + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "sum" + body: {"values": [10, 20, 30]} + expected_status: 200 + expected_body: {status: "ok", 
result: 60} + + /api/v1/statistics/count: + post: + operationId: stats_count + summary: Count values + requestBody: + content: + application/json: + schema: + type: object + properties: + values: {type: array, items: {type: number}, minItems: 1} + required: [values] + responses: + "200": {description: OK} + x-test-cases: + - name: "count" + body: {"values": [1, 2, 3, 4, 5]} + expected_status: 200 + expected_body: {status: "ok", result: 5} + + # ── Constants ────────────────────────────────────────────────────── + /api/v1/constants: + get: + operationId: constants_all + summary: Get all constants as a map + responses: + "200": {description: OK} + x-test-cases: + - name: "get all constants" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/pi: + get: + operationId: constants_pi + summary: Pi + responses: + "200": {description: OK} + x-test-cases: + - name: "get pi" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/e: + get: + operationId: constants_e + summary: Euler's number + responses: + "200": {description: OK} + x-test-cases: + - name: "get e" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/tau: + get: + operationId: constants_tau + summary: Tau (2*pi) + responses: + "200": {description: OK} + x-test-cases: + - name: "get tau" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/golden_ratio: + get: + operationId: constants_golden_ratio + summary: Golden ratio + responses: + "200": {description: OK} + x-test-cases: + - name: "get golden_ratio" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/sqrt2: + get: + operationId: constants_sqrt2 + summary: Square root of 2 + responses: + "200": {description: OK} + x-test-cases: + - name: "get sqrt2" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/ln2: + get: + operationId: constants_ln2 + summary: Natural log of 2 + responses: + "200": {description: OK} + 
x-test-cases: + - name: "get ln2" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/ln10: + get: + operationId: constants_ln10 + summary: Natural log of 10 + responses: + "200": {description: OK} + x-test-cases: + - name: "get ln10" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/inf: + get: + operationId: constants_inf + summary: Infinity + responses: + "200": {description: OK} + x-test-cases: + - name: "get inf" + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/constants/nan: + get: + operationId: constants_nan + summary: Not a number + responses: + "200": {description: OK} + x-test-cases: + - name: "get nan" + expected_status: 200 + expected_body: {status: "ok"} + + # ── Conversions ──────────────────────────────────────────────────── + /api/v1/conversions/angle: + post: + operationId: convert_angle + summary: Angle conversion (degrees/radians/gradians) + requestBody: + content: + application/json: + schema: + type: object + properties: + value: {type: number} + from_unit: {type: string} + to_unit: {type: string} + required: [value, from_unit, to_unit] + responses: + "200": {description: OK} + x-test-cases: + - name: "180 degrees to radians" + body: {"value": 180, "from_unit": "degrees", "to_unit": "radians"} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/conversions/temperature: + post: + operationId: convert_temperature + summary: Temperature conversion (celsius/fahrenheit/kelvin) + requestBody: + content: + application/json: + schema: + type: object + properties: + value: {type: number} + from_unit: {type: string} + to_unit: {type: string} + required: [value, from_unit, to_unit] + responses: + "200": {description: OK} + x-test-cases: + - name: "boiling point C to F" + body: {"value": 100, "from_unit": "celsius", "to_unit": "fahrenheit"} + expected_status: 200 + expected_body: {status: "ok", result: 212.0} + - name: "freezing point C to K" + body: {"value": 0, 
"from_unit": "celsius", "to_unit": "kelvin"} + expected_status: 200 + expected_body: {status: "ok", result: 273.15} + + /api/v1/conversions/length: + post: + operationId: convert_length + summary: Length conversion + requestBody: + content: + application/json: + schema: + type: object + properties: + value: {type: number} + from_unit: {type: string} + to_unit: {type: string} + required: [value, from_unit, to_unit] + responses: + "200": {description: OK} + x-test-cases: + - name: "1 meter to feet" + body: {"value": 1, "from_unit": "meters", "to_unit": "feet"} + expected_status: 200 + expected_body: {status: "ok"} + - name: "1 mile to kilometers" + body: {"value": 1, "from_unit": "miles", "to_unit": "kilometers"} + expected_status: 200 + expected_body: {status: "ok"} + + /api/v1/conversions/weight: + post: + operationId: convert_weight + summary: Weight conversion + requestBody: + content: + application/json: + schema: + type: object + properties: + value: {type: number} + from_unit: {type: string} + to_unit: {type: string} + required: [value, from_unit, to_unit] + responses: + "200": {description: OK} + x-test-cases: + - name: "1 kg to pounds" + body: {"value": 1, "from_unit": "kilograms", "to_unit": "pounds"} + expected_status: 200 + expected_body: {status: "ok"} + - name: "1 stone to kilograms" + body: {"value": 1, "from_unit": "stones", "to_unit": "kilograms"} + expected_status: 200 + expected_body: {status: "ok"} + + # ── Error cases (spec-level) ────────────────────────────────────── + /api/v1/nonexistent: + get: + operationId: error_not_found + summary: Unknown endpoint should return 404 + responses: + "404": {description: Not found} + x-test-cases: + - name: "unknown endpoint → 404" + expected_status: 404 diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/scenario.yaml b/scripts/aidlc-evaluator/test_cases/sci-calc/scenario.yaml new file mode 100644 index 00000000..d60d9ef1 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/scenario.yaml @@ 
-0,0 +1,8 @@ +name: sci-calc +description: "Scientific calculator — standard AIDLC benchmark" +vision: vision.md +tech_env: tech-env.md +openapi: openapi.yaml +golden_baseline: golden.yaml +golden_aidlc_docs: golden-aidlc-docs/ +tags: [standard, benchmark] diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/tech-env.md b/scripts/aidlc-evaluator/test_cases/sci-calc/tech-env.md new file mode 100644 index 00000000..d27c7b36 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/tech-env.md @@ -0,0 +1,97 @@ +# Technical Environment: Scientific Calculator API + +## Language and Package Manager + +- **Python 3.13** +- **uv** for all package management (no pip, poetry, or conda) +- `pyproject.toml` for project and tool configuration + +## Web Framework + +- **FastAPI** with Pydantic v2 for request/response validation +- **uvicorn** as the ASGI server + +## Project Structure + +``` +sci-calc/ +├── pyproject.toml +├── src/ +│ └── sci_calc/ +│ ├── __init__.py +│ ├── app.py +│ ├── routes/ +│ │ ├── __init__.py +│ │ ├── arithmetic.py +│ │ ├── trigonometry.py +│ │ ├── logarithmic.py +│ │ ├── powers.py +│ │ ├── statistics.py +│ │ ├── constants.py +│ │ └── conversions.py +│ ├── models/ +│ │ ├── __init__.py +│ │ ├── requests.py +│ │ └── responses.py +│ └── engine/ +│ ├── __init__.py +│ └── math_engine.py +└── tests/ + ├── __init__.py + ├── conftest.py + ├── test_arithmetic.py + ├── test_trigonometry.py + ├── test_logarithmic.py + ├── test_powers.py + ├── test_statistics.py + ├── test_constants.py + └── test_conversions.py +``` + +## Testing + +- **pytest** with pytest-asyncio and httpx (async test client) +- **pytest-cov** with 90% line coverage minimum +- Unit tests exercise `math_engine.py` directly with known-value tables +- Integration tests use `httpx.AsyncClient` with FastAPI TestClient +- Boundary tests verify every domain constraint produces the correct error code +- Run command: `uv run pytest` + +## Linting and Formatting + +- **ruff** (line-length 100, 
target py313) + +## Build Backend + +- **hatchling** + +## Do NOT Use + +| Prohibited | Reason | Use Instead | +|-----------|--------|-------------| +| Flask, Django | Project uses FastAPI | FastAPI | +| requests | Blocks async event loop | httpx | +| sympy | Too heavy for this scope | Python `math` stdlib | +| pandas, numpy | Not needed for single calculations | Standard Python | +| pip, poetry, pipenv | Project uses uv exclusively | uv | +| black, flake8, isort | Replaced by ruff | ruff | + +## Non-Functional Requirements + +| Requirement | Target | +|---|---| +| Startup time | < 2 seconds | +| Response latency (p95) | < 50ms for any single operation | +| Test coverage | >= 90% line coverage | +| Floating-point agreement | Results match Python `math` stdlib to <= 1 ULP | +| Max request body size | 1 MB | +| Python version | 3.13.x (enforced via `requires-python = ">=3.13"`) | + +## Development Workflow + +```bash +uv sync +uv run uvicorn sci_calc.app:app --reload --port 8000 +uv run pytest +uv run ruff check . && uv run ruff format . +``` diff --git a/scripts/aidlc-evaluator/test_cases/sci-calc/vision.md b/scripts/aidlc-evaluator/test_cases/sci-calc/vision.md new file mode 100644 index 00000000..37fdae6d --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/sci-calc/vision.md @@ -0,0 +1,87 @@ +# Scientific Calculator API + +## Executive Summary + +A stateless HTTP API that performs scientific math operations — arithmetic, trigonometry, logarithms, powers, statistics, and unit conversions. Any HTTP client can consume it without installing a math library. The calculator prioritises correctness, precision, and clear error reporting over raw throughput. It serves as a golden test-case application: small enough to reason about completely, yet rich enough to exercise code-generation tooling across many dimensions. 
+ +## Features In Scope (MVP) + +- Arithmetic: add, subtract, multiply, divide, modulo, abs, negate +- Powers and roots: power, sqrt, cbrt, nth_root, square +- Trigonometry: sin, cos, tan, asin, acos, atan, atan2, sinh, cosh, tanh, asinh, acosh, atanh (degree and radian modes) +- Logarithms: ln, log10, log2, log (arbitrary base), exp +- Statistics: mean, median, mode, stdev, variance, pstdev, pvariance, min, max, sum, count +- Constants: pi, e, tau, inf, nan, golden_ratio, sqrt2, ln2, ln10 +- Unit conversions: angle, temperature, length, weight +- Health-check endpoint +- Structured error responses for all failure cases +- Unit and integration tests + +## Features Explicitly Out of Scope (MVP) + +- Persistent storage or user accounts +- Graphical or terminal UI +- Symbolic / computer-algebra (CAS) capabilities +- Arbitrary-precision or big-number libraries beyond Python's standard `decimal` module +- Authentication, rate-limiting, or production hardening +- Expression evaluation from string input + +## API Specification + +All endpoints accept and return `application/json`. + +### Response Envelopes + +**Success:** + +```json +{ "status": "ok", "operation": "<operation>", "inputs": { ... }, "result": <number> } +``` + +**Error:** + +```json +{ "status": "error", "operation": "<operation>", "inputs": { ... }, "error": { "code": "<ERROR_CODE>", "message": "..." } } +``` + +| Error Code | HTTP Status | Meaning | +|---|---|---| +| `INVALID_INPUT` | 422 | Request body fails validation | +| `DIVISION_BY_ZERO` | 400 | Division or modulo by zero | +| `DOMAIN_ERROR` | 400 | Input outside mathematical domain (e.g. sqrt(-1), log(0)) | +| `OVERFLOW` | 400 | Result exceeds representable range | +| `NOT_FOUND` | 404 | Unknown endpoint | + +### Endpoints + +**`GET /health`** — Returns `{"status": "ok", "version": "0.1.0"}`. + +**`POST /api/v1/arithmetic/{operation}`** — `add`, `subtract`, `multiply`, `divide`, `modulo` take `{"a": N, "b": N}`. `abs`, `negate` take `{"a": N}`. 
+ +**`POST /api/v1/powers/{operation}`** — `power` takes `{"base": N, "exponent": N}`. `sqrt`, `cbrt`, `square` take `{"a": N}`. `nth_root` takes `{"a": N, "n": int}`. Domain error if `a < 0` for sqrt; domain error if `a < 0` and `n` is even for nth_root. + +**`POST /api/v1/trigonometry/{operation}`** — Most take `{"a": N, "angle_unit": "radians"|"degrees"}` (defaults to radians). `atan2` takes `{"y": N, "x": N, "angle_unit": ...}`. Domain constraints: asin/acos require -1 <= a <= 1, acosh requires a >= 1, atanh requires -1 < a < 1. + +**`POST /api/v1/logarithmic/{operation}`** — `ln`, `log10`, `log2` take `{"a": N}` (domain error if a <= 0). `log` takes `{"a": N, "base": N}` (domain error if a <= 0, base <= 0, or base = 1). `exp` takes `{"a": N}`. + +**`POST /api/v1/statistics/{operation}`** — All take `{"values": [N, ...]}`. At least 1 element required. `stdev`/`variance` require at least 2 elements. `pstdev`/`pvariance` require at least 1. `mode` returns smallest mode on ties. + +**`GET /api/v1/constants/{name}`** — Returns the named constant. `GET /api/v1/constants` returns all as a map. + +**`POST /api/v1/conversions/{category}`** — Takes `{"value": N, "from_unit": "...", "to_unit": "..."}`. Categories: angle (degrees/radians/gradians), temperature (celsius/fahrenheit/kelvin), length (meters/feet/inches/centimeters/millimeters/kilometers/miles/yards), weight (kilograms/pounds/ounces/grams/milligrams/tonnes/stones). + +## Error Handling Principles + +1. Never return a bare 500. Catch math-domain and overflow errors and translate them to the structured error envelope. +2. Let FastAPI/Pydantic handle schema-validation errors; override the default 422 handler to conform to the error envelope. +3. Log unexpected exceptions at ERROR level and return a generic `INTERNAL_ERROR` response. 
+ +## Success Metrics + +- All tests pass with >= 90% line coverage +- Results match Python `math` stdlib to <= 1 ULP for standard operations +- Response latency p95 < 50ms for any single operation + +## Versioning + +API versioned via URL prefix (`/api/v1/...`). Initial release is v0.1.0. Semver applies. diff --git a/scripts/aidlc-evaluator/test_cases/todo-app/scenario.yaml b/scripts/aidlc-evaluator/test_cases/todo-app/scenario.yaml new file mode 100644 index 00000000..837d948e --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/todo-app/scenario.yaml @@ -0,0 +1,6 @@ +name: todo-app +description: "Todo list application — full-stack AIDLC benchmark" +status: draft +vision: vision.md +tech_env: tech-env.md +tags: [standard, full-stack] diff --git a/scripts/aidlc-evaluator/test_cases/todo-app/tech-env.md b/scripts/aidlc-evaluator/test_cases/todo-app/tech-env.md new file mode 100644 index 00000000..37949fb7 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/todo-app/tech-env.md @@ -0,0 +1,81 @@ +# Technical Environment: Todo List Application + +## Language and Package Manager + +- **Node.js 22** (LTS) +- **npm** for package management +- `package.json` for project and script configuration + +## Backend Framework + +- **Express.js** for the REST API server +- In-memory data store (plain JavaScript Map/Array) — no database required for MVP +- **uuid** package for generating todo IDs + +## Frontend Framework + +- **React 19** with functional components and hooks +- **Vite** as the build tool and dev server +- Plain CSS (no CSS framework required) + +## Project Structure + +``` +todo-app/ +├── package.json +├── server/ +│ ├── index.js # Express server entry point +│ ├── routes/ +│ │ └── todos.js # Todo CRUD routes +│ └── store.js # In-memory todo store +├── client/ +│ ├── index.html +│ ├── src/ +│ │ ├── main.jsx # React entry point +│ │ ├── App.jsx # Root component +│ │ ├── components/ +│ │ │ ├── TodoInput.jsx +│ │ │ ├── TodoList.jsx +│ │ │ ├── TodoItem.jsx +│ │ │ 
└── FilterBar.jsx +│ │ ├── hooks/ +│ │ │ └── useTodos.js +│ │ └── styles/ +│ │ └── App.css +│ └── vite.config.js +└── tests/ + ├── server/ + │ └── todos.test.js # API integration tests + └── client/ + └── App.test.jsx # Component tests +``` + +## Testing + +- **Vitest** for both server and client tests +- **React Testing Library** for component tests +- **supertest** for API integration tests +- Tests run via `npm test` + +## Development Scripts + +```json +{ + "scripts": { + "dev": "concurrently \"npm run dev:server\" \"npm run dev:client\"", + "dev:server": "node server/index.js", + "dev:client": "vite client", + "build": "vite build client", + "test": "vitest run", + "start": "node server/index.js" + } +} +``` + +## Conventions + +- ES modules (`"type": "module"` in package.json) +- Server listens on port 3001, proxied from Vite dev server on port 5173 +- All API routes prefixed with `/api/` +- Health endpoint at `/health` (no `/api/` prefix) +- Standard HTTP status codes: 200, 201, 400, 404, 500 diff --git a/scripts/aidlc-evaluator/test_cases/todo-app/vision.md b/scripts/aidlc-evaluator/test_cases/todo-app/vision.md new file mode 100644 index 00000000..893dc865 --- /dev/null +++ b/scripts/aidlc-evaluator/test_cases/todo-app/vision.md @@ -0,0 +1,50 @@ +# Todo List Application — Product Vision + +## Overview + +A simple, full-stack todo list application that allows users to create, read, update, and delete tasks. The application provides a clean web interface for managing daily tasks with filtering and completion tracking. 
+ +## Core Features + +### Task Management +- Create new tasks with a title and optional description +- Mark tasks as complete or incomplete (toggle) +- Edit existing task titles and descriptions +- Delete tasks permanently +- View all tasks in a scrollable list + +### Filtering +- Filter tasks by status: All, Active (incomplete), Completed +- Display count of remaining active tasks + +### Persistence +- Tasks persist across page refreshes via the REST API +- Server stores tasks in memory (no database required for MVP) + +## User Interface + +The UI is a single-page application with: +- A header showing the application title +- An input field at the top for adding new tasks +- A list of tasks below, each with: + - A checkbox to toggle completion + - The task title (with strikethrough when completed) + - An edit button + - A delete button +- A filter bar at the bottom with All / Active / Completed tabs +- A counter showing "X items left" + +## Non-Functional Requirements + +- The application should load in under 2 seconds +- The UI should be responsive and work on mobile viewports +- All CRUD operations should complete in under 500ms +- The API should return proper HTTP status codes and error messages + +## Out of Scope (MVP) + +- User authentication / multi-user support +- Task due dates or priorities +- Drag-and-drop reordering +- Database persistence (in-memory store is acceptable) +- Deployment / CI pipeline diff --git a/scripts/aidlc-evaluator/uv.lock b/scripts/aidlc-evaluator/uv.lock new file mode 100644 index 00000000..40d64660 --- /dev/null +++ b/scripts/aidlc-evaluator/uv.lock @@ -0,0 +1,2024 @@ +version = 1 +revision = 3 +requires-python = ">=3.13" + +[manifest] +members = [ + "aidlc-cli-harness", + "aidlc-contracttest", + "aidlc-evaluation-framework", + "aidlc-ide-harness", + "aidlc-nonfunctional", + "aidlc-qualitative", + "aidlc-quantitative", + "aidlc-reporting", + "aidlc-runner", + "aidlc-shared", + "aidlc-trend-reports", +] + +[[package]] +name = 
"aidlc-cli-harness" +version = "0.1.0" +source = { editable = "packages/cli-harness" } +dependencies = [ + { name = "pyyaml" }, +] + +[package.optional-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, +] +provides-extras = ["dev"] + +[[package]] +name = "aidlc-contracttest" +version = "0.1.0" +source = { editable = "packages/contracttest" } +dependencies = [ + { name = "httpx" }, + { name = "pyyaml" }, +] + +[package.metadata] +requires-dist = [ + { name = "httpx", specifier = ">=0.28.0" }, + { name = "pyyaml", specifier = ">=6.0" }, +] + +[[package]] +name = "aidlc-evaluation-framework" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "aidlc-contracttest" }, + { name = "aidlc-nonfunctional" }, + { name = "aidlc-qualitative" }, + { name = "aidlc-quantitative" }, + { name = "aidlc-reporting" }, + { name = "aidlc-runner" }, + { name = "aidlc-shared" }, + { name = "aidlc-trend-reports" }, + { name = "boto3" }, + { name = "pyyaml" }, +] + +[package.dev-dependencies] +dev = [ + { name = "bandit" }, + { name = "pytest" }, + { name = "pytest-asyncio" }, + { name = "ruff" }, + { name = "semgrep" }, +] + +[package.metadata] +requires-dist = [ + { name = "aidlc-contracttest", editable = "packages/contracttest" }, + { name = "aidlc-nonfunctional", editable = "packages/nonfunctional" }, + { name = "aidlc-qualitative", editable = "packages/qualitative" }, + { name = "aidlc-quantitative", editable = "packages/quantitative" }, + { name = "aidlc-reporting", editable = "packages/reporting" }, + { name = "aidlc-runner", editable = "packages/execution" }, + { name = "aidlc-shared", editable = "packages/shared" }, + { name = "aidlc-trend-reports", editable = "packages/trend-reports" }, + { name = "boto3", specifier = ">=1.42.47" }, + { name = "pyyaml", specifier = ">=6.0" }, +] + 
+[package.metadata.requires-dev] +dev = [ + { name = "bandit", specifier = ">=1.7.0" }, + { name = "pytest", specifier = ">=8.0.0" }, + { name = "pytest-asyncio", specifier = ">=0.23.0" }, + { name = "ruff", specifier = ">=0.3.0" }, + { name = "semgrep", specifier = ">=1.0.0" }, +] + +[[package]] +name = "aidlc-ide-harness" +version = "0.1.0" +source = { editable = "packages/ide-harness" } +dependencies = [ + { name = "pyyaml" }, +] + +[package.optional-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [ + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, +] +provides-extras = ["dev"] + +[[package]] +name = "aidlc-nonfunctional" +version = "0.1.0" +source = { editable = "packages/nonfunctional" } +dependencies = [ + { name = "aidlc-shared" }, + { name = "pyyaml" }, +] + +[package.metadata] +requires-dist = [ + { name = "aidlc-shared", editable = "packages/shared" }, + { name = "pyyaml", specifier = ">=6.0" }, +] + +[[package]] +name = "aidlc-qualitative" +version = "0.1.0" +source = { editable = "packages/qualitative" } +dependencies = [ + { name = "aidlc-shared" }, + { name = "pyyaml" }, +] + +[package.optional-dependencies] +dev = [ + { name = "pytest" }, +] +llm = [ + { name = "boto3" }, +] + +[package.metadata] +requires-dist = [ + { name = "aidlc-shared", editable = "packages/shared" }, + { name = "boto3", marker = "extra == 'llm'", specifier = ">=1.35.0" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, + { name = "pyyaml", specifier = ">=6.0" }, +] +provides-extras = ["llm", "dev"] + +[[package]] +name = "aidlc-quantitative" +version = "0.1.0" +source = { editable = "packages/quantitative" } +dependencies = [ + { name = "defusedxml" }, + { name = "pyyaml" }, +] + +[package.metadata] +requires-dist = [ + { name = "defusedxml", specifier = ">=0.7.1" }, + { name = "pyyaml", specifier = ">=6.0" }, +] + +[[package]] +name = 
"aidlc-reporting" +version = "0.1.0" +source = { editable = "packages/reporting" } +dependencies = [ + { name = "pyyaml" }, +] + +[package.metadata] +requires-dist = [{ name = "pyyaml", specifier = ">=6.0" }] + +[[package]] +name = "aidlc-runner" +version = "0.1.0" +source = { editable = "packages/execution" } +dependencies = [ + { name = "pyyaml" }, + { name = "strands-agents" }, + { name = "strands-agents-tools" }, +] + +[package.metadata] +requires-dist = [ + { name = "pyyaml", specifier = ">=6.0" }, + { name = "strands-agents", specifier = ">=0.1.0" }, + { name = "strands-agents-tools", specifier = ">=0.1.0" }, +] + +[[package]] +name = "aidlc-shared" +version = "0.1.0" +source = { editable = "packages/shared" } +dependencies = [ + { name = "pyyaml" }, +] + +[package.metadata] +requires-dist = [{ name = "pyyaml", specifier = ">=6.0" }] + +[[package]] +name = "aidlc-trend-reports" +version = "0.1.0" +source = { editable = "packages/trend-reports" } +dependencies = [ + { name = "pyyaml" }, +] + +[package.metadata] +requires-dist = [{ name = "pyyaml", specifier = ">=6.0" }] + +[[package]] +name = "aiohappyeyeballs" +version = "2.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, +] + +[[package]] +name = "aiohttp" +version = "3.13.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohappyeyeballs" }, + { name = "aiosignal" }, + { 
name = "attrs" }, + { name = "frozenlist" }, + { name = "multidict" }, + { name = "propcache" }, + { name = "yarl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/4a/064321452809dae953c1ed6e017504e72551a26b6f5708a5a80e4bf556ff/aiohttp-3.13.4.tar.gz", hash = "sha256:d97a6d09c66087890c2ab5d49069e1e570583f7ac0314ecf98294c1b6aaebd38", size = 7859748, upload-time = "2026-03-28T17:19:40.6Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/ac/892f4162df9b115b4758d615f32ec63d00f3084c705ff5526630887b9b42/aiohttp-3.13.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:63dd5e5b1e43b8fb1e91b79b7ceba1feba588b317d1edff385084fcc7a0a4538", size = 745744, upload-time = "2026-03-28T17:16:44.67Z" }, + { url = "https://files.pythonhosted.org/packages/97/a9/c5b87e4443a2f0ea88cb3000c93a8fdad1ee63bffc9ded8d8c8e0d66efc6/aiohttp-3.13.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:746ac3cc00b5baea424dacddea3ec2c2702f9590de27d837aa67004db1eebc6e", size = 498178, upload-time = "2026-03-28T17:16:46.766Z" }, + { url = "https://files.pythonhosted.org/packages/94/42/07e1b543a61250783650df13da8ddcdc0d0a5538b2bd15cef6e042aefc61/aiohttp-3.13.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bda8f16ea99d6a6705e5946732e48487a448be874e54a4f73d514660ff7c05d3", size = 498331, upload-time = "2026-03-28T17:16:48.9Z" }, + { url = "https://files.pythonhosted.org/packages/20/d6/492f46bf0328534124772d0cf58570acae5b286ea25006900650f69dae0e/aiohttp-3.13.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4b061e7b5f840391e3f64d0ddf672973e45c4cfff7a0feea425ea24e51530fc2", size = 1744414, upload-time = "2026-03-28T17:16:50.968Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4d/e02627b2683f68051246215d2d62b2d2f249ff7a285e7a858dc47d6b6a14/aiohttp-3.13.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = 
"sha256:b252e8d5cd66184b570d0d010de742736e8a4fab22c58299772b0c5a466d4b21", size = 1719226, upload-time = "2026-03-28T17:16:53.173Z" }, + { url = "https://files.pythonhosted.org/packages/7b/6c/5d0a3394dd2b9f9aeba6e1b6065d0439e4b75d41f1fb09a3ec010b43552b/aiohttp-3.13.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:20af8aad61d1803ff11152a26146d8d81c266aa8c5aa9b4504432abb965c36a0", size = 1782110, upload-time = "2026-03-28T17:16:55.362Z" }, + { url = "https://files.pythonhosted.org/packages/0d/2d/c20791e3437700a7441a7edfb59731150322424f5aadf635602d1d326101/aiohttp-3.13.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:13a5cc924b59859ad2adb1478e31f410a7ed46e92a2a619d6d1dd1a63c1a855e", size = 1884809, upload-time = "2026-03-28T17:16:57.734Z" }, + { url = "https://files.pythonhosted.org/packages/c8/94/d99dbfbd1924a87ef643833932eb2a3d9e5eee87656efea7d78058539eff/aiohttp-3.13.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:534913dfb0a644d537aebb4123e7d466d94e3be5549205e6a31f72368980a81a", size = 1764938, upload-time = "2026-03-28T17:17:00.221Z" }, + { url = "https://files.pythonhosted.org/packages/49/61/3ce326a1538781deb89f6cf5e094e2029cd308ed1e21b2ba2278b08426f6/aiohttp-3.13.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:320e40192a2dcc1cf4b5576936e9652981ab596bf81eb309535db7e2f5b5672f", size = 1570697, upload-time = "2026-03-28T17:17:02.985Z" }, + { url = "https://files.pythonhosted.org/packages/b6/77/4ab5a546857bb3028fbaf34d6eea180267bdab022ee8b1168b1fcde4bfdd/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9e587fcfce2bcf06526a43cb705bdee21ac089096f2e271d75de9c339db3100c", size = 1702258, upload-time = "2026-03-28T17:17:05.28Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/63/d8f29021e39bc5af8e5d5e9da1b07976fb9846487a784e11e4f4eeda4666/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9eb9c2eea7278206b5c6c1441fdd9dc420c278ead3f3b2cc87f9b693698cc500", size = 1740287, upload-time = "2026-03-28T17:17:07.712Z" }, + { url = "https://files.pythonhosted.org/packages/55/3a/cbc6b3b124859a11bc8055d3682c26999b393531ef926754a3445b99dfef/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:29be00c51972b04bf9d5c8f2d7f7314f48f96070ca40a873a53056e652e805f7", size = 1753011, upload-time = "2026-03-28T17:17:10.053Z" }, + { url = "https://files.pythonhosted.org/packages/e0/30/836278675205d58c1368b21520eab9572457cf19afd23759216c04483048/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:90c06228a6c3a7c9f776fe4fc0b7ff647fffd3bed93779a6913c804ae00c1073", size = 1566359, upload-time = "2026-03-28T17:17:12.433Z" }, + { url = "https://files.pythonhosted.org/packages/50/b4/8032cc9b82d17e4277704ba30509eaccb39329dc18d6a35f05e424439e32/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:a533ec132f05fd9a1d959e7f34184cd7d5e8511584848dab85faefbaac573069", size = 1785537, upload-time = "2026-03-28T17:17:14.721Z" }, + { url = "https://files.pythonhosted.org/packages/17/7d/5873e98230bde59f493bf1f7c3e327486a4b5653fa401144704df5d00211/aiohttp-3.13.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1c946f10f413836f82ea4cfb90200d2a59578c549f00857e03111cf45ad01ca5", size = 1740752, upload-time = "2026-03-28T17:17:17.387Z" }, + { url = "https://files.pythonhosted.org/packages/7b/f2/13e46e0df051494d7d3c68b7f72d071f48c384c12716fc294f75d5b1a064/aiohttp-3.13.4-cp313-cp313-win32.whl", hash = "sha256:48708e2706106da6967eff5908c78ca3943f005ed6bcb75da2a7e4da94ef8c70", size = 433187, upload-time = "2026-03-28T17:17:19.523Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/c0/649856ee655a843c8f8664592cfccb73ac80ede6a8c8db33a25d810c12db/aiohttp-3.13.4-cp313-cp313-win_amd64.whl", hash = "sha256:74a2eb058da44fa3a877a49e2095b591d4913308bb424c418b77beb160c55ce3", size = 459778, upload-time = "2026-03-28T17:17:21.964Z" }, + { url = "https://files.pythonhosted.org/packages/6d/29/6657cc37ae04cacc2dbf53fb730a06b6091cc4cbe745028e047c53e6d840/aiohttp-3.13.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:e0a2c961fc92abeff61d6444f2ce6ad35bb982db9fc8ff8a47455beacf454a57", size = 749363, upload-time = "2026-03-28T17:17:24.044Z" }, + { url = "https://files.pythonhosted.org/packages/90/7f/30ccdf67ca3d24b610067dc63d64dcb91e5d88e27667811640644aa4a85d/aiohttp-3.13.4-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:153274535985a0ff2bff1fb6c104ed547cec898a09213d21b0f791a44b14d933", size = 499317, upload-time = "2026-03-28T17:17:26.199Z" }, + { url = "https://files.pythonhosted.org/packages/93/13/e372dd4e68ad04ee25dafb050c7f98b0d91ea643f7352757e87231102555/aiohttp-3.13.4-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:351f3171e2458da3d731ce83f9e6b9619e325c45cbd534c7759750cabf453ad7", size = 500477, upload-time = "2026-03-28T17:17:28.279Z" }, + { url = "https://files.pythonhosted.org/packages/e5/fe/ee6298e8e586096fb6f5eddd31393d8544f33ae0792c71ecbb4c2bef98ac/aiohttp-3.13.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f989ac8bc5595ff761a5ccd32bdb0768a117f36dd1504b1c2c074ed5d3f4df9c", size = 1737227, upload-time = "2026-03-28T17:17:30.587Z" }, + { url = "https://files.pythonhosted.org/packages/b0/b9/a7a0463a09e1a3fe35100f74324f23644bfc3383ac5fd5effe0722a5f0b7/aiohttp-3.13.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d36fc1709110ec1e87a229b201dd3ddc32aa01e98e7868083a794609b081c349", size = 1694036, upload-time = "2026-03-28T17:17:33.29Z" }, + { url = 
"https://files.pythonhosted.org/packages/57/7c/8972ae3fb7be00a91aee6b644b2a6a909aedb2c425269a3bfd90115e6f8f/aiohttp-3.13.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:42adaeea83cbdf069ab94f5103ce0787c21fb1a0153270da76b59d5578302329", size = 1786814, upload-time = "2026-03-28T17:17:36.035Z" }, + { url = "https://files.pythonhosted.org/packages/93/01/c81e97e85c774decbaf0d577de7d848934e8166a3a14ad9f8aa5be329d28/aiohttp-3.13.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:92deb95469928cc41fd4b42a95d8012fa6df93f6b1c0a83af0ffbc4a5e218cde", size = 1866676, upload-time = "2026-03-28T17:17:38.441Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5f/5b46fe8694a639ddea2cd035bf5729e4677ea882cb251396637e2ef1590d/aiohttp-3.13.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0c0c7c07c4257ef3a1df355f840bc62d133bcdef5c1c5ba75add3c08553e2eed", size = 1740842, upload-time = "2026-03-28T17:17:40.783Z" }, + { url = "https://files.pythonhosted.org/packages/20/a2/0d4b03d011cca6b6b0acba8433193c1e484efa8d705ea58295590fe24203/aiohttp-3.13.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f062c45de8a1098cb137a1898819796a2491aec4e637a06b03f149315dff4d8f", size = 1566508, upload-time = "2026-03-28T17:17:43.235Z" }, + { url = "https://files.pythonhosted.org/packages/98/17/e689fd500da52488ec5f889effd6404dece6a59de301e380f3c64f167beb/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:76093107c531517001114f0ebdb4f46858ce818590363e3e99a4a2280334454a", size = 1700569, upload-time = "2026-03-28T17:17:46.165Z" }, + { url = "https://files.pythonhosted.org/packages/d8/0d/66402894dbcf470ef7db99449e436105ea862c24f7ea4c95c683e635af35/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:6f6ec32162d293b82f8b63a16edc80769662fbd5ae6fbd4936d3206a2c2cc63b", size = 1707407, upload-time 
= "2026-03-28T17:17:48.825Z" }, + { url = "https://files.pythonhosted.org/packages/2f/eb/af0ab1a3650092cbd8e14ef29e4ab0209e1460e1c299996c3f8288b3f1ff/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5903e2db3d202a00ad9f0ec35a122c005e85d90c9836ab4cda628f01edf425e2", size = 1752214, upload-time = "2026-03-28T17:17:51.206Z" }, + { url = "https://files.pythonhosted.org/packages/5a/bf/72326f8a98e4c666f292f03c385545963cc65e358835d2a7375037a97b57/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2d5bea57be7aca98dbbac8da046d99b5557c5cf4e28538c4c786313078aca09e", size = 1562162, upload-time = "2026-03-28T17:17:53.634Z" }, + { url = "https://files.pythonhosted.org/packages/67/9f/13b72435f99151dd9a5469c96b3b5f86aa29b7e785ca7f35cf5e538f74c0/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:bcf0c9902085976edc0232b75006ef38f89686901249ce14226b6877f88464fb", size = 1768904, upload-time = "2026-03-28T17:17:55.991Z" }, + { url = "https://files.pythonhosted.org/packages/18/bc/28d4970e7d5452ac7776cdb5431a1164a0d9cf8bd2fffd67b4fb463aa56d/aiohttp-3.13.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3295f98bfeed2e867cab588f2a146a9db37a85e3ae9062abf46ba062bd29165", size = 1723378, upload-time = "2026-03-28T17:17:58.348Z" }, + { url = "https://files.pythonhosted.org/packages/53/74/b32458ca1a7f34d65bdee7aef2036adbe0438123d3d53e2b083c453c24dd/aiohttp-3.13.4-cp314-cp314-win32.whl", hash = "sha256:a598a5c5767e1369d8f5b08695cab1d8160040f796c4416af76fd773d229b3c9", size = 438711, upload-time = "2026-03-28T17:18:00.728Z" }, + { url = "https://files.pythonhosted.org/packages/40/b2/54b487316c2df3e03a8f3435e9636f8a81a42a69d942164830d193beb56a/aiohttp-3.13.4-cp314-cp314-win_amd64.whl", hash = "sha256:c555db4bc7a264bead5a7d63d92d41a1122fcd39cc62a4db815f45ad46f9c2c8", size = 464977, upload-time = "2026-03-28T17:18:03.367Z" }, + { url = 
"https://files.pythonhosted.org/packages/47/fb/e41b63c6ce71b07a59243bb8f3b457ee0c3402a619acb9d2c0d21ef0e647/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45abbbf09a129825d13c18c7d3182fecd46d9da3cfc383756145394013604ac1", size = 781549, upload-time = "2026-03-28T17:18:05.779Z" }, + { url = "https://files.pythonhosted.org/packages/97/53/532b8d28df1e17e44c4d9a9368b78dcb6bf0b51037522136eced13afa9e8/aiohttp-3.13.4-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:74c80b2bc2c2adb7b3d1941b2b60701ee2af8296fc8aad8b8bc48bc25767266c", size = 514383, upload-time = "2026-03-28T17:18:08.096Z" }, + { url = "https://files.pythonhosted.org/packages/1b/1f/62e5d400603e8468cd635812d99cb81cfdc08127a3dc474c647615f31339/aiohttp-3.13.4-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c97989ae40a9746650fa196894f317dafc12227c808c774929dda0ff873a5954", size = 518304, upload-time = "2026-03-28T17:18:10.642Z" }, + { url = "https://files.pythonhosted.org/packages/90/57/2326b37b10896447e3c6e0cbef4fe2486d30913639a5cfd1332b5d870f82/aiohttp-3.13.4-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dae86be9811493f9990ef44fff1685f5c1a3192e9061a71a109d527944eed551", size = 1893433, upload-time = "2026-03-28T17:18:13.121Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b4/a24d82112c304afdb650167ef2fe190957d81cbddac7460bedd245f765aa/aiohttp-3.13.4-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1db491abe852ca2fa6cc48a3341985b0174b3741838e1341b82ac82c8bd9e871", size = 1755901, upload-time = "2026-03-28T17:18:16.21Z" }, + { url = "https://files.pythonhosted.org/packages/9e/2d/0883ef9d878d7846287f036c162a951968f22aabeef3ac97b0bea6f76d5d/aiohttp-3.13.4-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0e5d701c0aad02a7dce72eef6b93226cf3734330f1a31d69ebbf69f33b86666e", size = 1876093, upload-time = 
"2026-03-28T17:18:18.703Z" }, + { url = "https://files.pythonhosted.org/packages/ad/52/9204bb59c014869b71971addad6778f005daa72a96eed652c496789d7468/aiohttp-3.13.4-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8ac32a189081ae0a10ba18993f10f338ec94341f0d5df8fff348043962f3c6f8", size = 1970815, upload-time = "2026-03-28T17:18:21.858Z" }, + { url = "https://files.pythonhosted.org/packages/d6/b5/e4eb20275a866dde0f570f411b36c6b48f7b53edfe4f4071aa1b0728098a/aiohttp-3.13.4-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98e968cdaba43e45c73c3f306fca418c8009a957733bac85937c9f9cf3f4de27", size = 1816223, upload-time = "2026-03-28T17:18:24.729Z" }, + { url = "https://files.pythonhosted.org/packages/d8/23/e98075c5bb146aa61a1239ee1ac7714c85e814838d6cebbe37d3fe19214a/aiohttp-3.13.4-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:ca114790c9144c335d538852612d3e43ea0f075288f4849cf4b05d6cd2238ce7", size = 1649145, upload-time = "2026-03-28T17:18:27.269Z" }, + { url = "https://files.pythonhosted.org/packages/d6/c1/7bad8be33bb06c2bb224b6468874346026092762cbec388c3bdb65a368ee/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:ea2e071661ba9cfe11eabbc81ac5376eaeb3061f6e72ec4cc86d7cdd1ffbdbbb", size = 1816562, upload-time = "2026-03-28T17:18:29.847Z" }, + { url = "https://files.pythonhosted.org/packages/5c/10/c00323348695e9a5e316825969c88463dcc24c7e9d443244b8a2c9cf2eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:34e89912b6c20e0fd80e07fa401fd218a410aa1ce9f1c2f1dad6db1bd0ce0927", size = 1800333, upload-time = "2026-03-28T17:18:32.269Z" }, + { url = "https://files.pythonhosted.org/packages/84/43/9b2147a1df3559f49bd723e22905b46a46c068a53adb54abdca32c4de180/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0e217cf9f6a42908c52b46e42c568bd57adc39c9286ced31aaace614b6087965", size = 1820617, upload-time 
= "2026-03-28T17:18:35.238Z" }, + { url = "https://files.pythonhosted.org/packages/a9/7f/b3481a81e7a586d02e99387b18c6dafff41285f6efd3daa2124c01f87eae/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:0c296f1221e21ba979f5ac1964c3b78cfde15c5c5f855ffd2caab337e9cd9182", size = 1643417, upload-time = "2026-03-28T17:18:37.949Z" }, + { url = "https://files.pythonhosted.org/packages/8f/72/07181226bc99ce1124e0f89280f5221a82d3ae6a6d9d1973ce429d48e52b/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d99a9d168ebaffb74f36d011750e490085ac418f4db926cce3989c8fe6cb6b1b", size = 1849286, upload-time = "2026-03-28T17:18:40.534Z" }, + { url = "https://files.pythonhosted.org/packages/1a/e6/1b3566e103eca6da5be4ae6713e112a053725c584e96574caf117568ffef/aiohttp-3.13.4-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cb19177205d93b881f3f89e6081593676043a6828f59c78c17a0fd6c1fbed2ba", size = 1782635, upload-time = "2026-03-28T17:18:43.073Z" }, + { url = "https://files.pythonhosted.org/packages/37/58/1b11c71904b8d079eb0c39fe664180dd1e14bebe5608e235d8bfbadc8929/aiohttp-3.13.4-cp314-cp314t-win32.whl", hash = "sha256:c606aa5656dab6552e52ca368e43869c916338346bfaf6304e15c58fb113ea30", size = 472537, upload-time = "2026-03-28T17:18:46.286Z" }, + { url = "https://files.pythonhosted.org/packages/bc/8f/87c56a1a1977d7dddea5b31e12189665a140fdb48a71e9038ff90bb564ec/aiohttp-3.13.4-cp314-cp314t-win_amd64.whl", hash = "sha256:014dcc10ec8ab8db681f0d68e939d1e9286a5aa2b993cbbdb0db130853e02144", size = 506381, upload-time = "2026-03-28T17:18:48.74Z" }, +] + +[[package]] +name = "aiosignal" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "frozenlist" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/61/62/06741b579156360248d1ec624842ad0edf697050bbaf7c3e46394e106ad1/aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7", size = 25007, upload-time = 
"2025-07-03T22:54:43.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, +] + +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + +[[package]] +name = "anyio" +version = "4.12.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/96/f0/5eb65b2bb0d09ac6776f2eb54adee6abe8228ea05b20a5ad0e4945de8aac/anyio-4.12.1.tar.gz", hash = "sha256:41cfcc3a4c85d3f05c932da7c26d0201ac36f72abd4435ba90d0464a3ffed703", size = 228685, upload-time = "2026-01-06T11:45:21.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, +] + +[[package]] +name = "attrs" +version = "25.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, +] + +[[package]] +name = "aws-requests-auth" +version = "0.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/54/b2/455c0bfcbd772dafd4c9e93c4b713e36790abf9ccbca9b8e661968b29798/aws-requests-auth-0.4.3.tar.gz", hash = "sha256:33593372018b960a31dbbe236f89421678b885c35f0b6a7abfae35bb77e069b2", size = 10096, upload-time = "2020-05-27T23:10:34.742Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/11/5dc8be418e1d54bed15eaf3a7461797e5ebb9e6a34869ad750561f35fa5b/aws_requests_auth-0.4.3-py2.py3-none-any.whl", hash = "sha256:646bc37d62140ea1c709d20148f5d43197e6bd2d63909eb36fa4bb2345759977", size = 6838, upload-time = "2020-05-27T23:10:33.658Z" }, +] + +[[package]] +name = "bandit" +version = "1.9.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "pyyaml" }, + { name = "rich" }, + { name = "stevedore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/76/a7f3e639b78601118aaa4a394db2c66ae2597fbd8c39644c32874ed11e0c/bandit-1.9.3.tar.gz", hash = "sha256:ade4b9b7786f89ef6fc7344a52b34558caec5da74cb90373aed01de88472f774", size = 4242154, upload-time = "2026-01-19T04:05:22.802Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/e0/0b/8bdc52111c83e2dc2f97403dc87c0830b8989d9ae45732b34b686326fb2c/bandit-1.9.3-py3-none-any.whl", hash = "sha256:4745917c88d2246def79748bde5e08b9d5e9b92f877863d43fab70cd8814ce6a", size = 134451, upload-time = "2026-01-19T04:05:20.938Z" }, +] + +[[package]] +name = "beautifulsoup4" +version = "4.14.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "soupsieve" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c3/b0/1c6a16426d389813b48d95e26898aff79abbde42ad353958ad95cc8c9b21/beautifulsoup4-4.14.3.tar.gz", hash = "sha256:6292b1c5186d356bba669ef9f7f051757099565ad9ada5dd630bd9de5fa7fb86", size = 627737, upload-time = "2025-11-30T15:08:26.084Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/39/47f9197bdd44df24d67ac8893641e16f386c984a0619ef2ee4c51fbbc019/beautifulsoup4-4.14.3-py3-none-any.whl", hash = "sha256:0918bfe44902e6ad8d57732ba310582e98da931428d231a5ecb9e7c703a735bb", size = 107721, upload-time = "2025-11-30T15:08:24.087Z" }, +] + +[[package]] +name = "boltons" +version = "21.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ad/1f/6c0608d86e0fc77c982a2923ece80eef85f091f2332fc13cbce41d70d502/boltons-21.0.0.tar.gz", hash = "sha256:65e70a79a731a7fe6e98592ecfb5ccf2115873d01dbc576079874629e5c90f13", size = 180201, upload-time = "2021-05-17T01:20:17.802Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f7/a7/1a31561d10a089fcb46fe286766dd4e053a12f6e23b4fd1c26478aff2475/boltons-21.0.0-py2.py3-none-any.whl", hash = "sha256:b9bb7b58b2b420bbe11a6025fdef6d3e5edc9f76a42fb467afe7ca212ef9948b", size = 193723, upload-time = "2021-05-17T01:20:20.023Z" }, +] + +[[package]] +name = "boto3" +version = "1.42.47" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist 
= { url = "https://files.pythonhosted.org/packages/eb/fe/3363024b6dda5968401f45d8b345ed95ce4fd536d58f799988b4b28184ad/boto3-1.42.47.tar.gz", hash = "sha256:74812a2e29de7c2bd19e446d765cb887394f20f1517388484b51891a410f33b2", size = 112884, upload-time = "2026-02-11T20:49:49.196Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/7b/884e30adab2339ce5cce7b800f5fa619254d36e89e50a8cf39a5524edc35/boto3-1.42.47-py3-none-any.whl", hash = "sha256:ed881ed246027028af566acbb80f008aa619be4d3fdbcc4ad3c75dbe8c34bfaf", size = 140608, upload-time = "2026-02-11T20:49:47.664Z" }, +] + +[[package]] +name = "botocore" +version = "1.42.47" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ed/a6/d15f5dfe990abd76dbdb2105a7697e0d948e04c41dfd97c058bc76c7cebd/botocore-1.42.47.tar.gz", hash = "sha256:c26e190c1b4d863ba7b44dc68cc574d8eb862ddae5f0fe3472801daee12a0378", size = 14952255, upload-time = "2026-02-11T20:49:40.157Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/5e/50e3a59b243894088eeb949a654fb21d9ab7d0d703034470de016828d85a/botocore-1.42.47-py3-none-any.whl", hash = "sha256:c60f5feaf189423e17755aca3f1d672b7466620dd2032440b32aaac64ae8cac8", size = 14625351, upload-time = "2026-02-11T20:49:36.143Z" }, +] + +[[package]] +name = "bracex" +version = "2.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/63/9a/fec38644694abfaaeca2798b58e276a8e61de49e2e37494ace423395febc/bracex-2.6.tar.gz", hash = "sha256:98f1347cd77e22ee8d967a30ad4e310b233f7754dbf31ff3fceb76145ba47dc7", size = 26642, upload-time = "2025-06-22T19:12:31.254Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/2a/9186535ce58db529927f6cf5990a849aa9e052eea3e2cfefe20b9e1802da/bracex-2.6-py3-none-any.whl", hash = 
"sha256:0b0049264e7340b3ec782b5cb99beb325f36c3782a32e36e876452fd49a09952", size = 11508, upload-time = "2025-06-22T19:12:29.781Z" }, +] + +[[package]] +name = "certifi" +version = "2026.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, +] + +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "implementation_name != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, + { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, + { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, + { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, + { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, + { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, + { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, + { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, + { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, + { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, + { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, + { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, 
upload-time = "2025-09-08T23:23:44.61Z" }, + { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, + { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, + { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, + { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, + { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, + { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, + { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, + { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/97/45/4b3a1239bbacd321068ea6e7ac28875b03ab8bc0aa0966452db17cd36714/charset_normalizer-3.4.4-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e1f185f86a6f3403aa2420e815904c67b2f9ebc443f045edd0de921108345794", size = 208091, upload-time = "2025-10-14T04:41:13.346Z" }, + { url = "https://files.pythonhosted.org/packages/7d/62/73a6d7450829655a35bb88a88fca7d736f9882a27eacdca2c6d505b57e2e/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b39f987ae8ccdf0d2642338faf2abb1862340facc796048b604ef14919e55ed", size = 147936, upload-time = "2025-10-14T04:41:14.461Z" }, + { url = "https://files.pythonhosted.org/packages/89/c5/adb8c8b3d6625bef6d88b251bbb0d95f8205831b987631ab0c8bb5d937c2/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3162d5d8ce1bb98dd51af660f2121c55d0fa541b46dff7bb9b9f86ea1d87de72", size = 144180, upload-time = "2025-10-14T04:41:15.588Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/ed/9706e4070682d1cc219050b6048bfd293ccf67b3d4f5a4f39207453d4b99/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:81d5eb2a312700f4ecaa977a8235b634ce853200e828fbadf3a9c50bab278328", size = 161346, upload-time = "2025-10-14T04:41:16.738Z" }, + { url = "https://files.pythonhosted.org/packages/d5/0d/031f0d95e4972901a2f6f09ef055751805ff541511dc1252ba3ca1f80cf5/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5bd2293095d766545ec1a8f612559f6b40abc0eb18bb2f5d1171872d34036ede", size = 158874, upload-time = "2025-10-14T04:41:17.923Z" }, + { url = "https://files.pythonhosted.org/packages/f5/83/6ab5883f57c9c801ce5e5677242328aa45592be8a00644310a008d04f922/charset_normalizer-3.4.4-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8a8b89589086a25749f471e6a900d3f662d1d3b6e2e59dcecf787b1cc3a1894", size = 153076, upload-time = "2025-10-14T04:41:19.106Z" }, + { url = "https://files.pythonhosted.org/packages/75/1e/5ff781ddf5260e387d6419959ee89ef13878229732732ee73cdae01800f2/charset_normalizer-3.4.4-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc7637e2f80d8530ee4a78e878bce464f70087ce73cf7c1caf142416923b98f1", size = 150601, upload-time = "2025-10-14T04:41:20.245Z" }, + { url = "https://files.pythonhosted.org/packages/d7/57/71be810965493d3510a6ca79b90c19e48696fb1ff964da319334b12677f0/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f8bf04158c6b607d747e93949aa60618b61312fe647a6369f88ce2ff16043490", size = 150376, upload-time = "2025-10-14T04:41:21.398Z" }, + { url = "https://files.pythonhosted.org/packages/e5/d5/c3d057a78c181d007014feb7e9f2e65905a6c4ef182c0ddf0de2924edd65/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_armv7l.whl", hash = 
"sha256:554af85e960429cf30784dd47447d5125aaa3b99a6f0683589dbd27e2f45da44", size = 144825, upload-time = "2025-10-14T04:41:22.583Z" }, + { url = "https://files.pythonhosted.org/packages/e6/8c/d0406294828d4976f275ffbe66f00266c4b3136b7506941d87c00cab5272/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:74018750915ee7ad843a774364e13a3db91682f26142baddf775342c3f5b1133", size = 162583, upload-time = "2025-10-14T04:41:23.754Z" }, + { url = "https://files.pythonhosted.org/packages/d7/24/e2aa1f18c8f15c4c0e932d9287b8609dd30ad56dbe41d926bd846e22fb8d/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:c0463276121fdee9c49b98908b3a89c39be45d86d1dbaa22957e38f6321d4ce3", size = 150366, upload-time = "2025-10-14T04:41:25.27Z" }, + { url = "https://files.pythonhosted.org/packages/e4/5b/1e6160c7739aad1e2df054300cc618b06bf784a7a164b0f238360721ab86/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:362d61fd13843997c1c446760ef36f240cf81d3ebf74ac62652aebaf7838561e", size = 160300, upload-time = "2025-10-14T04:41:26.725Z" }, + { url = "https://files.pythonhosted.org/packages/7a/10/f882167cd207fbdd743e55534d5d9620e095089d176d55cb22d5322f2afd/charset_normalizer-3.4.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a26f18905b8dd5d685d6d07b0cdf98a79f3c7a918906af7cc143ea2e164c8bc", size = 154465, upload-time = "2025-10-14T04:41:28.322Z" }, + { url = "https://files.pythonhosted.org/packages/89/66/c7a9e1b7429be72123441bfdbaf2bc13faab3f90b933f664db506dea5915/charset_normalizer-3.4.4-cp313-cp313-win32.whl", hash = "sha256:9b35f4c90079ff2e2edc5b26c0c77925e5d2d255c42c74fdb70fb49b172726ac", size = 99404, upload-time = "2025-10-14T04:41:29.95Z" }, + { url = "https://files.pythonhosted.org/packages/c4/26/b9924fa27db384bdcd97ab83b4f0a8058d96ad9626ead570674d5e737d90/charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl", hash = "sha256:b435cba5f4f750aa6c0a0d92c541fb79f69a387c91e61f1795227e4ed9cece14", size = 107092, 
upload-time = "2025-10-14T04:41:31.188Z" }, + { url = "https://files.pythonhosted.org/packages/af/8f/3ed4bfa0c0c72a7ca17f0380cd9e4dd842b09f664e780c13cff1dcf2ef1b/charset_normalizer-3.4.4-cp313-cp313-win_arm64.whl", hash = "sha256:542d2cee80be6f80247095cc36c418f7bddd14f4a6de45af91dfad36d817bba2", size = 100408, upload-time = "2025-10-14T04:41:32.624Z" }, + { url = "https://files.pythonhosted.org/packages/2a/35/7051599bd493e62411d6ede36fd5af83a38f37c4767b92884df7301db25d/charset_normalizer-3.4.4-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:da3326d9e65ef63a817ecbcc0df6e94463713b754fe293eaa03da99befb9a5bd", size = 207746, upload-time = "2025-10-14T04:41:33.773Z" }, + { url = "https://files.pythonhosted.org/packages/10/9a/97c8d48ef10d6cd4fcead2415523221624bf58bcf68a802721a6bc807c8f/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8af65f14dc14a79b924524b1e7fffe304517b2bff5a58bf64f30b98bbc5079eb", size = 147889, upload-time = "2025-10-14T04:41:34.897Z" }, + { url = "https://files.pythonhosted.org/packages/10/bf/979224a919a1b606c82bd2c5fa49b5c6d5727aa47b4312bb27b1734f53cd/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:74664978bb272435107de04e36db5a9735e78232b85b77d45cfb38f758efd33e", size = 143641, upload-time = "2025-10-14T04:41:36.116Z" }, + { url = "https://files.pythonhosted.org/packages/ba/33/0ad65587441fc730dc7bd90e9716b30b4702dc7b617e6ba4997dc8651495/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:752944c7ffbfdd10c074dc58ec2d5a8a4cd9493b314d367c14d24c17684ddd14", size = 160779, upload-time = "2025-10-14T04:41:37.229Z" }, + { url = 
"https://files.pythonhosted.org/packages/67/ed/331d6b249259ee71ddea93f6f2f0a56cfebd46938bde6fcc6f7b9a3d0e09/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d1f13550535ad8cff21b8d757a3257963e951d96e20ec82ab44bc64aeb62a191", size = 159035, upload-time = "2025-10-14T04:41:38.368Z" }, + { url = "https://files.pythonhosted.org/packages/67/ff/f6b948ca32e4f2a4576aa129d8bed61f2e0543bf9f5f2b7fc3758ed005c9/charset_normalizer-3.4.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ecaae4149d99b1c9e7b88bb03e3221956f68fd6d50be2ef061b2381b61d20838", size = 152542, upload-time = "2025-10-14T04:41:39.862Z" }, + { url = "https://files.pythonhosted.org/packages/16/85/276033dcbcc369eb176594de22728541a925b2632f9716428c851b149e83/charset_normalizer-3.4.4-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cb6254dc36b47a990e59e1068afacdcd02958bdcce30bb50cc1700a8b9d624a6", size = 149524, upload-time = "2025-10-14T04:41:41.319Z" }, + { url = "https://files.pythonhosted.org/packages/9e/f2/6a2a1f722b6aba37050e626530a46a68f74e63683947a8acff92569f979a/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c8ae8a0f02f57a6e61203a31428fa1d677cbe50c93622b4149d5c0f319c1d19e", size = 150395, upload-time = "2025-10-14T04:41:42.539Z" }, + { url = "https://files.pythonhosted.org/packages/60/bb/2186cb2f2bbaea6338cad15ce23a67f9b0672929744381e28b0592676824/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:47cc91b2f4dd2833fddaedd2893006b0106129d4b94fdb6af1f4ce5a9965577c", size = 143680, upload-time = "2025-10-14T04:41:43.661Z" }, + { url = "https://files.pythonhosted.org/packages/7d/a5/bf6f13b772fbb2a90360eb620d52ed8f796f3c5caee8398c3b2eb7b1c60d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:82004af6c302b5d3ab2cfc4cc5f29db16123b1a8417f2e25f9066f91d4411090", size = 162045, 
upload-time = "2025-10-14T04:41:44.821Z" }, + { url = "https://files.pythonhosted.org/packages/df/c5/d1be898bf0dc3ef9030c3825e5d3b83f2c528d207d246cbabe245966808d/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:2b7d8f6c26245217bd2ad053761201e9f9680f8ce52f0fcd8d0755aeae5b2152", size = 149687, upload-time = "2025-10-14T04:41:46.442Z" }, + { url = "https://files.pythonhosted.org/packages/a5/42/90c1f7b9341eef50c8a1cb3f098ac43b0508413f33affd762855f67a410e/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:799a7a5e4fb2d5898c60b640fd4981d6a25f1c11790935a44ce38c54e985f828", size = 160014, upload-time = "2025-10-14T04:41:47.631Z" }, + { url = "https://files.pythonhosted.org/packages/76/be/4d3ee471e8145d12795ab655ece37baed0929462a86e72372fd25859047c/charset_normalizer-3.4.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:99ae2cffebb06e6c22bdc25801d7b30f503cc87dbd283479e7b606f70aff57ec", size = 154044, upload-time = "2025-10-14T04:41:48.81Z" }, + { url = "https://files.pythonhosted.org/packages/b0/6f/8f7af07237c34a1defe7defc565a9bc1807762f672c0fde711a4b22bf9c0/charset_normalizer-3.4.4-cp314-cp314-win32.whl", hash = "sha256:f9d332f8c2a2fcbffe1378594431458ddbef721c1769d78e2cbc06280d8155f9", size = 99940, upload-time = "2025-10-14T04:41:49.946Z" }, + { url = "https://files.pythonhosted.org/packages/4b/51/8ade005e5ca5b0d80fb4aff72a3775b325bdc3d27408c8113811a7cbe640/charset_normalizer-3.4.4-cp314-cp314-win_amd64.whl", hash = "sha256:8a6562c3700cce886c5be75ade4a5db4214fda19fede41d9792d100288d8f94c", size = 107104, upload-time = "2025-10-14T04:41:51.051Z" }, + { url = "https://files.pythonhosted.org/packages/da/5f/6b8f83a55bb8278772c5ae54a577f3099025f9ade59d0136ac24a0df4bde/charset_normalizer-3.4.4-cp314-cp314-win_arm64.whl", hash = "sha256:de00632ca48df9daf77a2c65a484531649261ec9f25489917f09e455cb09ddb2", size = 100743, upload-time = "2025-10-14T04:41:52.122Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/4c/925909008ed5a988ccbb72dcc897407e5d6d3bd72410d69e051fc0c14647/charset_normalizer-3.4.4-py3-none-any.whl", hash = "sha256:7a32c560861a02ff789ad905a2fe94e3f840803362c84fecf1851cb4cf3dc37f", size = 53402, upload-time = "2025-10-14T04:42:31.76Z" }, +] + +[[package]] +name = "click" +version = "8.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065, upload-time = "2025-11-15T20:45:42.706Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274, upload-time = "2025-11-15T20:45:41.139Z" }, +] + +[[package]] +name = "click-option-group" +version = "0.5.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/ff/d291d66595b30b83d1cb9e314b2c9be7cfc7327d4a0d40a15da2416ea97b/click_option_group-0.5.9.tar.gz", hash = "sha256:f94ed2bc4cf69052e0f29592bd1e771a1789bd7bfc482dd0bc482134aff95823", size = 22222, upload-time = "2025-10-09T09:38:01.474Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/75/45/54bb2d8d4138964a94bef6e9afe48b0be4705ba66ac442ae7d8a8dc4ffef/click_option_group-0.5.9-py3-none-any.whl", hash = "sha256:ad2599248bd373e2e19bec5407967c3eec1d0d4fc4a5e77b08a0481e75991080", size = 11553, upload-time = "2025-10-09T09:38:00.066Z" }, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, +] + +[[package]] +name = "cryptography" +version = "46.0.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/47/93/ac8f3d5ff04d54bc814e961a43ae5b0b146154c89c61b47bb07557679b18/cryptography-46.0.7.tar.gz", hash = "sha256:e4cfd68c5f3e0bfdad0d38e023239b96a2fe84146481852dffbcca442c245aa5", size = 750652, upload-time = "2026-04-08T01:57:54.692Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/5d/4a8f770695d73be252331e60e526291e3df0c9b27556a90a6b47bccca4c2/cryptography-46.0.7-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:ea42cbe97209df307fdc3b155f1b6fa2577c0defa8f1f7d3be7d31d189108ad4", size = 7179869, upload-time = "2026-04-08T01:56:17.157Z" }, + { url = "https://files.pythonhosted.org/packages/5f/45/6d80dc379b0bbc1f9d1e429f42e4cb9e1d319c7a8201beffd967c516ea01/cryptography-46.0.7-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b36a4695e29fe69215d75960b22577197aca3f7a25b9cf9d165dcfe9d80bc325", size = 4275492, upload-time = "2026-04-08T01:56:19.36Z" }, + { url = "https://files.pythonhosted.org/packages/4a/9a/1765afe9f572e239c3469f2cb429f3ba7b31878c893b246b4b2994ffe2fe/cryptography-46.0.7-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:5ad9ef796328c5e3c4ceed237a183f5d41d21150f972455a9d926593a1dcb308", size = 4426670, upload-time = "2026-04-08T01:56:21.415Z" }, + { url = "https://files.pythonhosted.org/packages/8f/3e/af9246aaf23cd4ee060699adab1e47ced3f5f7e7a8ffdd339f817b446462/cryptography-46.0.7-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:73510b83623e080a2c35c62c15298096e2a5dc8d51c3b4e1740211839d0dea77", size = 4280275, upload-time = "2026-04-08T01:56:23.539Z" }, + { url = "https://files.pythonhosted.org/packages/0f/54/6bbbfc5efe86f9d71041827b793c24811a017c6ac0fd12883e4caa86b8ed/cryptography-46.0.7-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:cbd5fb06b62bd0721e1170273d3f4d5a277044c47ca27ee257025146c34cbdd1", size = 4928402, upload-time = "2026-04-08T01:56:25.624Z" }, + { url = "https://files.pythonhosted.org/packages/2d/cf/054b9d8220f81509939599c8bdbc0c408dbd2bdd41688616a20731371fe0/cryptography-46.0.7-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:420b1e4109cc95f0e5700eed79908cef9268265c773d3a66f7af1eef53d409ef", size = 4459985, upload-time = "2026-04-08T01:56:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/f9/46/4e4e9c6040fb01c7467d47217d2f882daddeb8828f7df800cb806d8a2288/cryptography-46.0.7-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:24402210aa54baae71d99441d15bb5a1919c195398a87b563df84468160a65de", size = 3990652, upload-time = "2026-04-08T01:56:29.095Z" }, + { url = "https://files.pythonhosted.org/packages/36/5f/313586c3be5a2fbe87e4c9a254207b860155a8e1f3cca99f9910008e7d08/cryptography-46.0.7-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:8a469028a86f12eb7d2fe97162d0634026d92a21f3ae0ac87ed1c4a447886c83", size = 4279805, upload-time = "2026-04-08T01:56:30.928Z" }, + { url = "https://files.pythonhosted.org/packages/69/33/60dfc4595f334a2082749673386a4d05e4f0cf4df8248e63b2c3437585f2/cryptography-46.0.7-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:9694078c5d44c157ef3162e3bf3946510b857df5a3955458381d1c7cfc143ddb", size = 
4892883, upload-time = "2026-04-08T01:56:32.614Z" }, + { url = "https://files.pythonhosted.org/packages/c7/0b/333ddab4270c4f5b972f980adef4faa66951a4aaf646ca067af597f15563/cryptography-46.0.7-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:42a1e5f98abb6391717978baf9f90dc28a743b7d9be7f0751a6f56a75d14065b", size = 4459756, upload-time = "2026-04-08T01:56:34.306Z" }, + { url = "https://files.pythonhosted.org/packages/d2/14/633913398b43b75f1234834170947957c6b623d1701ffc7a9600da907e89/cryptography-46.0.7-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:91bbcb08347344f810cbe49065914fe048949648f6bd5c2519f34619142bbe85", size = 4410244, upload-time = "2026-04-08T01:56:35.977Z" }, + { url = "https://files.pythonhosted.org/packages/10/f2/19ceb3b3dc14009373432af0c13f46aa08e3ce334ec6eff13492e1812ccd/cryptography-46.0.7-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5d1c02a14ceb9148cc7816249f64f623fbfee39e8c03b3650d842ad3f34d637e", size = 4674868, upload-time = "2026-04-08T01:56:38.034Z" }, + { url = "https://files.pythonhosted.org/packages/1a/bb/a5c213c19ee94b15dfccc48f363738633a493812687f5567addbcbba9f6f/cryptography-46.0.7-cp311-abi3-win32.whl", hash = "sha256:d23c8ca48e44ee015cd0a54aeccdf9f09004eba9fc96f38c911011d9ff1bd457", size = 3026504, upload-time = "2026-04-08T01:56:39.666Z" }, + { url = "https://files.pythonhosted.org/packages/2b/02/7788f9fefa1d060ca68717c3901ae7fffa21ee087a90b7f23c7a603c32ae/cryptography-46.0.7-cp311-abi3-win_amd64.whl", hash = "sha256:397655da831414d165029da9bc483bed2fe0e75dde6a1523ec2fe63f3c46046b", size = 3488363, upload-time = "2026-04-08T01:56:41.893Z" }, + { url = "https://files.pythonhosted.org/packages/7b/56/15619b210e689c5403bb0540e4cb7dbf11a6bf42e483b7644e471a2812b3/cryptography-46.0.7-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:d151173275e1728cf7839aaa80c34fe550c04ddb27b34f48c232193df8db5842", size = 7119671, upload-time = "2026-04-08T01:56:44Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/66/e3ce040721b0b5599e175ba91ab08884c75928fbeb74597dd10ef13505d2/cryptography-46.0.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:db0f493b9181c7820c8134437eb8b0b4792085d37dbb24da050476ccb664e59c", size = 4268551, upload-time = "2026-04-08T01:56:46.071Z" }, + { url = "https://files.pythonhosted.org/packages/03/11/5e395f961d6868269835dee1bafec6a1ac176505a167f68b7d8818431068/cryptography-46.0.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ebd6daf519b9f189f85c479427bbd6e9c9037862cf8fe89ee35503bd209ed902", size = 4408887, upload-time = "2026-04-08T01:56:47.718Z" }, + { url = "https://files.pythonhosted.org/packages/40/53/8ed1cf4c3b9c8e611e7122fb56f1c32d09e1fff0f1d77e78d9ff7c82653e/cryptography-46.0.7-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:b7b412817be92117ec5ed95f880defe9cf18a832e8cafacf0a22337dc1981b4d", size = 4271354, upload-time = "2026-04-08T01:56:49.312Z" }, + { url = "https://files.pythonhosted.org/packages/50/46/cf71e26025c2e767c5609162c866a78e8a2915bbcfa408b7ca495c6140c4/cryptography-46.0.7-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:fbfd0e5f273877695cb93baf14b185f4878128b250cc9f8e617ea0c025dfb022", size = 4905845, upload-time = "2026-04-08T01:56:50.916Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ea/01276740375bac6249d0a971ebdf6b4dc9ead0ee0a34ef3b5a88c1a9b0d4/cryptography-46.0.7-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:ffca7aa1d00cf7d6469b988c581598f2259e46215e0140af408966a24cf086ce", size = 4444641, upload-time = "2026-04-08T01:56:52.882Z" }, + { url = "https://files.pythonhosted.org/packages/3d/4c/7d258f169ae71230f25d9f3d06caabcff8c3baf0978e2b7d65e0acac3827/cryptography-46.0.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:60627cf07e0d9274338521205899337c5d18249db56865f943cbe753aa96f40f", size = 3967749, upload-time = "2026-04-08T01:56:54.597Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/2a/2ea0767cad19e71b3530e4cad9605d0b5e338b6a1e72c37c9c1ceb86c333/cryptography-46.0.7-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:80406c3065e2c55d7f49a9550fe0c49b3f12e5bfff5dedb727e319e1afb9bf99", size = 4270942, upload-time = "2026-04-08T01:56:56.416Z" }, + { url = "https://files.pythonhosted.org/packages/41/3d/fe14df95a83319af25717677e956567a105bb6ab25641acaa093db79975d/cryptography-46.0.7-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:c5b1ccd1239f48b7151a65bc6dd54bcfcc15e028c8ac126d3fada09db0e07ef1", size = 4871079, upload-time = "2026-04-08T01:56:58.31Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/4a479e0f36f8f378d397f4eab4c850b4ffb79a2f0d58704b8fa0703ddc11/cryptography-46.0.7-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:d5f7520159cd9c2154eb61eb67548ca05c5774d39e9c2c4339fd793fe7d097b2", size = 4443999, upload-time = "2026-04-08T01:57:00.508Z" }, + { url = "https://files.pythonhosted.org/packages/28/17/b59a741645822ec6d04732b43c5d35e4ef58be7bfa84a81e5ae6f05a1d33/cryptography-46.0.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fcd8eac50d9138c1d7fc53a653ba60a2bee81a505f9f8850b6b2888555a45d0e", size = 4399191, upload-time = "2026-04-08T01:57:02.654Z" }, + { url = "https://files.pythonhosted.org/packages/59/6a/bb2e166d6d0e0955f1e9ff70f10ec4b2824c9cfcdb4da772c7dd69cc7d80/cryptography-46.0.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:65814c60f8cc400c63131584e3e1fad01235edba2614b61fbfbfa954082db0ee", size = 4655782, upload-time = "2026-04-08T01:57:04.592Z" }, + { url = "https://files.pythonhosted.org/packages/95/b6/3da51d48415bcb63b00dc17c2eff3a651b7c4fed484308d0f19b30e8cb2c/cryptography-46.0.7-cp314-cp314t-win32.whl", hash = "sha256:fdd1736fed309b4300346f88f74cd120c27c56852c3838cab416e7a166f67298", size = 3002227, upload-time = "2026-04-08T01:57:06.91Z" }, + { url = 
"https://files.pythonhosted.org/packages/32/a8/9f0e4ed57ec9cebe506e58db11ae472972ecb0c659e4d52bbaee80ca340a/cryptography-46.0.7-cp314-cp314t-win_amd64.whl", hash = "sha256:e06acf3c99be55aa3b516397fe42f5855597f430add9c17fa46bf2e0fb34c9bb", size = 3475332, upload-time = "2026-04-08T01:57:08.807Z" }, + { url = "https://files.pythonhosted.org/packages/a7/7f/cd42fc3614386bc0c12f0cb3c4ae1fc2bbca5c9662dfed031514911d513d/cryptography-46.0.7-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:462ad5cb1c148a22b2e3bcc5ad52504dff325d17daf5df8d88c17dda1f75f2a4", size = 7165618, upload-time = "2026-04-08T01:57:10.645Z" }, + { url = "https://files.pythonhosted.org/packages/a5/d0/36a49f0262d2319139d2829f773f1b97ef8aef7f97e6e5bd21455e5a8fb5/cryptography-46.0.7-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:84d4cced91f0f159a7ddacad249cc077e63195c36aac40b4150e7a57e84fffe7", size = 4270628, upload-time = "2026-04-08T01:57:12.885Z" }, + { url = "https://files.pythonhosted.org/packages/8a/6c/1a42450f464dda6ffbe578a911f773e54dd48c10f9895a23a7e88b3e7db5/cryptography-46.0.7-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:128c5edfe5e5938b86b03941e94fac9ee793a94452ad1365c9fc3f4f62216832", size = 4415405, upload-time = "2026-04-08T01:57:14.923Z" }, + { url = "https://files.pythonhosted.org/packages/9a/92/4ed714dbe93a066dc1f4b4581a464d2d7dbec9046f7c8b7016f5286329e2/cryptography-46.0.7-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5e51be372b26ef4ba3de3c167cd3d1022934bc838ae9eaad7e644986d2a3d163", size = 4272715, upload-time = "2026-04-08T01:57:16.638Z" }, + { url = "https://files.pythonhosted.org/packages/b7/e6/a26b84096eddd51494bba19111f8fffe976f6a09f132706f8f1bf03f51f7/cryptography-46.0.7-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:cdf1a610ef82abb396451862739e3fc93b071c844399e15b90726ef7470eeaf2", size = 4918400, upload-time = "2026-04-08T01:57:19.021Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/08/ffd537b605568a148543ac3c2b239708ae0bd635064bab41359252ef88ed/cryptography-46.0.7-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1d25aee46d0c6f1a501adcddb2d2fee4b979381346a78558ed13e50aa8a59067", size = 4450634, upload-time = "2026-04-08T01:57:21.185Z" }, + { url = "https://files.pythonhosted.org/packages/16/01/0cd51dd86ab5b9befe0d031e276510491976c3a80e9f6e31810cce46c4ad/cryptography-46.0.7-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:cdfbe22376065ffcf8be74dc9a909f032df19bc58a699456a21712d6e5eabfd0", size = 3985233, upload-time = "2026-04-08T01:57:22.862Z" }, + { url = "https://files.pythonhosted.org/packages/92/49/819d6ed3a7d9349c2939f81b500a738cb733ab62fbecdbc1e38e83d45e12/cryptography-46.0.7-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:abad9dac36cbf55de6eb49badd4016806b3165d396f64925bf2999bcb67837ba", size = 4271955, upload-time = "2026-04-08T01:57:24.814Z" }, + { url = "https://files.pythonhosted.org/packages/80/07/ad9b3c56ebb95ed2473d46df0847357e01583f4c52a85754d1a55e29e4d0/cryptography-46.0.7-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:935ce7e3cfdb53e3536119a542b839bb94ec1ad081013e9ab9b7cfd478b05006", size = 4879888, upload-time = "2026-04-08T01:57:26.88Z" }, + { url = "https://files.pythonhosted.org/packages/b8/c7/201d3d58f30c4c2bdbe9b03844c291feb77c20511cc3586daf7edc12a47b/cryptography-46.0.7-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:35719dc79d4730d30f1c2b6474bd6acda36ae2dfae1e3c16f2051f215df33ce0", size = 4449961, upload-time = "2026-04-08T01:57:29.068Z" }, + { url = "https://files.pythonhosted.org/packages/a5/ef/649750cbf96f3033c3c976e112265c33906f8e462291a33d77f90356548c/cryptography-46.0.7-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7bbc6ccf49d05ac8f7d7b5e2e2c33830d4fe2061def88210a126d130d7f71a85", size = 4401696, upload-time = "2026-04-08T01:57:31.029Z" }, + { url = 
"https://files.pythonhosted.org/packages/41/52/a8908dcb1a389a459a29008c29966c1d552588d4ae6d43f3a1a4512e0ebe/cryptography-46.0.7-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a1529d614f44b863a7b480c6d000fe93b59acee9c82ffa027cfadc77521a9f5e", size = 4664256, upload-time = "2026-04-08T01:57:33.144Z" }, + { url = "https://files.pythonhosted.org/packages/4b/fa/f0ab06238e899cc3fb332623f337a7364f36f4bb3f2534c2bb95a35b132c/cryptography-46.0.7-cp38-abi3-win32.whl", hash = "sha256:f247c8c1a1fb45e12586afbb436ef21ff1e80670b2861a90353d9b025583d246", size = 3013001, upload-time = "2026-04-08T01:57:34.933Z" }, + { url = "https://files.pythonhosted.org/packages/d2/f1/00ce3bde3ca542d1acd8f8cfa38e446840945aa6363f9b74746394b14127/cryptography-46.0.7-cp38-abi3-win_amd64.whl", hash = "sha256:506c4ff91eff4f82bdac7633318a526b1d1309fc07ca76a3ad182cb5b686d6d3", size = 3472985, upload-time = "2026-04-08T01:57:36.714Z" }, +] + +[[package]] +name = "defusedxml" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/d5/c66da9b79e5bdb124974bfe172b4daf3c984ebd9c2a06e2b8a4dc7331c72/defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69", size = 75520, upload-time = "2021-03-08T10:59:26.269Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/6c/aa3f2f849e01cb6a001cd8554a88d4c77c5c1a31c95bdf1cf9301e6d9ef4/defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61", size = 25604, upload-time = "2021-03-08T10:59:24.45Z" }, +] + +[[package]] +name = "dill" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/81/e1/56027a71e31b02ddc53c7d65b01e68edf64dea2932122fe7746a516f75d5/dill-0.4.1.tar.gz", hash = "sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa", size = 187315, upload-time = 
"2026-01-19T02:36:56.85Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/77/dc8c558f7593132cf8fefec57c4f60c83b16941c574ac5f619abb3ae7933/dill-0.4.1-py3-none-any.whl", hash = "sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d", size = 120019, upload-time = "2026-01-19T02:36:55.663Z" }, +] + +[[package]] +name = "docstring-parser" +version = "0.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" }, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/09/35/2495c4ac46b980e4ca1f6ad6db102322ef3ad2410b79fdde159a4b0f3b92/exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc", size = 28883, upload-time = "2024-07-12T22:26:00.161Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/02/cc/b7e31358aac6ed1ef2bb790a9746ac2c69bcb3c8588b41616914eb106eaf/exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b", size = 16453, upload-time = "2024-07-12T22:25:58.476Z" }, +] + +[[package]] +name = "face" +version = "26.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "boltons" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/24/4e/0e106b0ba486cc38c858fb5efe899002f2ec4765e0808b298d8e19a16efb/face-26.0.0.tar.gz", hash = "sha256:ae12136ff0052f124811f5319670a8d9d29b7d2caaaabe542813690967cc6bca", size = 49862, upload-time = "2026-02-14T00:17:12.576Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/63/1d/c2f7a4334f7501a3474766b5bc0948e8e0b0916217a54d092dd700a5ed3c/face-26.0.0-py3-none-any.whl", hash = "sha256:6ec9cf271d8ee2447f04b14264209a09ec9cbe8252255e61fb7ab6b154e300f9", size = 54825, upload-time = "2026-02-14T00:17:11.519Z" }, +] + +[[package]] +name = "frozenlist" +version = "1.8.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2d/40/0832c31a37d60f60ed79e9dfb5a92e1e2af4f40a16a29abcc7992af9edff/frozenlist-1.8.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8d92f1a84bb12d9e56f818b3a746f3efba93c1b63c8387a73dde655e1e42282a", size = 85717, upload-time = "2025-10-06T05:36:27.341Z" }, + { url = "https://files.pythonhosted.org/packages/30/ba/b0b3de23f40bc55a7057bd38434e25c34fa48e17f20ee273bbde5e0650f3/frozenlist-1.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96153e77a591c8adc2ee805756c61f59fef4cf4073a9275ee86fe8cba41241f7", size = 49651, upload-time = "2025-10-06T05:36:28.855Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ab/6e5080ee374f875296c4243c381bbdef97a9ac39c6e3ce1d5f7d42cb78d6/frozenlist-1.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f21f00a91358803399890ab167098c131ec2ddd5f8f5fd5fe9c9f2c6fcd91e40", size = 49417, upload-time = "2025-10-06T05:36:29.877Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/4e/e4691508f9477ce67da2015d8c00acd751e6287739123113a9fca6f1604e/frozenlist-1.8.0-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fb30f9626572a76dfe4293c7194a09fb1fe93ba94c7d4f720dfae3b646b45027", size = 234391, upload-time = "2025-10-06T05:36:31.301Z" }, + { url = "https://files.pythonhosted.org/packages/40/76/c202df58e3acdf12969a7895fd6f3bc016c642e6726aa63bd3025e0fc71c/frozenlist-1.8.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eaa352d7047a31d87dafcacbabe89df0aa506abb5b1b85a2fb91bc3faa02d822", size = 233048, upload-time = "2025-10-06T05:36:32.531Z" }, + { url = "https://files.pythonhosted.org/packages/f9/c0/8746afb90f17b73ca5979c7a3958116e105ff796e718575175319b5bb4ce/frozenlist-1.8.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:03ae967b4e297f58f8c774c7eabcce57fe3c2434817d4385c50661845a058121", size = 226549, upload-time = "2025-10-06T05:36:33.706Z" }, + { url = "https://files.pythonhosted.org/packages/7e/eb/4c7eefc718ff72f9b6c4893291abaae5fbc0c82226a32dcd8ef4f7a5dbef/frozenlist-1.8.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f6292f1de555ffcc675941d65fffffb0a5bcd992905015f85d0592201793e0e5", size = 239833, upload-time = "2025-10-06T05:36:34.947Z" }, + { url = "https://files.pythonhosted.org/packages/c2/4e/e5c02187cf704224f8b21bee886f3d713ca379535f16893233b9d672ea71/frozenlist-1.8.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:29548f9b5b5e3460ce7378144c3010363d8035cea44bc0bf02d57f5a685e084e", size = 245363, upload-time = "2025-10-06T05:36:36.534Z" }, + { url = "https://files.pythonhosted.org/packages/1f/96/cb85ec608464472e82ad37a17f844889c36100eed57bea094518bf270692/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = 
"sha256:ec3cc8c5d4084591b4237c0a272cc4f50a5b03396a47d9caaf76f5d7b38a4f11", size = 229314, upload-time = "2025-10-06T05:36:38.582Z" }, + { url = "https://files.pythonhosted.org/packages/5d/6f/4ae69c550e4cee66b57887daeebe006fe985917c01d0fff9caab9883f6d0/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:517279f58009d0b1f2e7c1b130b377a349405da3f7621ed6bfae50b10adf20c1", size = 243365, upload-time = "2025-10-06T05:36:40.152Z" }, + { url = "https://files.pythonhosted.org/packages/7a/58/afd56de246cf11780a40a2c28dc7cbabbf06337cc8ddb1c780a2d97e88d8/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:db1e72ede2d0d7ccb213f218df6a078a9c09a7de257c2fe8fcef16d5925230b1", size = 237763, upload-time = "2025-10-06T05:36:41.355Z" }, + { url = "https://files.pythonhosted.org/packages/cb/36/cdfaf6ed42e2644740d4a10452d8e97fa1c062e2a8006e4b09f1b5fd7d63/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:b4dec9482a65c54a5044486847b8a66bf10c9cb4926d42927ec4e8fd5db7fed8", size = 240110, upload-time = "2025-10-06T05:36:42.716Z" }, + { url = "https://files.pythonhosted.org/packages/03/a8/9ea226fbefad669f11b52e864c55f0bd57d3c8d7eb07e9f2e9a0b39502e1/frozenlist-1.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:21900c48ae04d13d416f0e1e0c4d81f7931f73a9dfa0b7a8746fb2fe7dd970ed", size = 233717, upload-time = "2025-10-06T05:36:44.251Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0b/1b5531611e83ba7d13ccc9988967ea1b51186af64c42b7a7af465dcc9568/frozenlist-1.8.0-cp313-cp313-win32.whl", hash = "sha256:8b7b94a067d1c504ee0b16def57ad5738701e4ba10cec90529f13fa03c833496", size = 39628, upload-time = "2025-10-06T05:36:45.423Z" }, + { url = "https://files.pythonhosted.org/packages/d8/cf/174c91dbc9cc49bc7b7aab74d8b734e974d1faa8f191c74af9b7e80848e6/frozenlist-1.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:878be833caa6a3821caf85eb39c5ba92d28e85df26d57afb06b35b2efd937231", size = 43882, upload-time = "2025-10-06T05:36:46.796Z" }, + { 
url = "https://files.pythonhosted.org/packages/c1/17/502cd212cbfa96eb1388614fe39a3fc9ab87dbbe042b66f97acb57474834/frozenlist-1.8.0-cp313-cp313-win_arm64.whl", hash = "sha256:44389d135b3ff43ba8cc89ff7f51f5a0bb6b63d829c8300f79a2fe4fe61bcc62", size = 39676, upload-time = "2025-10-06T05:36:47.8Z" }, + { url = "https://files.pythonhosted.org/packages/d2/5c/3bbfaa920dfab09e76946a5d2833a7cbdf7b9b4a91c714666ac4855b88b4/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:e25ac20a2ef37e91c1b39938b591457666a0fa835c7783c3a8f33ea42870db94", size = 89235, upload-time = "2025-10-06T05:36:48.78Z" }, + { url = "https://files.pythonhosted.org/packages/d2/d6/f03961ef72166cec1687e84e8925838442b615bd0b8854b54923ce5b7b8a/frozenlist-1.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:07cdca25a91a4386d2e76ad992916a85038a9b97561bf7a3fd12d5d9ce31870c", size = 50742, upload-time = "2025-10-06T05:36:49.837Z" }, + { url = "https://files.pythonhosted.org/packages/1e/bb/a6d12b7ba4c3337667d0e421f7181c82dda448ce4e7ad7ecd249a16fa806/frozenlist-1.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:4e0c11f2cc6717e0a741f84a527c52616140741cd812a50422f83dc31749fb52", size = 51725, upload-time = "2025-10-06T05:36:50.851Z" }, + { url = "https://files.pythonhosted.org/packages/bc/71/d1fed0ffe2c2ccd70b43714c6cab0f4188f09f8a67a7914a6b46ee30f274/frozenlist-1.8.0-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b3210649ee28062ea6099cfda39e147fa1bc039583c8ee4481cb7811e2448c51", size = 284533, upload-time = "2025-10-06T05:36:51.898Z" }, + { url = "https://files.pythonhosted.org/packages/c9/1f/fb1685a7b009d89f9bf78a42d94461bc06581f6e718c39344754a5d9bada/frozenlist-1.8.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:581ef5194c48035a7de2aefc72ac6539823bb71508189e5de01d60c9dcd5fa65", size = 292506, upload-time = "2025-10-06T05:36:53.101Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/3b/b991fe1612703f7e0d05c0cf734c1b77aaf7c7d321df4572e8d36e7048c8/frozenlist-1.8.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:3ef2d026f16a2b1866e1d86fc4e1291e1ed8a387b2c333809419a2f8b3a77b82", size = 274161, upload-time = "2025-10-06T05:36:54.309Z" }, + { url = "https://files.pythonhosted.org/packages/ca/ec/c5c618767bcdf66e88945ec0157d7f6c4a1322f1473392319b7a2501ded7/frozenlist-1.8.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5500ef82073f599ac84d888e3a8c1f77ac831183244bfd7f11eaa0289fb30714", size = 294676, upload-time = "2025-10-06T05:36:55.566Z" }, + { url = "https://files.pythonhosted.org/packages/7c/ce/3934758637d8f8a88d11f0585d6495ef54b2044ed6ec84492a91fa3b27aa/frozenlist-1.8.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:50066c3997d0091c411a66e710f4e11752251e6d2d73d70d8d5d4c76442a199d", size = 300638, upload-time = "2025-10-06T05:36:56.758Z" }, + { url = "https://files.pythonhosted.org/packages/fc/4f/a7e4d0d467298f42de4b41cbc7ddaf19d3cfeabaf9ff97c20c6c7ee409f9/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:5c1c8e78426e59b3f8005e9b19f6ff46e5845895adbde20ece9218319eca6506", size = 283067, upload-time = "2025-10-06T05:36:57.965Z" }, + { url = "https://files.pythonhosted.org/packages/dc/48/c7b163063d55a83772b268e6d1affb960771b0e203b632cfe09522d67ea5/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:eefdba20de0d938cec6a89bd4d70f346a03108a19b9df4248d3cf0d88f1b0f51", size = 292101, upload-time = "2025-10-06T05:36:59.237Z" }, + { url = "https://files.pythonhosted.org/packages/9f/d0/2366d3c4ecdc2fd391e0afa6e11500bfba0ea772764d631bbf82f0136c9d/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cf253e0e1c3ceb4aaff6df637ce033ff6535fb8c70a764a8f46aafd3d6ab798e", size = 289901, upload-time = 
"2025-10-06T05:37:00.811Z" }, + { url = "https://files.pythonhosted.org/packages/b8/94/daff920e82c1b70e3618a2ac39fbc01ae3e2ff6124e80739ce5d71c9b920/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:032efa2674356903cd0261c4317a561a6850f3ac864a63fc1583147fb05a79b0", size = 289395, upload-time = "2025-10-06T05:37:02.115Z" }, + { url = "https://files.pythonhosted.org/packages/e3/20/bba307ab4235a09fdcd3cc5508dbabd17c4634a1af4b96e0f69bfe551ebd/frozenlist-1.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6da155091429aeba16851ecb10a9104a108bcd32f6c1642867eadaee401c1c41", size = 283659, upload-time = "2025-10-06T05:37:03.711Z" }, + { url = "https://files.pythonhosted.org/packages/fd/00/04ca1c3a7a124b6de4f8a9a17cc2fcad138b4608e7a3fc5877804b8715d7/frozenlist-1.8.0-cp313-cp313t-win32.whl", hash = "sha256:0f96534f8bfebc1a394209427d0f8a63d343c9779cda6fc25e8e121b5fd8555b", size = 43492, upload-time = "2025-10-06T05:37:04.915Z" }, + { url = "https://files.pythonhosted.org/packages/59/5e/c69f733a86a94ab10f68e496dc6b7e8bc078ebb415281d5698313e3af3a1/frozenlist-1.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:5d63a068f978fc69421fb0e6eb91a9603187527c86b7cd3f534a5b77a592b888", size = 48034, upload-time = "2025-10-06T05:37:06.343Z" }, + { url = "https://files.pythonhosted.org/packages/16/6c/be9d79775d8abe79b05fa6d23da99ad6e7763a1d080fbae7290b286093fd/frozenlist-1.8.0-cp313-cp313t-win_arm64.whl", hash = "sha256:bf0a7e10b077bf5fb9380ad3ae8ce20ef919a6ad93b4552896419ac7e1d8e042", size = 41749, upload-time = "2025-10-06T05:37:07.431Z" }, + { url = "https://files.pythonhosted.org/packages/f1/c8/85da824b7e7b9b6e7f7705b2ecaf9591ba6f79c1177f324c2735e41d36a2/frozenlist-1.8.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cee686f1f4cadeb2136007ddedd0aaf928ab95216e7691c63e50a8ec066336d0", size = 86127, upload-time = "2025-10-06T05:37:08.438Z" }, + { url = 
"https://files.pythonhosted.org/packages/8e/e8/a1185e236ec66c20afd72399522f142c3724c785789255202d27ae992818/frozenlist-1.8.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:119fb2a1bd47307e899c2fac7f28e85b9a543864df47aa7ec9d3c1b4545f096f", size = 49698, upload-time = "2025-10-06T05:37:09.48Z" }, + { url = "https://files.pythonhosted.org/packages/a1/93/72b1736d68f03fda5fdf0f2180fb6caaae3894f1b854d006ac61ecc727ee/frozenlist-1.8.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:4970ece02dbc8c3a92fcc5228e36a3e933a01a999f7094ff7c23fbd2beeaa67c", size = 49749, upload-time = "2025-10-06T05:37:10.569Z" }, + { url = "https://files.pythonhosted.org/packages/a7/b2/fabede9fafd976b991e9f1b9c8c873ed86f202889b864756f240ce6dd855/frozenlist-1.8.0-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:cba69cb73723c3f329622e34bdbf5ce1f80c21c290ff04256cff1cd3c2036ed2", size = 231298, upload-time = "2025-10-06T05:37:11.993Z" }, + { url = "https://files.pythonhosted.org/packages/3a/3b/d9b1e0b0eed36e70477ffb8360c49c85c8ca8ef9700a4e6711f39a6e8b45/frozenlist-1.8.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:778a11b15673f6f1df23d9586f83c4846c471a8af693a22e066508b77d201ec8", size = 232015, upload-time = "2025-10-06T05:37:13.194Z" }, + { url = "https://files.pythonhosted.org/packages/dc/94/be719d2766c1138148564a3960fc2c06eb688da592bdc25adcf856101be7/frozenlist-1.8.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0325024fe97f94c41c08872db482cf8ac4800d80e79222c6b0b7b162d5b13686", size = 225038, upload-time = "2025-10-06T05:37:14.577Z" }, + { url = "https://files.pythonhosted.org/packages/e4/09/6712b6c5465f083f52f50cf74167b92d4ea2f50e46a9eea0523d658454ae/frozenlist-1.8.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:97260ff46b207a82a7567b581ab4190bd4dfa09f4db8a8b49d1a958f6aa4940e", size = 
240130, upload-time = "2025-10-06T05:37:15.781Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d4/cd065cdcf21550b54f3ce6a22e143ac9e4836ca42a0de1022da8498eac89/frozenlist-1.8.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:54b2077180eb7f83dd52c40b2750d0a9f175e06a42e3213ce047219de902717a", size = 242845, upload-time = "2025-10-06T05:37:17.037Z" }, + { url = "https://files.pythonhosted.org/packages/62/c3/f57a5c8c70cd1ead3d5d5f776f89d33110b1addae0ab010ad774d9a44fb9/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2f05983daecab868a31e1da44462873306d3cbfd76d1f0b5b69c473d21dbb128", size = 229131, upload-time = "2025-10-06T05:37:18.221Z" }, + { url = "https://files.pythonhosted.org/packages/6c/52/232476fe9cb64f0742f3fde2b7d26c1dac18b6d62071c74d4ded55e0ef94/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:33f48f51a446114bc5d251fb2954ab0164d5be02ad3382abcbfe07e2531d650f", size = 240542, upload-time = "2025-10-06T05:37:19.771Z" }, + { url = "https://files.pythonhosted.org/packages/5f/85/07bf3f5d0fb5414aee5f47d33c6f5c77bfe49aac680bfece33d4fdf6a246/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:154e55ec0655291b5dd1b8731c637ecdb50975a2ae70c606d100750a540082f7", size = 237308, upload-time = "2025-10-06T05:37:20.969Z" }, + { url = "https://files.pythonhosted.org/packages/11/99/ae3a33d5befd41ac0ca2cc7fd3aa707c9c324de2e89db0e0f45db9a64c26/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:4314debad13beb564b708b4a496020e5306c7333fa9a3ab90374169a20ffab30", size = 238210, upload-time = "2025-10-06T05:37:22.252Z" }, + { url = "https://files.pythonhosted.org/packages/b2/60/b1d2da22f4970e7a155f0adde9b1435712ece01b3cd45ba63702aea33938/frozenlist-1.8.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:073f8bf8becba60aa931eb3bc420b217bb7d5b8f4750e6f8b3be7f3da85d38b7", size = 231972, upload-time = "2025-10-06T05:37:23.5Z" }, + { url = 
"https://files.pythonhosted.org/packages/3f/ab/945b2f32de889993b9c9133216c068b7fcf257d8595a0ac420ac8677cab0/frozenlist-1.8.0-cp314-cp314-win32.whl", hash = "sha256:bac9c42ba2ac65ddc115d930c78d24ab8d4f465fd3fc473cdedfccadb9429806", size = 40536, upload-time = "2025-10-06T05:37:25.581Z" }, + { url = "https://files.pythonhosted.org/packages/59/ad/9caa9b9c836d9ad6f067157a531ac48b7d36499f5036d4141ce78c230b1b/frozenlist-1.8.0-cp314-cp314-win_amd64.whl", hash = "sha256:3e0761f4d1a44f1d1a47996511752cf3dcec5bbdd9cc2b4fe595caf97754b7a0", size = 44330, upload-time = "2025-10-06T05:37:26.928Z" }, + { url = "https://files.pythonhosted.org/packages/82/13/e6950121764f2676f43534c555249f57030150260aee9dcf7d64efda11dd/frozenlist-1.8.0-cp314-cp314-win_arm64.whl", hash = "sha256:d1eaff1d00c7751b7c6662e9c5ba6eb2c17a2306ba5e2a37f24ddf3cc953402b", size = 40627, upload-time = "2025-10-06T05:37:28.075Z" }, + { url = "https://files.pythonhosted.org/packages/c0/c7/43200656ecc4e02d3f8bc248df68256cd9572b3f0017f0a0c4e93440ae23/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d3bb933317c52d7ea5004a1c442eef86f426886fba134ef8cf4226ea6ee1821d", size = 89238, upload-time = "2025-10-06T05:37:29.373Z" }, + { url = "https://files.pythonhosted.org/packages/d1/29/55c5f0689b9c0fb765055629f472c0de484dcaf0acee2f7707266ae3583c/frozenlist-1.8.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8009897cdef112072f93a0efdce29cd819e717fd2f649ee3016efd3cd885a7ed", size = 50738, upload-time = "2025-10-06T05:37:30.792Z" }, + { url = "https://files.pythonhosted.org/packages/ba/7d/b7282a445956506fa11da8c2db7d276adcbf2b17d8bb8407a47685263f90/frozenlist-1.8.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2c5dcbbc55383e5883246d11fd179782a9d07a986c40f49abe89ddf865913930", size = 51739, upload-time = "2025-10-06T05:37:32.127Z" }, + { url = 
"https://files.pythonhosted.org/packages/62/1c/3d8622e60d0b767a5510d1d3cf21065b9db874696a51ea6d7a43180a259c/frozenlist-1.8.0-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:39ecbc32f1390387d2aa4f5a995e465e9e2f79ba3adcac92d68e3e0afae6657c", size = 284186, upload-time = "2025-10-06T05:37:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/2d/14/aa36d5f85a89679a85a1d44cd7a6657e0b1c75f61e7cad987b203d2daca8/frozenlist-1.8.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92db2bf818d5cc8d9c1f1fc56b897662e24ea5adb36ad1f1d82875bd64e03c24", size = 292196, upload-time = "2025-10-06T05:37:36.107Z" }, + { url = "https://files.pythonhosted.org/packages/05/23/6bde59eb55abd407d34f77d39a5126fb7b4f109a3f611d3929f14b700c66/frozenlist-1.8.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2dc43a022e555de94c3b68a4ef0b11c4f747d12c024a520c7101709a2144fb37", size = 273830, upload-time = "2025-10-06T05:37:37.663Z" }, + { url = "https://files.pythonhosted.org/packages/d2/3f/22cff331bfad7a8afa616289000ba793347fcd7bc275f3b28ecea2a27909/frozenlist-1.8.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb89a7f2de3602cfed448095bab3f178399646ab7c61454315089787df07733a", size = 294289, upload-time = "2025-10-06T05:37:39.261Z" }, + { url = "https://files.pythonhosted.org/packages/a4/89/5b057c799de4838b6c69aa82b79705f2027615e01be996d2486a69ca99c4/frozenlist-1.8.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:33139dc858c580ea50e7e60a1b0ea003efa1fd42e6ec7fdbad78fff65fad2fd2", size = 300318, upload-time = "2025-10-06T05:37:43.213Z" }, + { url = "https://files.pythonhosted.org/packages/30/de/2c22ab3eb2a8af6d69dc799e48455813bab3690c760de58e1bf43b36da3e/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:168c0969a329b416119507ba30b9ea13688fafffac1b7822802537569a1cb0ef", size = 282814, upload-time = "2025-10-06T05:37:45.337Z" }, + { url = "https://files.pythonhosted.org/packages/59/f7/970141a6a8dbd7f556d94977858cfb36fa9b66e0892c6dd780d2219d8cd8/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:28bd570e8e189d7f7b001966435f9dac6718324b5be2990ac496cf1ea9ddb7fe", size = 291762, upload-time = "2025-10-06T05:37:46.657Z" }, + { url = "https://files.pythonhosted.org/packages/c1/15/ca1adae83a719f82df9116d66f5bb28bb95557b3951903d39135620ef157/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b2a095d45c5d46e5e79ba1e5b9cb787f541a8dee0433836cea4b96a2c439dcd8", size = 289470, upload-time = "2025-10-06T05:37:47.946Z" }, + { url = "https://files.pythonhosted.org/packages/ac/83/dca6dc53bf657d371fbc88ddeb21b79891e747189c5de990b9dfff2ccba1/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:eab8145831a0d56ec9c4139b6c3e594c7a83c2c8be25d5bcf2d86136a532287a", size = 289042, upload-time = "2025-10-06T05:37:49.499Z" }, + { url = "https://files.pythonhosted.org/packages/96/52/abddd34ca99be142f354398700536c5bd315880ed0a213812bc491cff5e4/frozenlist-1.8.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:974b28cf63cc99dfb2188d8d222bc6843656188164848c4f679e63dae4b0708e", size = 283148, upload-time = "2025-10-06T05:37:50.745Z" }, + { url = "https://files.pythonhosted.org/packages/af/d3/76bd4ed4317e7119c2b7f57c3f6934aba26d277acc6309f873341640e21f/frozenlist-1.8.0-cp314-cp314t-win32.whl", hash = "sha256:342c97bf697ac5480c0a7ec73cd700ecfa5a8a40ac923bd035484616efecc2df", size = 44676, upload-time = "2025-10-06T05:37:52.222Z" }, + { url = "https://files.pythonhosted.org/packages/89/76/c615883b7b521ead2944bb3480398cbb07e12b7b4e4d073d3752eb721558/frozenlist-1.8.0-cp314-cp314t-win_amd64.whl", hash = "sha256:06be8f67f39c8b1dc671f5d83aaefd3358ae5cdcf8314552c57e7ed3e6475bdd", size = 49451, upload-time = "2025-10-06T05:37:53.425Z" 
}, + { url = "https://files.pythonhosted.org/packages/e0/a3/5982da14e113d07b325230f95060e2169f5311b1017ea8af2a29b374c289/frozenlist-1.8.0-cp314-cp314t-win_arm64.whl", hash = "sha256:102e6314ca4da683dca92e3b1355490fed5f313b768500084fbe6371fddfdb79", size = 42507, upload-time = "2025-10-06T05:37:54.513Z" }, + { url = "https://files.pythonhosted.org/packages/9a/9a/e35b4a917281c0b8419d4207f4334c8e8c5dbf4f3f5f9ada73958d937dcc/frozenlist-1.8.0-py3-none-any.whl", hash = "sha256:0c18a16eab41e82c295618a77502e17b195883241c563b00f0aa5106fc4eaa0d", size = 13409, upload-time = "2025-10-06T05:38:16.721Z" }, +] + +[[package]] +name = "glom" +version = "22.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "boltons" }, + { name = "face" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/3f/d1/69432deefa6f5283ec75b246d0540097ae26f618b915519ee3824c4c5dd6/glom-22.1.0.tar.gz", hash = "sha256:1510c6587a8f9c64a246641b70033cbc5ebde99f02ad245693678038e821aeb5", size = 189738, upload-time = "2022-01-24T09:34:04.874Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/e8/68e274b2a30e1fdfd25bdc27194382be3f233929c8f727c0440d58ac074f/glom-22.1.0-py2.py3-none-any.whl", hash = "sha256:5339da206bf3532e01a83a35aca202960ea885156986d190574b779598e9e772", size = 100687, upload-time = "2022-01-24T09:34:02.391Z" }, +] + +[[package]] +name = "h11" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = 
"sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, +] + +[[package]] +name = "httpcore" +version = "1.0.9" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "h11" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + +[[package]] +name = "httpx-sse" +version = "0.4.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = 
"sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload-time = "2025-10-10T21:48:22.271Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload-time = "2025-10-10T21:48:21.158Z" }, +] + +[[package]] +name = "idna" +version = "3.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, +] + +[[package]] +name = "importlib-metadata" +version = "8.7.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "zipp" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865, upload-time = "2025-12-21T10:00:18.329Z" }, +] + +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } 
+sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + +[[package]] +name = "jmespath" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, +] + +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", 
hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + +[[package]] +name = "markdown-it-py" +version = "4.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mdurl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/54/e7d793b573f298e1c9013b8c4dade17d481164aa517d1d7148619c2cedbf/markdown_it_py-4.0.0-py3-none-any.whl", hash = "sha256:87327c59b172c5011896038353a81343b6754500a08cd7a4973bb48c6d578147", size = 87321, upload-time = "2025-08-11T12:57:51.923Z" }, +] + +[[package]] +name = "markdownify" +version = "1.2.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "beautifulsoup4" }, + { name = "six" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/3f/bc/c8c8eea5335341306b0fa7e1cb33c5e1c8d24ef70ddd684da65f41c49c92/markdownify-1.2.2.tar.gz", hash = "sha256:b274f1b5943180b031b699b199cbaeb1e2ac938b75851849a31fd0c3d6603d09", size = 18816, upload-time = "2025-11-16T19:21:18.565Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/ce/f1e3e9d959db134cedf06825fae8d5b294bd368aacdd0831a3975b7c4d55/markdownify-1.2.2-py3-none-any.whl", hash = "sha256:3f02d3cc52714084d6e589f70397b6fc9f2f3a8531481bf35e8cc39f975e186a", size = 15724, upload-time = "2025-11-16T19:21:17.622Z" }, +] + +[[package]] +name = "mcp" +version = "1.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "httpx" }, + { name = "httpx-sse" }, + { name = "jsonschema" }, + { name = "pydantic" }, + { name = "pydantic-settings" }, + { name = "pyjwt", extra = ["crypto"] }, + { name = "python-multipart" }, + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "sse-starlette" }, + { name = "starlette" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, + { name = "uvicorn", marker = "sys_platform != 'emscripten'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/6d/62e76bbb8144d6ed86e202b5edd8a4cb631e7c8130f3f4893c3f90262b10/mcp-1.26.0.tar.gz", hash = "sha256:db6e2ef491eecc1a0d93711a76f28dec2e05999f93afd48795da1c1137142c66", size = 608005, upload-time = "2026-01-24T19:40:32.468Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fd/d9/eaa1f80170d2b7c5ba23f3b59f766f3a0bb41155fbc32a69adfa1adaaef9/mcp-1.26.0-py3-none-any.whl", hash = "sha256:904a21c33c25aa98ddbeb47273033c435e595bbacfdb177f4bd87f6dceebe1ca", size = 233615, upload-time = "2026-01-24T19:40:30.652Z" }, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729, upload-time = "2022-08-14T12:40:10.846Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979, upload-time = "2022-08-14T12:40:09.779Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "multidict" +version = "6.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/22/929c141d6c0dba87d3e1d38fbdf1ba8baba86b7776469f2bc2d3227a1e67/multidict-6.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2b41f5fed0ed563624f1c17630cb9941cf2309d4df00e494b551b5f3e3d67a23", size = 76174, upload-time = "2026-01-26T02:44:18.509Z" }, + { url = 
"https://files.pythonhosted.org/packages/c7/75/bc704ae15fee974f8fccd871305e254754167dce5f9e42d88a2def741a1d/multidict-6.7.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:84e61e3af5463c19b67ced91f6c634effb89ef8bfc5ca0267f954451ed4bb6a2", size = 45116, upload-time = "2026-01-26T02:44:19.745Z" }, + { url = "https://files.pythonhosted.org/packages/79/76/55cd7186f498ed080a18440c9013011eb548f77ae1b297206d030eb1180a/multidict-6.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:935434b9853c7c112eee7ac891bc4cb86455aa631269ae35442cb316790c1445", size = 43524, upload-time = "2026-01-26T02:44:21.571Z" }, + { url = "https://files.pythonhosted.org/packages/e9/3c/414842ef8d5a1628d68edee29ba0e5bcf235dbfb3ccd3ea303a7fe8c72ff/multidict-6.7.1-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:432feb25a1cb67fe82a9680b4d65fb542e4635cb3166cd9c01560651ad60f177", size = 249368, upload-time = "2026-01-26T02:44:22.803Z" }, + { url = "https://files.pythonhosted.org/packages/f6/32/befed7f74c458b4a525e60519fe8d87eef72bb1e99924fa2b0f9d97a221e/multidict-6.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e82d14e3c948952a1a85503817e038cba5905a3352de76b9a465075d072fba23", size = 256952, upload-time = "2026-01-26T02:44:24.306Z" }, + { url = "https://files.pythonhosted.org/packages/03/d6/c878a44ba877f366630c860fdf74bfb203c33778f12b6ac274936853c451/multidict-6.7.1-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:4cfb48c6ea66c83bcaaf7e4dfa7ec1b6bbcf751b7db85a328902796dfde4c060", size = 240317, upload-time = "2026-01-26T02:44:25.772Z" }, + { url = "https://files.pythonhosted.org/packages/68/49/57421b4d7ad2e9e60e25922b08ceb37e077b90444bde6ead629095327a6f/multidict-6.7.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d540e51b7e8e170174555edecddbd5538105443754539193e3e1061864d444d", size = 267132, 
upload-time = "2026-01-26T02:44:27.648Z" }, + { url = "https://files.pythonhosted.org/packages/b7/fe/ec0edd52ddbcea2a2e89e174f0206444a61440b40f39704e64dc807a70bd/multidict-6.7.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:273d23f4b40f3dce4d6c8a821c741a86dec62cded82e1175ba3d99be128147ed", size = 268140, upload-time = "2026-01-26T02:44:29.588Z" }, + { url = "https://files.pythonhosted.org/packages/b0/73/6e1b01cbeb458807aa0831742232dbdd1fa92bfa33f52a3f176b4ff3dc11/multidict-6.7.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d624335fd4fa1c08a53f8b4be7676ebde19cd092b3895c421045ca87895b429", size = 254277, upload-time = "2026-01-26T02:44:30.902Z" }, + { url = "https://files.pythonhosted.org/packages/6a/b2/5fb8c124d7561a4974c342bc8c778b471ebbeb3cc17df696f034a7e9afe7/multidict-6.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:12fad252f8b267cc75b66e8fc51b3079604e8d43a75428ffe193cd9e2195dfd6", size = 252291, upload-time = "2026-01-26T02:44:32.31Z" }, + { url = "https://files.pythonhosted.org/packages/5a/96/51d4e4e06bcce92577fcd488e22600bd38e4fd59c20cb49434d054903bd2/multidict-6.7.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:03ede2a6ffbe8ef936b92cb4529f27f42be7f56afcdab5ab739cd5f27fb1cbf9", size = 250156, upload-time = "2026-01-26T02:44:33.734Z" }, + { url = "https://files.pythonhosted.org/packages/db/6b/420e173eec5fba721a50e2a9f89eda89d9c98fded1124f8d5c675f7a0c0f/multidict-6.7.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:90efbcf47dbe33dcf643a1e400d67d59abeac5db07dc3f27d6bdeae497a2198c", size = 249742, upload-time = "2026-01-26T02:44:35.222Z" }, + { url = "https://files.pythonhosted.org/packages/44/a3/ec5b5bd98f306bc2aa297b8c6f11a46714a56b1e6ef5ebda50a4f5d7c5fb/multidict-6.7.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c4b9bfc148f5a91be9244d6264c53035c8a0dcd2f51f1c3c6e30e30ebaa1c84", size = 262221, upload-time = 
"2026-01-26T02:44:36.604Z" }, + { url = "https://files.pythonhosted.org/packages/cd/f7/e8c0d0da0cd1e28d10e624604e1a36bcc3353aaebdfdc3a43c72bc683a12/multidict-6.7.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:401c5a650f3add2472d1d288c26deebc540f99e2fb83e9525007a74cd2116f1d", size = 258664, upload-time = "2026-01-26T02:44:38.008Z" }, + { url = "https://files.pythonhosted.org/packages/52/da/151a44e8016dd33feed44f730bd856a66257c1ee7aed4f44b649fb7edeb3/multidict-6.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97891f3b1b3ffbded884e2916cacf3c6fc87b66bb0dde46f7357404750559f33", size = 249490, upload-time = "2026-01-26T02:44:39.386Z" }, + { url = "https://files.pythonhosted.org/packages/87/af/a3b86bf9630b732897f6fc3f4c4714b90aa4361983ccbdcd6c0339b21b0c/multidict-6.7.1-cp313-cp313-win32.whl", hash = "sha256:e1c5988359516095535c4301af38d8a8838534158f649c05dd1050222321bcb3", size = 41695, upload-time = "2026-01-26T02:44:41.318Z" }, + { url = "https://files.pythonhosted.org/packages/b2/35/e994121b0e90e46134673422dd564623f93304614f5d11886b1b3e06f503/multidict-6.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:960c83bf01a95b12b08fd54324a4eb1d5b52c88932b5cba5d6e712bb3ed12eb5", size = 45884, upload-time = "2026-01-26T02:44:42.488Z" }, + { url = "https://files.pythonhosted.org/packages/ca/61/42d3e5dbf661242a69c97ea363f2d7b46c567da8eadef8890022be6e2ab0/multidict-6.7.1-cp313-cp313-win_arm64.whl", hash = "sha256:563fe25c678aaba333d5399408f5ec3c383ca5b663e7f774dd179a520b8144df", size = 43122, upload-time = "2026-01-26T02:44:43.664Z" }, + { url = "https://files.pythonhosted.org/packages/6d/b3/e6b21c6c4f314bb956016b0b3ef2162590a529b84cb831c257519e7fde44/multidict-6.7.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:c76c4bec1538375dad9d452d246ca5368ad6e1c9039dadcf007ae59c70619ea1", size = 83175, upload-time = "2026-01-26T02:44:44.894Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/76/23ecd2abfe0957b234f6c960f4ade497f55f2c16aeb684d4ecdbf1c95791/multidict-6.7.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:57b46b24b5d5ebcc978da4ec23a819a9402b4228b8a90d9c656422b4bdd8a963", size = 48460, upload-time = "2026-01-26T02:44:46.106Z" }, + { url = "https://files.pythonhosted.org/packages/c4/57/a0ed92b23f3a042c36bc4227b72b97eca803f5f1801c1ab77c8a212d455e/multidict-6.7.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e954b24433c768ce78ab7929e84ccf3422e46deb45a4dc9f93438f8217fa2d34", size = 46930, upload-time = "2026-01-26T02:44:47.278Z" }, + { url = "https://files.pythonhosted.org/packages/b5/66/02ec7ace29162e447f6382c495dc95826bf931d3818799bbef11e8f7df1a/multidict-6.7.1-cp313-cp313t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3bd231490fa7217cc832528e1cd8752a96f0125ddd2b5749390f7c3ec8721b65", size = 242582, upload-time = "2026-01-26T02:44:48.604Z" }, + { url = "https://files.pythonhosted.org/packages/58/18/64f5a795e7677670e872673aca234162514696274597b3708b2c0d276cce/multidict-6.7.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:253282d70d67885a15c8a7716f3a73edf2d635793ceda8173b9ecc21f2fb8292", size = 250031, upload-time = "2026-01-26T02:44:50.544Z" }, + { url = "https://files.pythonhosted.org/packages/c8/ed/e192291dbbe51a8290c5686f482084d31bcd9d09af24f63358c3d42fd284/multidict-6.7.1-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:0b4c48648d7649c9335cf1927a8b87fa692de3dcb15faa676c6a6f1f1aabda43", size = 228596, upload-time = "2026-01-26T02:44:51.951Z" }, + { url = "https://files.pythonhosted.org/packages/1e/7e/3562a15a60cf747397e7f2180b0a11dc0c38d9175a650e75fa1b4d325e15/multidict-6.7.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:98bc624954ec4d2c7cb074b8eefc2b5d0ce7d482e410df446414355d158fe4ca", size = 
257492, upload-time = "2026-01-26T02:44:53.902Z" }, + { url = "https://files.pythonhosted.org/packages/24/02/7d0f9eae92b5249bb50ac1595b295f10e263dd0078ebb55115c31e0eaccd/multidict-6.7.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1b99af4d9eec0b49927b4402bcbb58dea89d3e0db8806a4086117019939ad3dd", size = 255899, upload-time = "2026-01-26T02:44:55.316Z" }, + { url = "https://files.pythonhosted.org/packages/00/e3/9b60ed9e23e64c73a5cde95269ef1330678e9c6e34dd4eb6b431b85b5a10/multidict-6.7.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6aac4f16b472d5b7dc6f66a0d49dd57b0e0902090be16594dc9ebfd3d17c47e7", size = 247970, upload-time = "2026-01-26T02:44:56.783Z" }, + { url = "https://files.pythonhosted.org/packages/3e/06/538e58a63ed5cfb0bd4517e346b91da32fde409d839720f664e9a4ae4f9d/multidict-6.7.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:21f830fe223215dffd51f538e78c172ed7c7f60c9b96a2bf05c4848ad49921c3", size = 245060, upload-time = "2026-01-26T02:44:58.195Z" }, + { url = "https://files.pythonhosted.org/packages/b2/2f/d743a3045a97c895d401e9bd29aaa09b94f5cbdf1bd561609e5a6c431c70/multidict-6.7.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f5dd81c45b05518b9aa4da4aa74e1c93d715efa234fd3e8a179df611cc85e5f4", size = 235888, upload-time = "2026-01-26T02:44:59.57Z" }, + { url = "https://files.pythonhosted.org/packages/38/83/5a325cac191ab28b63c52f14f1131f3b0a55ba3b9aa65a6d0bf2a9b921a0/multidict-6.7.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:eb304767bca2bb92fb9c5bd33cedc95baee5bb5f6c88e63706533a1c06ad08c8", size = 243554, upload-time = "2026-01-26T02:45:01.054Z" }, + { url = "https://files.pythonhosted.org/packages/20/1f/9d2327086bd15da2725ef6aae624208e2ef828ed99892b17f60c344e57ed/multidict-6.7.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:c9035dde0f916702850ef66460bc4239d89d08df4d02023a5926e7446724212c", size = 252341, upload-time = 
"2026-01-26T02:45:02.484Z" }, + { url = "https://files.pythonhosted.org/packages/e8/2c/2a1aa0280cf579d0f6eed8ee5211c4f1730bd7e06c636ba2ee6aafda302e/multidict-6.7.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:af959b9beeb66c822380f222f0e0a1889331597e81f1ded7f374f3ecb0fd6c52", size = 246391, upload-time = "2026-01-26T02:45:03.862Z" }, + { url = "https://files.pythonhosted.org/packages/e5/03/7ca022ffc36c5a3f6e03b179a5ceb829be9da5783e6fe395f347c0794680/multidict-6.7.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:41f2952231456154ee479651491e94118229844dd7226541788be783be2b5108", size = 243422, upload-time = "2026-01-26T02:45:05.296Z" }, + { url = "https://files.pythonhosted.org/packages/dc/1d/b31650eab6c5778aceed46ba735bd97f7c7d2f54b319fa916c0f96e7805b/multidict-6.7.1-cp313-cp313t-win32.whl", hash = "sha256:df9f19c28adcb40b6aae30bbaa1478c389efd50c28d541d76760199fc1037c32", size = 47770, upload-time = "2026-01-26T02:45:06.754Z" }, + { url = "https://files.pythonhosted.org/packages/ac/5b/2d2d1d522e51285bd61b1e20df8f47ae1a9d80839db0b24ea783b3832832/multidict-6.7.1-cp313-cp313t-win_amd64.whl", hash = "sha256:d54ecf9f301853f2c5e802da559604b3e95bb7a3b01a9c295c6ee591b9882de8", size = 53109, upload-time = "2026-01-26T02:45:08.044Z" }, + { url = "https://files.pythonhosted.org/packages/3d/a3/cc409ba012c83ca024a308516703cf339bdc4b696195644a7215a5164a24/multidict-6.7.1-cp313-cp313t-win_arm64.whl", hash = "sha256:5a37ca18e360377cfda1d62f5f382ff41f2b8c4ccb329ed974cc2e1643440118", size = 45573, upload-time = "2026-01-26T02:45:09.349Z" }, + { url = "https://files.pythonhosted.org/packages/91/cc/db74228a8be41884a567e88a62fd589a913708fcf180d029898c17a9a371/multidict-6.7.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:8f333ec9c5eb1b7105e3b84b53141e66ca05a19a605368c55450b6ba208cb9ee", size = 75190, upload-time = "2026-01-26T02:45:10.651Z" }, + { url = 
"https://files.pythonhosted.org/packages/d5/22/492f2246bb5b534abd44804292e81eeaf835388901f0c574bac4eeec73c5/multidict-6.7.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:a407f13c188f804c759fc6a9f88286a565c242a76b27626594c133b82883b5c2", size = 44486, upload-time = "2026-01-26T02:45:11.938Z" }, + { url = "https://files.pythonhosted.org/packages/f1/4f/733c48f270565d78b4544f2baddc2fb2a245e5a8640254b12c36ac7ac68e/multidict-6.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0e161ddf326db5577c3a4cc2d8648f81456e8a20d40415541587a71620d7a7d1", size = 43219, upload-time = "2026-01-26T02:45:14.346Z" }, + { url = "https://files.pythonhosted.org/packages/24/bb/2c0c2287963f4259c85e8bcbba9182ced8d7fca65c780c38e99e61629d11/multidict-6.7.1-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:1e3a8bb24342a8201d178c3b4984c26ba81a577c80d4d525727427460a50c22d", size = 245132, upload-time = "2026-01-26T02:45:15.712Z" }, + { url = "https://files.pythonhosted.org/packages/a7/f9/44d4b3064c65079d2467888794dea218d1601898ac50222ab8a9a8094460/multidict-6.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97231140a50f5d447d3164f994b86a0bed7cd016e2682f8650d6a9158e14fd31", size = 252420, upload-time = "2026-01-26T02:45:17.293Z" }, + { url = "https://files.pythonhosted.org/packages/8b/13/78f7275e73fa17b24c9a51b0bd9d73ba64bb32d0ed51b02a746eb876abe7/multidict-6.7.1-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:6b10359683bd8806a200fd2909e7c8ca3a7b24ec1d8132e483d58e791d881048", size = 233510, upload-time = "2026-01-26T02:45:19.356Z" }, + { url = "https://files.pythonhosted.org/packages/4b/25/8167187f62ae3cbd52da7893f58cb036b47ea3fb67138787c76800158982/multidict-6.7.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:283ddac99f7ac25a4acadbf004cb5ae34480bbeb063520f70ce397b281859362", size = 264094, 
upload-time = "2026-01-26T02:45:20.834Z" }, + { url = "https://files.pythonhosted.org/packages/a1/e7/69a3a83b7b030cf283fb06ce074a05a02322359783424d7edf0f15fe5022/multidict-6.7.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:538cec1e18c067d0e6103aa9a74f9e832904c957adc260e61cd9d8cf0c3b3d37", size = 260786, upload-time = "2026-01-26T02:45:22.818Z" }, + { url = "https://files.pythonhosted.org/packages/fe/3b/8ec5074bcfc450fe84273713b4b0a0dd47c0249358f5d82eb8104ffe2520/multidict-6.7.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7eee46ccb30ff48a1e35bb818cc90846c6be2b68240e42a78599166722cea709", size = 248483, upload-time = "2026-01-26T02:45:24.368Z" }, + { url = "https://files.pythonhosted.org/packages/48/5a/d5a99e3acbca0e29c5d9cba8f92ceb15dce78bab963b308ae692981e3a5d/multidict-6.7.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fa263a02f4f2dd2d11a7b1bb4362aa7cb1049f84a9235d31adf63f30143469a0", size = 248403, upload-time = "2026-01-26T02:45:25.982Z" }, + { url = "https://files.pythonhosted.org/packages/35/48/e58cd31f6c7d5102f2a4bf89f96b9cf7e00b6c6f3d04ecc44417c00a5a3c/multidict-6.7.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:2e1425e2f99ec5bd36c15a01b690a1a2456209c5deed58f95469ffb46039ccbb", size = 240315, upload-time = "2026-01-26T02:45:27.487Z" }, + { url = "https://files.pythonhosted.org/packages/94/33/1cd210229559cb90b6786c30676bb0c58249ff42f942765f88793b41fdce/multidict-6.7.1-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:497394b3239fc6f0e13a78a3e1b61296e72bf1c5f94b4c4eb80b265c37a131cd", size = 245528, upload-time = "2026-01-26T02:45:28.991Z" }, + { url = "https://files.pythonhosted.org/packages/64/f2/6e1107d226278c876c783056b7db43d800bb64c6131cec9c8dfb6903698e/multidict-6.7.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:233b398c29d3f1b9676b4b6f75c518a06fcb2ea0b925119fb2c1bc35c05e1601", size = 258784, upload-time = 
"2026-01-26T02:45:30.503Z" }, + { url = "https://files.pythonhosted.org/packages/4d/c1/11f664f14d525e4a1b5327a82d4de61a1db604ab34c6603bb3c2cc63ad34/multidict-6.7.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:93b1818e4a6e0930454f0f2af7dfce69307ca03cdcfb3739bf4d91241967b6c1", size = 251980, upload-time = "2026-01-26T02:45:32.603Z" }, + { url = "https://files.pythonhosted.org/packages/e1/9f/75a9ac888121d0c5bbd4ecf4eead45668b1766f6baabfb3b7f66a410e231/multidict-6.7.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:f33dc2a3abe9249ea5d8360f969ec7f4142e7ac45ee7014d8f8d5acddf178b7b", size = 243602, upload-time = "2026-01-26T02:45:34.043Z" }, + { url = "https://files.pythonhosted.org/packages/9a/e7/50bf7b004cc8525d80dbbbedfdc7aed3e4c323810890be4413e589074032/multidict-6.7.1-cp314-cp314-win32.whl", hash = "sha256:3ab8b9d8b75aef9df299595d5388b14530839f6422333357af1339443cff777d", size = 40930, upload-time = "2026-01-26T02:45:36.278Z" }, + { url = "https://files.pythonhosted.org/packages/e0/bf/52f25716bbe93745595800f36fb17b73711f14da59ed0bb2eba141bc9f0f/multidict-6.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:5e01429a929600e7dab7b166062d9bb54a5eed752384c7384c968c2afab8f50f", size = 45074, upload-time = "2026-01-26T02:45:37.546Z" }, + { url = "https://files.pythonhosted.org/packages/97/ab/22803b03285fa3a525f48217963da3a65ae40f6a1b6f6cf2768879e208f9/multidict-6.7.1-cp314-cp314-win_arm64.whl", hash = "sha256:4885cb0e817aef5d00a2e8451d4665c1808378dc27c2705f1bf4ef8505c0d2e5", size = 42471, upload-time = "2026-01-26T02:45:38.889Z" }, + { url = "https://files.pythonhosted.org/packages/e0/6d/f9293baa6146ba9507e360ea0292b6422b016907c393e2f63fc40ab7b7b5/multidict-6.7.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:0458c978acd8e6ea53c81eefaddbbee9c6c5e591f41b3f5e8e194780fe026581", size = 82401, upload-time = "2026-01-26T02:45:40.254Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/68/53b5494738d83558d87c3c71a486504d8373421c3e0dbb6d0db48ad42ee0/multidict-6.7.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:c0abd12629b0af3cf590982c0b413b1e7395cd4ec026f30986818ab95bfaa94a", size = 48143, upload-time = "2026-01-26T02:45:41.635Z" }, + { url = "https://files.pythonhosted.org/packages/37/e8/5284c53310dcdc99ce5d66563f6e5773531a9b9fe9ec7a615e9bc306b05f/multidict-6.7.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:14525a5f61d7d0c94b368a42cff4c9a4e7ba2d52e2672a7b23d84dc86fb02b0c", size = 46507, upload-time = "2026-01-26T02:45:42.99Z" }, + { url = "https://files.pythonhosted.org/packages/e4/fc/6800d0e5b3875568b4083ecf5f310dcf91d86d52573160834fb4bfcf5e4f/multidict-6.7.1-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:17307b22c217b4cf05033dabefe68255a534d637c6c9b0cc8382718f87be4262", size = 239358, upload-time = "2026-01-26T02:45:44.376Z" }, + { url = "https://files.pythonhosted.org/packages/41/75/4ad0973179361cdf3a113905e6e088173198349131be2b390f9fa4da5fc6/multidict-6.7.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7a7e590ff876a3eaf1c02a4dfe0724b6e69a9e9de6d8f556816f29c496046e59", size = 246884, upload-time = "2026-01-26T02:45:47.167Z" }, + { url = "https://files.pythonhosted.org/packages/c3/9c/095bb28b5da139bd41fb9a5d5caff412584f377914bd8787c2aa98717130/multidict-6.7.1-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:5fa6a95dfee63893d80a34758cd0e0c118a30b8dcb46372bf75106c591b77889", size = 225878, upload-time = "2026-01-26T02:45:48.698Z" }, + { url = "https://files.pythonhosted.org/packages/07/d0/c0a72000243756e8f5a277b6b514fa005f2c73d481b7d9e47cd4568aa2e4/multidict-6.7.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a0543217a6a017692aa6ae5cc39adb75e587af0f3a82288b1492eb73dd6cc2a4", size = 
253542, upload-time = "2026-01-26T02:45:50.164Z" }, + { url = "https://files.pythonhosted.org/packages/c0/6b/f69da15289e384ecf2a68837ec8b5ad8c33e973aa18b266f50fe55f24b8c/multidict-6.7.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f99fe611c312b3c1c0ace793f92464d8cd263cc3b26b5721950d977b006b6c4d", size = 252403, upload-time = "2026-01-26T02:45:51.779Z" }, + { url = "https://files.pythonhosted.org/packages/a2/76/b9669547afa5a1a25cd93eaca91c0da1c095b06b6d2d8ec25b713588d3a1/multidict-6.7.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9004d8386d133b7e6135679424c91b0b854d2d164af6ea3f289f8f2761064609", size = 244889, upload-time = "2026-01-26T02:45:53.27Z" }, + { url = "https://files.pythonhosted.org/packages/7e/a9/a50d2669e506dad33cfc45b5d574a205587b7b8a5f426f2fbb2e90882588/multidict-6.7.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e628ef0e6859ffd8273c69412a2465c4be4a9517d07261b33334b5ec6f3c7489", size = 241982, upload-time = "2026-01-26T02:45:54.919Z" }, + { url = "https://files.pythonhosted.org/packages/c5/bb/1609558ad8b456b4827d3c5a5b775c93b87878fd3117ed3db3423dfbce1b/multidict-6.7.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:841189848ba629c3552035a6a7f5bf3b02eb304e9fea7492ca220a8eda6b0e5c", size = 232415, upload-time = "2026-01-26T02:45:56.981Z" }, + { url = "https://files.pythonhosted.org/packages/d8/59/6f61039d2aa9261871e03ab9dc058a550d240f25859b05b67fd70f80d4b3/multidict-6.7.1-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:ce1bbd7d780bb5a0da032e095c951f7014d6b0a205f8318308140f1a6aba159e", size = 240337, upload-time = "2026-01-26T02:45:58.698Z" }, + { url = "https://files.pythonhosted.org/packages/a1/29/fdc6a43c203890dc2ae9249971ecd0c41deaedfe00d25cb6564b2edd99eb/multidict-6.7.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:b26684587228afed0d50cf804cc71062cc9c1cdf55051c4c6345d372947b268c", size = 248788, upload-time = 
"2026-01-26T02:46:00.862Z" }, + { url = "https://files.pythonhosted.org/packages/a9/14/a153a06101323e4cf086ecee3faadba52ff71633d471f9685c42e3736163/multidict-6.7.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:9f9af11306994335398293f9958071019e3ab95e9a707dc1383a35613f6abcb9", size = 242842, upload-time = "2026-01-26T02:46:02.824Z" }, + { url = "https://files.pythonhosted.org/packages/41/5f/604ae839e64a4a6efc80db94465348d3b328ee955e37acb24badbcd24d83/multidict-6.7.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b4938326284c4f1224178a560987b6cf8b4d38458b113d9b8c1db1a836e640a2", size = 240237, upload-time = "2026-01-26T02:46:05.898Z" }, + { url = "https://files.pythonhosted.org/packages/5f/60/c3a5187bf66f6fb546ff4ab8fb5a077cbdd832d7b1908d4365c7f74a1917/multidict-6.7.1-cp314-cp314t-win32.whl", hash = "sha256:98655c737850c064a65e006a3df7c997cd3b220be4ec8fe26215760b9697d4d7", size = 48008, upload-time = "2026-01-26T02:46:07.468Z" }, + { url = "https://files.pythonhosted.org/packages/0c/f7/addf1087b860ac60e6f382240f64fb99f8bfb532bb06f7c542b83c29ca61/multidict-6.7.1-cp314-cp314t-win_amd64.whl", hash = "sha256:497bde6223c212ba11d462853cfa4f0ae6ef97465033e7dc9940cdb3ab5b48e5", size = 53542, upload-time = "2026-01-26T02:46:08.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/81/4629d0aa32302ef7b2ec65c75a728cc5ff4fa410c50096174c1632e70b3e/multidict-6.7.1-cp314-cp314t-win_arm64.whl", hash = "sha256:2bbd113e0d4af5db41d5ebfe9ccaff89de2120578164f86a5d17d5a576d1e5b2", size = 44719, upload-time = "2026-01-26T02:46:11.146Z" }, + { url = "https://files.pythonhosted.org/packages/81/08/7036c080d7117f28a4af526d794aab6a84463126db031b007717c1a6676e/multidict-6.7.1-py3-none-any.whl", hash = "sha256:55d97cc6dae627efa6a6e548885712d4864b81110ac76fa4e534c03819fa4a56", size = 12319, upload-time = "2026-01-26T02:46:44.004Z" }, +] + +[[package]] +name = "opentelemetry-api" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { 
name = "importlib-metadata" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/97/b9/3161be15bb8e3ad01be8be5a968a9237c3027c5be504362ff800fca3e442/opentelemetry_api-1.39.1.tar.gz", hash = "sha256:fbde8c80e1b937a2c61f20347e91c0c18a1940cecf012d62e65a7caf08967c9c", size = 65767, upload-time = "2025-12-11T13:32:39.182Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/df/d3f1ddf4bb4cb50ed9b1139cc7b1c54c34a1e7ce8fd1b9a37c0d1551a6bd/opentelemetry_api-1.39.1-py3-none-any.whl", hash = "sha256:2edd8463432a7f8443edce90972169b195e7d6a05500cd29e6d13898187c9950", size = 66356, upload-time = "2025-12-11T13:32:17.304Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation" +version = "0.60b1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/41/0f/7e6b713ac117c1f5e4e3300748af699b9902a2e5e34c9cf443dde25a01fa/opentelemetry_instrumentation-0.60b1.tar.gz", hash = "sha256:57ddc7974c6eb35865af0426d1a17132b88b2ed8586897fee187fd5b8944bd6a", size = 31706, upload-time = "2025-12-11T13:36:42.515Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/77/d2/6788e83c5c86a2690101681aeef27eeb2a6bf22df52d3f263a22cee20915/opentelemetry_instrumentation-0.60b1-py3-none-any.whl", hash = "sha256:04480db952b48fb1ed0073f822f0ee26012b7be7c3eac1a3793122737c78632d", size = 33096, upload-time = "2025-12-11T13:35:33.067Z" }, +] + +[[package]] +name = "opentelemetry-instrumentation-threading" +version = "0.60b1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "wrapt" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/9b/0a/e36123ec4c0910a3936b92982545a53e9bca5b26a28df06883751a783f84/opentelemetry_instrumentation_threading-0.60b1.tar.gz", hash = "sha256:20b18a68abe5801fa9474336b7c27487d4af3e00b66f6a8734e4fdd75c8b0b43", size = 8768, upload-time = "2025-12-11T13:37:16.29Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/a3/448738b927bcc1843ace7d4ed55dd54441a71363075eeeee89c5944dd740/opentelemetry_instrumentation_threading-0.60b1-py3-none-any.whl", hash = "sha256:92a52a60fee5e32bc6aa8f5acd749b15691ad0bc4457a310f5736b76a6d9d1de", size = 9312, upload-time = "2025-12-11T13:36:28.434Z" }, +] + +[[package]] +name = "opentelemetry-sdk" +version = "1.39.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/fb/c76080c9ba07e1e8235d24cdcc4d125ef7aa3edf23eb4e497c2e50889adc/opentelemetry_sdk-1.39.1.tar.gz", hash = "sha256:cf4d4563caf7bff906c9f7967e2be22d0d6b349b908be0d90fb21c8e9c995cc6", size = 171460, upload-time = "2025-12-11T13:32:49.369Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/98/e91cf858f203d86f4eccdf763dcf01cf03f1dae80c3750f7e635bfa206b6/opentelemetry_sdk-1.39.1-py3-none-any.whl", hash = "sha256:4d5482c478513ecb0a5d938dcc61394e647066e0cc2676bee9f3af3f3f45f01c", size = 132565, upload-time = "2025-12-11T13:32:35.069Z" }, +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.60b1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/df/553f93ed38bf22f4b999d9be9c185adb558982214f33eae539d3b5cd0858/opentelemetry_semantic_conventions-0.60b1.tar.gz", hash = "sha256:87c228b5a0669b748c76d76df6c364c369c28f1c465e50f661e39737e84bc953", size = 
137935, upload-time = "2025-12-11T13:32:50.487Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7a/5e/5958555e09635d09b75de3c4f8b9cae7335ca545d77392ffe7331534c402/opentelemetry_semantic_conventions-0.60b1-py3-none-any.whl", hash = "sha256:9fa8c8b0c110da289809292b0591220d3a7b53c1526a23021e977d68597893fb", size = 219982, upload-time = "2025-12-11T13:32:36.955Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "peewee" +version = "3.19.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/88/b0/79462b42e89764998756e0557f2b58a15610a5b4512fbbcccae58fba7237/peewee-3.19.0.tar.gz", hash = "sha256:f88292a6f0d7b906cb26bca9c8599b8f4d8920ebd36124400d0cbaaaf915511f", size = 974035, upload-time = "2026-01-07T17:24:59.597Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1a/41/19c65578ef9a54b3083253c68a607f099642747168fe00f3a2bceb7c3a34/peewee-3.19.0-py3-none-any.whl", hash = "sha256:de220b94766e6008c466e00ce4ba5299b9a832117d9eb36d45d0062f3cfd7417", size = 411885, upload-time = "2026-01-07T17:24:58.33Z" }, +] + +[[package]] +name = "pillow" +version = "12.1.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/1f/42/5c74462b4fd957fcd7b13b04fb3205ff8349236ea74c7c375766d6c82288/pillow-12.1.1.tar.gz", hash = "sha256:9ad8fa5937ab05218e2b6a4cff30295ad35afd2f83ac592e68c0d871bb0fdbc4", size = 46980264, upload-time = "2026-02-11T04:23:07.146Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/11/6db24d4bd7685583caeae54b7009584e38da3c3d4488ed4cd25b439de486/pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:d242e8ac078781f1de88bf823d70c1a9b3c7950a44cdf4b7c012e22ccbcd8e4e", size = 4062689, upload-time = "2026-02-11T04:21:06.804Z" }, + { url = "https://files.pythonhosted.org/packages/33/c0/ce6d3b1fe190f0021203e0d9b5b99e57843e345f15f9ef22fcd43842fd21/pillow-12.1.1-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:02f84dfad02693676692746df05b89cf25597560db2857363a208e393429f5e9", size = 4138535, upload-time = "2026-02-11T04:21:08.452Z" }, + { url = "https://files.pythonhosted.org/packages/a0/c6/d5eb6a4fb32a3f9c21a8c7613ec706534ea1cf9f4b3663e99f0d83f6fca8/pillow-12.1.1-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:e65498daf4b583091ccbb2556c7000abf0f3349fcd57ef7adc9a84a394ed29f6", size = 3601364, upload-time = "2026-02-11T04:21:10.194Z" }, + { url = "https://files.pythonhosted.org/packages/14/a1/16c4b823838ba4c9c52c0e6bbda903a3fe5a1bdbf1b8eb4fff7156f3e318/pillow-12.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:6c6db3b84c87d48d0088943bf33440e0c42370b99b1c2a7989216f7b42eede60", size = 5262561, upload-time = "2026-02-11T04:21:11.742Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ad/ad9dc98ff24f485008aa5cdedaf1a219876f6f6c42a4626c08bc4e80b120/pillow-12.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:8b7e5304e34942bf62e15184219a7b5ad4ff7f3bb5cca4d984f37df1a0e1aee2", size = 4657460, upload-time = "2026-02-11T04:21:13.786Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/1b/f1a4ea9a895b5732152789326202a82464d5254759fbacae4deea3069334/pillow-12.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:18e5bddd742a44b7e6b1e773ab5db102bd7a94c32555ba656e76d319d19c3850", size = 6232698, upload-time = "2026-02-11T04:21:15.949Z" }, + { url = "https://files.pythonhosted.org/packages/95/f4/86f51b8745070daf21fd2e5b1fe0eb35d4db9ca26e6d58366562fb56a743/pillow-12.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc44ef1f3de4f45b50ccf9136999d71abb99dca7706bc75d222ed350b9fd2289", size = 8041706, upload-time = "2026-02-11T04:21:17.723Z" }, + { url = "https://files.pythonhosted.org/packages/29/9b/d6ecd956bb1266dd1045e995cce9b8d77759e740953a1c9aad9502a0461e/pillow-12.1.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a8eb7ed8d4198bccbd07058416eeec51686b498e784eda166395a23eb99138e", size = 6346621, upload-time = "2026-02-11T04:21:19.547Z" }, + { url = "https://files.pythonhosted.org/packages/71/24/538bff45bde96535d7d998c6fed1a751c75ac7c53c37c90dc2601b243893/pillow-12.1.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47b94983da0c642de92ced1702c5b6c292a84bd3a8e1d1702ff923f183594717", size = 7038069, upload-time = "2026-02-11T04:21:21.378Z" }, + { url = "https://files.pythonhosted.org/packages/94/0e/58cb1a6bc48f746bc4cb3adb8cabff73e2742c92b3bf7a220b7cf69b9177/pillow-12.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:518a48c2aab7ce596d3bf79d0e275661b846e86e4d0e7dec34712c30fe07f02a", size = 6460040, upload-time = "2026-02-11T04:21:23.148Z" }, + { url = "https://files.pythonhosted.org/packages/6c/57/9045cb3ff11eeb6c1adce3b2d60d7d299d7b273a2e6c8381a524abfdc474/pillow-12.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a550ae29b95c6dc13cf69e2c9dc5747f814c54eeb2e32d683e5e93af56caa029", size = 7164523, upload-time = "2026-02-11T04:21:25.01Z" }, + { url = 
"https://files.pythonhosted.org/packages/73/f2/9be9cb99f2175f0d4dbadd6616ce1bf068ee54a28277ea1bf1fbf729c250/pillow-12.1.1-cp313-cp313-win32.whl", hash = "sha256:a003d7422449f6d1e3a34e3dd4110c22148336918ddbfc6a32581cd54b2e0b2b", size = 6332552, upload-time = "2026-02-11T04:21:27.238Z" }, + { url = "https://files.pythonhosted.org/packages/3f/eb/b0834ad8b583d7d9d42b80becff092082a1c3c156bb582590fcc973f1c7c/pillow-12.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:344cf1e3dab3be4b1fa08e449323d98a2a3f819ad20f4b22e77a0ede31f0faa1", size = 7040108, upload-time = "2026-02-11T04:21:29.462Z" }, + { url = "https://files.pythonhosted.org/packages/d5/7d/fc09634e2aabdd0feabaff4a32f4a7d97789223e7c2042fd805ea4b4d2c2/pillow-12.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:5c0dd1636633e7e6a0afe7bf6a51a14992b7f8e60de5789018ebbdfae55b040a", size = 2453712, upload-time = "2026-02-11T04:21:31.072Z" }, + { url = "https://files.pythonhosted.org/packages/19/2a/b9d62794fc8a0dd14c1943df68347badbd5511103e0d04c035ffe5cf2255/pillow-12.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0330d233c1a0ead844fc097a7d16c0abff4c12e856c0b325f231820fee1f39da", size = 5264880, upload-time = "2026-02-11T04:21:32.865Z" }, + { url = "https://files.pythonhosted.org/packages/26/9d/e03d857d1347fa5ed9247e123fcd2a97b6220e15e9cb73ca0a8d91702c6e/pillow-12.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5dae5f21afb91322f2ff791895ddd8889e5e947ff59f71b46041c8ce6db790bc", size = 4660616, upload-time = "2026-02-11T04:21:34.97Z" }, + { url = "https://files.pythonhosted.org/packages/f7/ec/8a6d22afd02570d30954e043f09c32772bfe143ba9285e2fdb11284952cd/pillow-12.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2e0c664be47252947d870ac0d327fea7e63985a08794758aa8af5b6cb6ec0c9c", size = 6269008, upload-time = "2026-02-11T04:21:36.623Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/1d/6d875422c9f28a4a361f495a5f68d9de4a66941dc2c619103ca335fa6446/pillow-12.1.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:691ab2ac363b8217f7d31b3497108fb1f50faab2f75dfb03284ec2f217e87bf8", size = 8073226, upload-time = "2026-02-11T04:21:38.585Z" }, + { url = "https://files.pythonhosted.org/packages/a1/cd/134b0b6ee5eda6dc09e25e24b40fdafe11a520bc725c1d0bbaa5e00bf95b/pillow-12.1.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e9e8064fb1cc019296958595f6db671fba95209e3ceb0c4734c9baf97de04b20", size = 6380136, upload-time = "2026-02-11T04:21:40.562Z" }, + { url = "https://files.pythonhosted.org/packages/7a/a9/7628f013f18f001c1b98d8fffe3452f306a70dc6aba7d931019e0492f45e/pillow-12.1.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:472a8d7ded663e6162dafdf20015c486a7009483ca671cece7a9279b512fcb13", size = 7067129, upload-time = "2026-02-11T04:21:42.521Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f8/66ab30a2193b277785601e82ee2d49f68ea575d9637e5e234faaa98efa4c/pillow-12.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:89b54027a766529136a06cfebeecb3a04900397a3590fd252160b888479517bf", size = 6491807, upload-time = "2026-02-11T04:21:44.22Z" }, + { url = "https://files.pythonhosted.org/packages/da/0b/a877a6627dc8318fdb84e357c5e1a758c0941ab1ddffdafd231983788579/pillow-12.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:86172b0831b82ce4f7877f280055892b31179e1576aa00d0df3bb1bbf8c3e524", size = 7190954, upload-time = "2026-02-11T04:21:46.114Z" }, + { url = "https://files.pythonhosted.org/packages/83/43/6f732ff85743cf746b1361b91665d9f5155e1483817f693f8d57ea93147f/pillow-12.1.1-cp313-cp313t-win32.whl", hash = "sha256:44ce27545b6efcf0fdbdceb31c9a5bdea9333e664cda58a7e674bb74608b3986", size = 6336441, upload-time = "2026-02-11T04:21:48.22Z" }, + { url = 
"https://files.pythonhosted.org/packages/3b/44/e865ef3986611bb75bfabdf94a590016ea327833f434558801122979cd0e/pillow-12.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:a285e3eb7a5a45a2ff504e31f4a8d1b12ef62e84e5411c6804a42197c1cf586c", size = 7045383, upload-time = "2026-02-11T04:21:50.015Z" }, + { url = "https://files.pythonhosted.org/packages/a8/c6/f4fb24268d0c6908b9f04143697ea18b0379490cb74ba9e8d41b898bd005/pillow-12.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cc7d296b5ea4d29e6570dabeaed58d31c3fea35a633a69679fb03d7664f43fb3", size = 2456104, upload-time = "2026-02-11T04:21:51.633Z" }, + { url = "https://files.pythonhosted.org/packages/03/d0/bebb3ffbf31c5a8e97241476c4cf8b9828954693ce6744b4a2326af3e16b/pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:417423db963cb4be8bac3fc1204fe61610f6abeed1580a7a2cbb2fbda20f12af", size = 4062652, upload-time = "2026-02-11T04:21:53.19Z" }, + { url = "https://files.pythonhosted.org/packages/2d/c0/0e16fb0addda4851445c28f8350d8c512f09de27bbb0d6d0bbf8b6709605/pillow-12.1.1-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:b957b71c6b2387610f556a7eb0828afbe40b4a98036fc0d2acfa5a44a0c2036f", size = 4138823, upload-time = "2026-02-11T04:22:03.088Z" }, + { url = "https://files.pythonhosted.org/packages/6b/fb/6170ec655d6f6bb6630a013dd7cf7bc218423d7b5fa9071bf63dc32175ae/pillow-12.1.1-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:097690ba1f2efdeb165a20469d59d8bb03c55fb6621eb2041a060ae8ea3e9642", size = 3601143, upload-time = "2026-02-11T04:22:04.909Z" }, + { url = "https://files.pythonhosted.org/packages/59/04/dc5c3f297510ba9a6837cbb318b87dd2b8f73eb41a43cc63767f65cb599c/pillow-12.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:2815a87ab27848db0321fb78c7f0b2c8649dee134b7f2b80c6a45c6831d75ccd", size = 5266254, upload-time = "2026-02-11T04:22:07.656Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/30/5db1236b0d6313f03ebf97f5e17cda9ca060f524b2fcc875149a8360b21c/pillow-12.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f7ed2c6543bad5a7d5530eb9e78c53132f93dfa44a28492db88b41cdab885202", size = 4657499, upload-time = "2026-02-11T04:22:09.613Z" }, + { url = "https://files.pythonhosted.org/packages/6f/18/008d2ca0eb612e81968e8be0bbae5051efba24d52debf930126d7eaacbba/pillow-12.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:652a2c9ccfb556235b2b501a3a7cf3742148cd22e04b5625c5fe057ea3e3191f", size = 6232137, upload-time = "2026-02-11T04:22:11.434Z" }, + { url = "https://files.pythonhosted.org/packages/70/f1/f14d5b8eeb4b2cd62b9f9f847eb6605f103df89ef619ac68f92f748614ea/pillow-12.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d6e4571eedf43af33d0fc233a382a76e849badbccdf1ac438841308652a08e1f", size = 8042721, upload-time = "2026-02-11T04:22:13.321Z" }, + { url = "https://files.pythonhosted.org/packages/5a/d6/17824509146e4babbdabf04d8171491fa9d776f7061ff6e727522df9bd03/pillow-12.1.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b574c51cf7d5d62e9be37ba446224b59a2da26dc4c1bb2ecbe936a4fb1a7cb7f", size = 6347798, upload-time = "2026-02-11T04:22:15.449Z" }, + { url = "https://files.pythonhosted.org/packages/d1/ee/c85a38a9ab92037a75615aba572c85ea51e605265036e00c5b67dfafbfe2/pillow-12.1.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a37691702ed687799de29a518d63d4682d9016932db66d4e90c345831b02fb4e", size = 7039315, upload-time = "2026-02-11T04:22:17.24Z" }, + { url = "https://files.pythonhosted.org/packages/ec/f3/bc8ccc6e08a148290d7523bde4d9a0d6c981db34631390dc6e6ec34cacf6/pillow-12.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f95c00d5d6700b2b890479664a06e754974848afaae5e21beb4d83c106923fd0", size = 6462360, upload-time = "2026-02-11T04:22:19.111Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/ab/69a42656adb1d0665ab051eec58a41f169ad295cf81ad45406963105408f/pillow-12.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:559b38da23606e68681337ad74622c4dbba02254fc9cb4488a305dd5975c7eeb", size = 7165438, upload-time = "2026-02-11T04:22:21.041Z" }, + { url = "https://files.pythonhosted.org/packages/02/46/81f7aa8941873f0f01d4b55cc543b0a3d03ec2ee30d617a0448bf6bd6dec/pillow-12.1.1-cp314-cp314-win32.whl", hash = "sha256:03edcc34d688572014ff223c125a3f77fb08091e4607e7745002fc214070b35f", size = 6431503, upload-time = "2026-02-11T04:22:22.833Z" }, + { url = "https://files.pythonhosted.org/packages/40/72/4c245f7d1044b67affc7f134a09ea619d4895333d35322b775b928180044/pillow-12.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:50480dcd74fa63b8e78235957d302d98d98d82ccbfac4c7e12108ba9ecbdba15", size = 7176748, upload-time = "2026-02-11T04:22:24.64Z" }, + { url = "https://files.pythonhosted.org/packages/e4/ad/8a87bdbe038c5c698736e3348af5c2194ffb872ea52f11894c95f9305435/pillow-12.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:5cb1785d97b0c3d1d1a16bc1d710c4a0049daefc4935f3a8f31f827f4d3d2e7f", size = 2544314, upload-time = "2026-02-11T04:22:26.685Z" }, + { url = "https://files.pythonhosted.org/packages/6c/9d/efd18493f9de13b87ede7c47e69184b9e859e4427225ea962e32e56a49bc/pillow-12.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1f90cff8aa76835cba5769f0b3121a22bd4eb9e6884cfe338216e557a9a548b8", size = 5268612, upload-time = "2026-02-11T04:22:29.884Z" }, + { url = "https://files.pythonhosted.org/packages/f8/f1/4f42eb2b388eb2ffc660dcb7f7b556c1015c53ebd5f7f754965ef997585b/pillow-12.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1f1be78ce9466a7ee64bfda57bdba0f7cc499d9794d518b854816c41bf0aa4e9", size = 4660567, upload-time = "2026-02-11T04:22:31.799Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/54/df6ef130fa43e4b82e32624a7b821a2be1c5653a5fdad8469687a7db4e00/pillow-12.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:42fc1f4677106188ad9a55562bbade416f8b55456f522430fadab3cef7cd4e60", size = 6269951, upload-time = "2026-02-11T04:22:33.921Z" }, + { url = "https://files.pythonhosted.org/packages/a9/48/618752d06cc44bb4aae8ce0cd4e6426871929ed7b46215638088270d9b34/pillow-12.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:98edb152429ab62a1818039744d8fbb3ccab98a7c29fc3d5fcef158f3f1f68b7", size = 8074769, upload-time = "2026-02-11T04:22:35.877Z" }, + { url = "https://files.pythonhosted.org/packages/c3/bd/f1d71eb39a72fa088d938655afba3e00b38018d052752f435838961127d8/pillow-12.1.1-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d470ab1178551dd17fdba0fef463359c41aaa613cdcd7ff8373f54be629f9f8f", size = 6381358, upload-time = "2026-02-11T04:22:37.698Z" }, + { url = "https://files.pythonhosted.org/packages/64/ef/c784e20b96674ed36a5af839305f55616f8b4f8aa8eeccf8531a6e312243/pillow-12.1.1-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6408a7b064595afcab0a49393a413732a35788f2a5092fdc6266952ed67de586", size = 7068558, upload-time = "2026-02-11T04:22:39.597Z" }, + { url = "https://files.pythonhosted.org/packages/73/cb/8059688b74422ae61278202c4e1ad992e8a2e7375227be0a21c6b87ca8d5/pillow-12.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5d8c41325b382c07799a3682c1c258469ea2ff97103c53717b7893862d0c98ce", size = 6493028, upload-time = "2026-02-11T04:22:42.73Z" }, + { url = "https://files.pythonhosted.org/packages/c6/da/e3c008ed7d2dd1f905b15949325934510b9d1931e5df999bb15972756818/pillow-12.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c7697918b5be27424e9ce568193efd13d925c4481dd364e43f5dff72d33e10f8", size = 7191940, upload-time = "2026-02-11T04:22:44.543Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/4a/9202e8d11714c1fc5951f2e1ef362f2d7fbc595e1f6717971d5dd750e969/pillow-12.1.1-cp314-cp314t-win32.whl", hash = "sha256:d2912fd8114fc5545aa3a4b5576512f64c55a03f3ebcca4c10194d593d43ea36", size = 6438736, upload-time = "2026-02-11T04:22:46.347Z" }, + { url = "https://files.pythonhosted.org/packages/f3/ca/cbce2327eb9885476b3957b2e82eb12c866a8b16ad77392864ad601022ce/pillow-12.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:4ceb838d4bd9dab43e06c363cab2eebf63846d6a4aeaea283bbdfd8f1a8ed58b", size = 7182894, upload-time = "2026-02-11T04:22:48.114Z" }, + { url = "https://files.pythonhosted.org/packages/ec/d2/de599c95ba0a973b94410477f8bf0b6f0b5e67360eb89bcb1ad365258beb/pillow-12.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7b03048319bfc6170e93bd60728a1af51d3dd7704935feb228c4d4faab35d334", size = 2546446, upload-time = "2026-02-11T04:22:50.342Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, +] + +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, 
upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + +[[package]] +name = "propcache" +version = "0.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/df/6d9c1b6ac12b003837dde8a10231a7344512186e87b36e855bef32241942/propcache-0.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:43eedf29202c08550aac1d14e0ee619b0430aaef78f85864c1a892294fbc28cf", size = 77750, upload-time = "2025-10-08T19:47:07.648Z" }, + { url = "https://files.pythonhosted.org/packages/8b/e8/677a0025e8a2acf07d3418a2e7ba529c9c33caf09d3c1f25513023c1db56/propcache-0.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d62cdfcfd89ccb8de04e0eda998535c406bf5e060ffd56be6c586cbcc05b3311", size = 44780, upload-time = "2025-10-08T19:47:08.851Z" }, + { url = "https://files.pythonhosted.org/packages/89/a4/92380f7ca60f99ebae761936bc48a72a639e8a47b29050615eef757cb2a7/propcache-0.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:cae65ad55793da34db5f54e4029b89d3b9b9490d8abe1b4c7ab5d4b8ec7ebf74", size = 46308, upload-time = "2025-10-08T19:47:09.982Z" }, + { url = "https://files.pythonhosted.org/packages/2d/48/c5ac64dee5262044348d1d78a5f85dd1a57464a60d30daee946699963eb3/propcache-0.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:333ddb9031d2704a301ee3e506dc46b1fe5f294ec198ed6435ad5b6a085facfe", size = 208182, upload-time = "2025-10-08T19:47:11.319Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0c/cd762dd011a9287389a6a3eb43aa30207bde253610cca06824aeabfe9653/propcache-0.4.1-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:fd0858c20f078a32cf55f7e81473d96dcf3b93fd2ccdb3d40fdf54b8573df3af", size = 211215, upload-time = "2025-10-08T19:47:13.146Z" }, + { url = "https://files.pythonhosted.org/packages/30/3e/49861e90233ba36890ae0ca4c660e95df565b2cd15d4a68556ab5865974e/propcache-0.4.1-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:678ae89ebc632c5c204c794f8dab2837c5f159aeb59e6ed0539500400577298c", size = 218112, upload-time = "2025-10-08T19:47:14.913Z" }, + { url = "https://files.pythonhosted.org/packages/f1/8b/544bc867e24e1bd48f3118cecd3b05c694e160a168478fa28770f22fd094/propcache-0.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d472aeb4fbf9865e0c6d622d7f4d54a4e101a89715d8904282bb5f9a2f476c3f", size = 204442, upload-time = "2025-10-08T19:47:16.277Z" }, + { url = "https://files.pythonhosted.org/packages/50/a6/4282772fd016a76d3e5c0df58380a5ea64900afd836cec2c2f662d1b9bb3/propcache-0.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4d3df5fa7e36b3225954fba85589da77a0fe6a53e3976de39caf04a0db4c36f1", size = 199398, upload-time = "2025-10-08T19:47:17.962Z" }, + { url = "https://files.pythonhosted.org/packages/3e/ec/d8a7cd406ee1ddb705db2139f8a10a8a427100347bd698e7014351c7af09/propcache-0.4.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:ee17f18d2498f2673e432faaa71698032b0127ebf23ae5974eeaf806c279df24", size = 196920, upload-time = "2025-10-08T19:47:19.355Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/6c/f38ab64af3764f431e359f8baf9e0a21013e24329e8b85d2da32e8ed07ca/propcache-0.4.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:580e97762b950f993ae618e167e7be9256b8353c2dcd8b99ec100eb50f5286aa", size = 203748, upload-time = "2025-10-08T19:47:21.338Z" }, + { url = "https://files.pythonhosted.org/packages/d6/e3/fa846bd70f6534d647886621388f0a265254d30e3ce47e5c8e6e27dbf153/propcache-0.4.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:501d20b891688eb8e7aa903021f0b72d5a55db40ffaab27edefd1027caaafa61", size = 205877, upload-time = "2025-10-08T19:47:23.059Z" }, + { url = "https://files.pythonhosted.org/packages/e2/39/8163fc6f3133fea7b5f2827e8eba2029a0277ab2c5beee6c1db7b10fc23d/propcache-0.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:9a0bd56e5b100aef69bd8562b74b46254e7c8812918d3baa700c8a8009b0af66", size = 199437, upload-time = "2025-10-08T19:47:24.445Z" }, + { url = "https://files.pythonhosted.org/packages/93/89/caa9089970ca49c7c01662bd0eeedfe85494e863e8043565aeb6472ce8fe/propcache-0.4.1-cp313-cp313-win32.whl", hash = "sha256:bcc9aaa5d80322bc2fb24bb7accb4a30f81e90ab8d6ba187aec0744bc302ad81", size = 37586, upload-time = "2025-10-08T19:47:25.736Z" }, + { url = "https://files.pythonhosted.org/packages/f5/ab/f76ec3c3627c883215b5c8080debb4394ef5a7a29be811f786415fc1e6fd/propcache-0.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:381914df18634f5494334d201e98245c0596067504b9372d8cf93f4bb23e025e", size = 40790, upload-time = "2025-10-08T19:47:26.847Z" }, + { url = "https://files.pythonhosted.org/packages/59/1b/e71ae98235f8e2ba5004d8cb19765a74877abf189bc53fc0c80d799e56c3/propcache-0.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:8873eb4460fd55333ea49b7d189749ecf6e55bf85080f11b1c4530ed3034cba1", size = 37158, upload-time = "2025-10-08T19:47:27.961Z" }, + { url = 
"https://files.pythonhosted.org/packages/83/ce/a31bbdfc24ee0dcbba458c8175ed26089cf109a55bbe7b7640ed2470cfe9/propcache-0.4.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:92d1935ee1f8d7442da9c0c4fa7ac20d07e94064184811b685f5c4fada64553b", size = 81451, upload-time = "2025-10-08T19:47:29.445Z" }, + { url = "https://files.pythonhosted.org/packages/25/9c/442a45a470a68456e710d96cacd3573ef26a1d0a60067e6a7d5e655621ed/propcache-0.4.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:473c61b39e1460d386479b9b2f337da492042447c9b685f28be4f74d3529e566", size = 46374, upload-time = "2025-10-08T19:47:30.579Z" }, + { url = "https://files.pythonhosted.org/packages/f4/bf/b1d5e21dbc3b2e889ea4327044fb16312a736d97640fb8b6aa3f9c7b3b65/propcache-0.4.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c0ef0aaafc66fbd87842a3fe3902fd889825646bc21149eafe47be6072725835", size = 48396, upload-time = "2025-10-08T19:47:31.79Z" }, + { url = "https://files.pythonhosted.org/packages/f4/04/5b4c54a103d480e978d3c8a76073502b18db0c4bc17ab91b3cb5092ad949/propcache-0.4.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f95393b4d66bfae908c3ca8d169d5f79cd65636ae15b5e7a4f6e67af675adb0e", size = 275950, upload-time = "2025-10-08T19:47:33.481Z" }, + { url = "https://files.pythonhosted.org/packages/b4/c1/86f846827fb969c4b78b0af79bba1d1ea2156492e1b83dea8b8a6ae27395/propcache-0.4.1-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c07fda85708bc48578467e85099645167a955ba093be0a2dcba962195676e859", size = 273856, upload-time = "2025-10-08T19:47:34.906Z" }, + { url = "https://files.pythonhosted.org/packages/36/1d/fc272a63c8d3bbad6878c336c7a7dea15e8f2d23a544bda43205dfa83ada/propcache-0.4.1-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:af223b406d6d000830c6f65f1e6431783fc3f713ba3e6cc8c024d5ee96170a4b", size = 280420, upload-time = 
"2025-10-08T19:47:36.338Z" }, + { url = "https://files.pythonhosted.org/packages/07/0c/01f2219d39f7e53d52e5173bcb09c976609ba30209912a0680adfb8c593a/propcache-0.4.1-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a78372c932c90ee474559c5ddfffd718238e8673c340dc21fe45c5b8b54559a0", size = 263254, upload-time = "2025-10-08T19:47:37.692Z" }, + { url = "https://files.pythonhosted.org/packages/2d/18/cd28081658ce597898f0c4d174d4d0f3c5b6d4dc27ffafeef835c95eb359/propcache-0.4.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:564d9f0d4d9509e1a870c920a89b2fec951b44bf5ba7d537a9e7c1ccec2c18af", size = 261205, upload-time = "2025-10-08T19:47:39.659Z" }, + { url = "https://files.pythonhosted.org/packages/7a/71/1f9e22eb8b8316701c2a19fa1f388c8a3185082607da8e406a803c9b954e/propcache-0.4.1-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:17612831fda0138059cc5546f4d12a2aacfb9e47068c06af35c400ba58ba7393", size = 247873, upload-time = "2025-10-08T19:47:41.084Z" }, + { url = "https://files.pythonhosted.org/packages/4a/65/3d4b61f36af2b4eddba9def857959f1016a51066b4f1ce348e0cf7881f58/propcache-0.4.1-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:41a89040cb10bd345b3c1a873b2bf36413d48da1def52f268a055f7398514874", size = 262739, upload-time = "2025-10-08T19:47:42.51Z" }, + { url = "https://files.pythonhosted.org/packages/2a/42/26746ab087faa77c1c68079b228810436ccd9a5ce9ac85e2b7307195fd06/propcache-0.4.1-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e35b88984e7fa64aacecea39236cee32dd9bd8c55f57ba8a75cf2399553f9bd7", size = 263514, upload-time = "2025-10-08T19:47:43.927Z" }, + { url = "https://files.pythonhosted.org/packages/94/13/630690fe201f5502d2403dd3cfd451ed8858fe3c738ee88d095ad2ff407b/propcache-0.4.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:6f8b465489f927b0df505cbe26ffbeed4d6d8a2bbc61ce90eb074ff129ef0ab1", size = 257781, upload-time = "2025-10-08T19:47:45.448Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/f7/1d4ec5841505f423469efbfc381d64b7b467438cd5a4bbcbb063f3b73d27/propcache-0.4.1-cp313-cp313t-win32.whl", hash = "sha256:2ad890caa1d928c7c2965b48f3a3815c853180831d0e5503d35cf00c472f4717", size = 41396, upload-time = "2025-10-08T19:47:47.202Z" }, + { url = "https://files.pythonhosted.org/packages/48/f0/615c30622316496d2cbbc29f5985f7777d3ada70f23370608c1d3e081c1f/propcache-0.4.1-cp313-cp313t-win_amd64.whl", hash = "sha256:f7ee0e597f495cf415bcbd3da3caa3bd7e816b74d0d52b8145954c5e6fd3ff37", size = 44897, upload-time = "2025-10-08T19:47:48.336Z" }, + { url = "https://files.pythonhosted.org/packages/fd/ca/6002e46eccbe0e33dcd4069ef32f7f1c9e243736e07adca37ae8c4830ec3/propcache-0.4.1-cp313-cp313t-win_arm64.whl", hash = "sha256:929d7cbe1f01bb7baffb33dc14eb5691c95831450a26354cd210a8155170c93a", size = 39789, upload-time = "2025-10-08T19:47:49.876Z" }, + { url = "https://files.pythonhosted.org/packages/8e/5c/bca52d654a896f831b8256683457ceddd490ec18d9ec50e97dfd8fc726a8/propcache-0.4.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3f7124c9d820ba5548d431afb4632301acf965db49e666aa21c305cbe8c6de12", size = 78152, upload-time = "2025-10-08T19:47:51.051Z" }, + { url = "https://files.pythonhosted.org/packages/65/9b/03b04e7d82a5f54fb16113d839f5ea1ede58a61e90edf515f6577c66fa8f/propcache-0.4.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:c0d4b719b7da33599dfe3b22d3db1ef789210a0597bc650b7cee9c77c2be8c5c", size = 44869, upload-time = "2025-10-08T19:47:52.594Z" }, + { url = "https://files.pythonhosted.org/packages/b2/fa/89a8ef0468d5833a23fff277b143d0573897cf75bd56670a6d28126c7d68/propcache-0.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9f302f4783709a78240ebc311b793f123328716a60911d667e0c036bc5dcbded", size = 46596, upload-time = "2025-10-08T19:47:54.073Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/bd/47816020d337f4a746edc42fe8d53669965138f39ee117414c7d7a340cfe/propcache-0.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c80ee5802e3fb9ea37938e7eecc307fb984837091d5fd262bb37238b1ae97641", size = 206981, upload-time = "2025-10-08T19:47:55.715Z" }, + { url = "https://files.pythonhosted.org/packages/df/f6/c5fa1357cc9748510ee55f37173eb31bfde6d94e98ccd9e6f033f2fc06e1/propcache-0.4.1-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ed5a841e8bb29a55fb8159ed526b26adc5bdd7e8bd7bf793ce647cb08656cdf4", size = 211490, upload-time = "2025-10-08T19:47:57.499Z" }, + { url = "https://files.pythonhosted.org/packages/80/1e/e5889652a7c4a3846683401a48f0f2e5083ce0ec1a8a5221d8058fbd1adf/propcache-0.4.1-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:55c72fd6ea2da4c318e74ffdf93c4fe4e926051133657459131a95c846d16d44", size = 215371, upload-time = "2025-10-08T19:47:59.317Z" }, + { url = "https://files.pythonhosted.org/packages/b2/f2/889ad4b2408f72fe1a4f6a19491177b30ea7bf1a0fd5f17050ca08cfc882/propcache-0.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8326e144341460402713f91df60ade3c999d601e7eb5ff8f6f7862d54de0610d", size = 201424, upload-time = "2025-10-08T19:48:00.67Z" }, + { url = "https://files.pythonhosted.org/packages/27/73/033d63069b57b0812c8bd19f311faebeceb6ba31b8f32b73432d12a0b826/propcache-0.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:060b16ae65bc098da7f6d25bf359f1f31f688384858204fe5d652979e0015e5b", size = 197566, upload-time = "2025-10-08T19:48:02.604Z" }, + { url = "https://files.pythonhosted.org/packages/dc/89/ce24f3dc182630b4e07aa6d15f0ff4b14ed4b9955fae95a0b54c58d66c05/propcache-0.4.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:89eb3fa9524f7bec9de6e83cf3faed9d79bffa560672c118a96a171a6f55831e", size 
= 193130, upload-time = "2025-10-08T19:48:04.499Z" }, + { url = "https://files.pythonhosted.org/packages/a9/24/ef0d5fd1a811fb5c609278d0209c9f10c35f20581fcc16f818da959fc5b4/propcache-0.4.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:dee69d7015dc235f526fe80a9c90d65eb0039103fe565776250881731f06349f", size = 202625, upload-time = "2025-10-08T19:48:06.213Z" }, + { url = "https://files.pythonhosted.org/packages/f5/02/98ec20ff5546f68d673df2f7a69e8c0d076b5abd05ca882dc7ee3a83653d/propcache-0.4.1-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:5558992a00dfd54ccbc64a32726a3357ec93825a418a401f5cc67df0ac5d9e49", size = 204209, upload-time = "2025-10-08T19:48:08.432Z" }, + { url = "https://files.pythonhosted.org/packages/a0/87/492694f76759b15f0467a2a93ab68d32859672b646aa8a04ce4864e7932d/propcache-0.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c9b822a577f560fbd9554812526831712c1436d2c046cedee4c3796d3543b144", size = 197797, upload-time = "2025-10-08T19:48:09.968Z" }, + { url = "https://files.pythonhosted.org/packages/ee/36/66367de3575db1d2d3f3d177432bd14ee577a39d3f5d1b3d5df8afe3b6e2/propcache-0.4.1-cp314-cp314-win32.whl", hash = "sha256:ab4c29b49d560fe48b696cdcb127dd36e0bc2472548f3bf56cc5cb3da2b2984f", size = 38140, upload-time = "2025-10-08T19:48:11.232Z" }, + { url = "https://files.pythonhosted.org/packages/0c/2a/a758b47de253636e1b8aef181c0b4f4f204bf0dd964914fb2af90a95b49b/propcache-0.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:5a103c3eb905fcea0ab98be99c3a9a5ab2de60228aa5aceedc614c0281cf6153", size = 41257, upload-time = "2025-10-08T19:48:12.707Z" }, + { url = "https://files.pythonhosted.org/packages/34/5e/63bd5896c3fec12edcbd6f12508d4890d23c265df28c74b175e1ef9f4f3b/propcache-0.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:74c1fb26515153e482e00177a1ad654721bf9207da8a494a0c05e797ad27b992", size = 38097, upload-time = "2025-10-08T19:48:13.923Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/85/9ff785d787ccf9bbb3f3106f79884a130951436f58392000231b4c737c80/propcache-0.4.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:824e908bce90fb2743bd6b59db36eb4f45cd350a39637c9f73b1c1ea66f5b75f", size = 81455, upload-time = "2025-10-08T19:48:15.16Z" }, + { url = "https://files.pythonhosted.org/packages/90/85/2431c10c8e7ddb1445c1f7c4b54d886e8ad20e3c6307e7218f05922cad67/propcache-0.4.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:c2b5e7db5328427c57c8e8831abda175421b709672f6cfc3d630c3b7e2146393", size = 46372, upload-time = "2025-10-08T19:48:16.424Z" }, + { url = "https://files.pythonhosted.org/packages/01/20/b0972d902472da9bcb683fa595099911f4d2e86e5683bcc45de60dd05dc3/propcache-0.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:6f6ff873ed40292cd4969ef5310179afd5db59fdf055897e282485043fc80ad0", size = 48411, upload-time = "2025-10-08T19:48:17.577Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e3/7dc89f4f21e8f99bad3d5ddb3a3389afcf9da4ac69e3deb2dcdc96e74169/propcache-0.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49a2dc67c154db2c1463013594c458881a069fcf98940e61a0569016a583020a", size = 275712, upload-time = "2025-10-08T19:48:18.901Z" }, + { url = "https://files.pythonhosted.org/packages/20/67/89800c8352489b21a8047c773067644e3897f02ecbbd610f4d46b7f08612/propcache-0.4.1-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:005f08e6a0529984491e37d8dbc3dd86f84bd78a8ceb5fa9a021f4c48d4984be", size = 273557, upload-time = "2025-10-08T19:48:20.762Z" }, + { url = "https://files.pythonhosted.org/packages/e2/a1/b52b055c766a54ce6d9c16d9aca0cad8059acd9637cdf8aa0222f4a026ef/propcache-0.4.1-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5c3310452e0d31390da9035c348633b43d7e7feb2e37be252be6da45abd1abcc", size = 280015, upload-time = 
"2025-10-08T19:48:22.592Z" }, + { url = "https://files.pythonhosted.org/packages/48/c8/33cee30bd890672c63743049f3c9e4be087e6780906bfc3ec58528be59c1/propcache-0.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4c3c70630930447f9ef1caac7728c8ad1c56bc5015338b20fed0d08ea2480b3a", size = 262880, upload-time = "2025-10-08T19:48:23.947Z" }, + { url = "https://files.pythonhosted.org/packages/0c/b1/8f08a143b204b418285c88b83d00edbd61afbc2c6415ffafc8905da7038b/propcache-0.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8e57061305815dfc910a3634dcf584f08168a8836e6999983569f51a8544cd89", size = 260938, upload-time = "2025-10-08T19:48:25.656Z" }, + { url = "https://files.pythonhosted.org/packages/cf/12/96e4664c82ca2f31e1c8dff86afb867348979eb78d3cb8546a680287a1e9/propcache-0.4.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:521a463429ef54143092c11a77e04056dd00636f72e8c45b70aaa3140d639726", size = 247641, upload-time = "2025-10-08T19:48:27.207Z" }, + { url = "https://files.pythonhosted.org/packages/18/ed/e7a9cfca28133386ba52278136d42209d3125db08d0a6395f0cba0c0285c/propcache-0.4.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:120c964da3fdc75e3731aa392527136d4ad35868cc556fd09bb6d09172d9a367", size = 262510, upload-time = "2025-10-08T19:48:28.65Z" }, + { url = "https://files.pythonhosted.org/packages/f5/76/16d8bf65e8845dd62b4e2b57444ab81f07f40caa5652b8969b87ddcf2ef6/propcache-0.4.1-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:d8f353eb14ee3441ee844ade4277d560cdd68288838673273b978e3d6d2c8f36", size = 263161, upload-time = "2025-10-08T19:48:30.133Z" }, + { url = "https://files.pythonhosted.org/packages/e7/70/c99e9edb5d91d5ad8a49fa3c1e8285ba64f1476782fed10ab251ff413ba1/propcache-0.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ab2943be7c652f09638800905ee1bab2c544e537edb57d527997a24c13dc1455", size = 257393, upload-time = "2025-10-08T19:48:31.567Z" }, + { url = 
"https://files.pythonhosted.org/packages/08/02/87b25304249a35c0915d236575bc3574a323f60b47939a2262b77632a3ee/propcache-0.4.1-cp314-cp314t-win32.whl", hash = "sha256:05674a162469f31358c30bcaa8883cb7829fa3110bf9c0991fe27d7896c42d85", size = 42546, upload-time = "2025-10-08T19:48:32.872Z" }, + { url = "https://files.pythonhosted.org/packages/cb/ef/3c6ecf8b317aa982f309835e8f96987466123c6e596646d4e6a1dfcd080f/propcache-0.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:990f6b3e2a27d683cb7602ed6c86f15ee6b43b1194736f9baaeb93d0016633b1", size = 46259, upload-time = "2025-10-08T19:48:34.226Z" }, + { url = "https://files.pythonhosted.org/packages/c4/2d/346e946d4951f37eca1e4f55be0f0174c52cd70720f84029b02f296f4a38/propcache-0.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:ecef2343af4cc68e05131e45024ba34f6095821988a9d0a02aa7c73fcc448aa9", size = 40428, upload-time = "2025-10-08T19:48:35.441Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" }, +] + +[[package]] +name = "pycparser" +version = "3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, +] + +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } 
+dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, + { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, + { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, + { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, 
upload-time = "2025-11-04T13:40:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, + { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, + { url = 
"https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, + { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, + { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { 
url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = 
"https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = 
"https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { 
url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, +] + +[[package]] +name = "pydantic-settings" +version = "2.12.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pydantic" }, + { name = "python-dotenv" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/43/4b/ac7e0aae12027748076d72a8764ff1c9d82ca75a7a52622e67ed3f765c54/pydantic_settings-2.12.0.tar.gz", hash = "sha256:005538ef951e3c2a68e1c08b292b5f2e71490def8589d4221b95dab00dafcfd0", size = 194184, upload-time = "2025-11-10T14:25:47.013Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/60/5d4751ba3f4a40a6891f24eec885f51afd78d208498268c734e256fb13c4/pydantic_settings-2.12.0-py3-none-any.whl", hash = 
"sha256:fddb9fd99a5b18da837b29710391e945b1e30c135477f484084ee513adb93809", size = 51880, upload-time = "2025-11-10T14:25:45.546Z" }, +] + +[[package]] +name = "pygments" +version = "2.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/b2/bc9c9196916376152d655522fdcebac55e66de6603a76a02bca1b6414f6c/pygments-2.20.0.tar.gz", hash = "sha256:6757cd03768053ff99f3039c1a36d6c0aa0b263438fcab17520b30a303a82b5f", size = 4955991, upload-time = "2026-03-29T13:29:33.898Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/7e/a72dd26f3b0f4f2bf1dd8923c85f7ceb43172af56d63c7383eb62b332364/pygments-2.20.0-py3-none-any.whl", hash = "sha256:81a9e26dd42fd28a23a2d169d86d7ac03b46e2f8b59ed4698fb4785f946d0176", size = 1231151, upload-time = "2026-03-29T13:29:30.038Z" }, +] + +[[package]] +name = "pyjwt" +version = "2.12.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/10/e8192be5f38f3e8e7e046716de4cae33d56fd5ae08927a823bb916be36c1/pyjwt-2.12.0.tar.gz", hash = "sha256:2f62390b667cd8257de560b850bb5a883102a388829274147f1d724453f8fb02", size = 102511, upload-time = "2026-03-12T17:15:30.831Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/70/70f895f404d363d291dcf62c12c85fdd47619ad9674ac0f53364d035925a/pyjwt-2.12.0-py3-none-any.whl", hash = "sha256:9bb459d1bdd0387967d287f5656bf7ec2b9a26645d1961628cda1764e087fd6e", size = 29700, upload-time = "2026-03-12T17:15:29.257Z" }, +] + +[package.optional-dependencies] +crypto = [ + { name = "cryptography" }, +] + +[[package]] +name = "pytest" +version = "9.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, +] + +[[package]] +name = "pytest-asyncio" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, +] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = 
"sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + +[[package]] +name = "python-dotenv" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, +] + +[[package]] +name = "python-multipart" +version = "0.0.22" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, +] + +[[package]] +name = "pywin32" +version = "311" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" }, + { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" }, + { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, upload-time = "2025-07-14T20:13:34.312Z" }, + { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" }, +] + +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = 
"https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, +] + +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] + 
+[[package]] +name = "requests" +version = "2.33.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "certifi" }, + { name = "charset-normalizer" }, + { name = "idna" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/34/64/8860370b167a9721e8956ae116825caff829224fbca0ca6e7bf8ddef8430/requests-2.33.0.tar.gz", hash = "sha256:c7ebc5e8b0f21837386ad0e1c8fe8b829fa5f544d8df3b2253bff14ef29d7652", size = 134232, upload-time = "2026-03-25T15:10:41.586Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/5d/c814546c2333ceea4ba42262d8c4d55763003e767fa169adc693bd524478/requests-2.33.0-py3-none-any.whl", hash = "sha256:3324635456fa185245e24865e810cecec7b4caf933d7eb133dcde67d48cee69b", size = 65017, upload-time = "2026-03-25T15:10:40.382Z" }, +] + +[[package]] +name = "rich" +version = "14.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markdown-it-py" }, + { name = "pygments" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/74/99/a4cab2acbb884f80e558b0771e97e21e939c5dfb460f488d19df485e8298/rich-14.3.2.tar.gz", hash = "sha256:e712f11c1a562a11843306f5ed999475f09ac31ffb64281f73ab29ffdda8b3b8", size = 230143, upload-time = "2026-02-01T16:20:47.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/45/615f5babd880b4bd7d405cc0dc348234c5ffb6ed1ea33e152ede08b2072d/rich-14.3.2-py3-none-any.whl", hash = "sha256:08e67c3e90884651da3239ea668222d19bea7b589149d8014a21c633420dbb69", size = 309963, upload-time = "2026-02-01T16:20:46.078Z" }, +] + +[[package]] +name = "rpds-py" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } +wheels = [ 
+ { url = "https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2", size = 374887, upload-time = "2025-11-30T20:22:41.812Z" }, + { url = "https://files.pythonhosted.org/packages/fd/32/55fb50ae104061dbc564ef15cc43c013dc4a9f4527a1f4d99baddf56fe5f/rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8", size = 358904, upload-time = "2025-11-30T20:22:43.479Z" }, + { url = "https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4", size = 389945, upload-time = "2025-11-30T20:22:44.819Z" }, + { url = "https://files.pythonhosted.org/packages/bd/a8/073cac3ed2c6387df38f71296d002ab43496a96b92c823e76f46b8af0543/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136", size = 407783, upload-time = "2025-11-30T20:22:46.103Z" }, + { url = "https://files.pythonhosted.org/packages/77/57/5999eb8c58671f1c11eba084115e77a8899d6e694d2a18f69f0ba471ec8b/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7", size = 515021, upload-time = "2025-11-30T20:22:47.458Z" }, + { url = "https://files.pythonhosted.org/packages/e0/af/5ab4833eadc36c0a8ed2bc5c0de0493c04f6c06de223170bd0798ff98ced/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2", size = 414589, upload-time = "2025-11-30T20:22:48.872Z" }, + { url = 
"https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6", size = 394025, upload-time = "2025-11-30T20:22:50.196Z" }, + { url = "https://files.pythonhosted.org/packages/91/c4/fc70cd0249496493500e7cc2de87504f5aa6509de1e88623431fec76d4b6/rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e", size = 408895, upload-time = "2025-11-30T20:22:51.87Z" }, + { url = "https://files.pythonhosted.org/packages/58/95/d9275b05ab96556fefff73a385813eb66032e4c99f411d0795372d9abcea/rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d", size = 422799, upload-time = "2025-11-30T20:22:53.341Z" }, + { url = "https://files.pythonhosted.org/packages/06/c1/3088fc04b6624eb12a57eb814f0d4997a44b0d208d6cace713033ff1a6ba/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7", size = 572731, upload-time = "2025-11-30T20:22:54.778Z" }, + { url = "https://files.pythonhosted.org/packages/d8/42/c612a833183b39774e8ac8fecae81263a68b9583ee343db33ab571a7ce55/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31", size = 599027, upload-time = "2025-11-30T20:22:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/5f/60/525a50f45b01d70005403ae0e25f43c0384369ad24ffe46e8d9068b50086/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95", size = 563020, upload-time = "2025-11-30T20:22:58.2Z" }, + { url = 
"https://files.pythonhosted.org/packages/0b/5d/47c4655e9bcd5ca907148535c10e7d489044243cc9941c16ed7cd53be91d/rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d", size = 223139, upload-time = "2025-11-30T20:23:00.209Z" }, + { url = "https://files.pythonhosted.org/packages/f2/e1/485132437d20aa4d3e1d8b3fb5a5e65aa8139f1e097080c2a8443201742c/rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15", size = 240224, upload-time = "2025-11-30T20:23:02.008Z" }, + { url = "https://files.pythonhosted.org/packages/24/95/ffd128ed1146a153d928617b0ef673960130be0009c77d8fbf0abe306713/rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1", size = 230645, upload-time = "2025-11-30T20:23:03.43Z" }, + { url = "https://files.pythonhosted.org/packages/ff/1b/b10de890a0def2a319a2626334a7f0ae388215eb60914dbac8a3bae54435/rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a", size = 364443, upload-time = "2025-11-30T20:23:04.878Z" }, + { url = "https://files.pythonhosted.org/packages/0d/bf/27e39f5971dc4f305a4fb9c672ca06f290f7c4e261c568f3dea16a410d47/rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e", size = 353375, upload-time = "2025-11-30T20:23:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/40/58/442ada3bba6e8e6615fc00483135c14a7538d2ffac30e2d933ccf6852232/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000", size = 383850, upload-time = "2025-11-30T20:23:07.825Z" }, + { url = 
"https://files.pythonhosted.org/packages/14/14/f59b0127409a33c6ef6f5c1ebd5ad8e32d7861c9c7adfa9a624fc3889f6c/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db", size = 392812, upload-time = "2025-11-30T20:23:09.228Z" }, + { url = "https://files.pythonhosted.org/packages/b3/66/e0be3e162ac299b3a22527e8913767d869e6cc75c46bd844aa43fb81ab62/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2", size = 517841, upload-time = "2025-11-30T20:23:11.186Z" }, + { url = "https://files.pythonhosted.org/packages/3d/55/fa3b9cf31d0c963ecf1ba777f7cf4b2a2c976795ac430d24a1f43d25a6ba/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa", size = 408149, upload-time = "2025-11-30T20:23:12.864Z" }, + { url = "https://files.pythonhosted.org/packages/60/ca/780cf3b1a32b18c0f05c441958d3758f02544f1d613abf9488cd78876378/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083", size = 383843, upload-time = "2025-11-30T20:23:14.638Z" }, + { url = "https://files.pythonhosted.org/packages/82/86/d5f2e04f2aa6247c613da0c1dd87fcd08fa17107e858193566048a1e2f0a/rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9", size = 396507, upload-time = "2025-11-30T20:23:16.105Z" }, + { url = "https://files.pythonhosted.org/packages/4b/9a/453255d2f769fe44e07ea9785c8347edaf867f7026872e76c1ad9f7bed92/rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0", size = 414949, upload-time = "2025-11-30T20:23:17.539Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/31/622a86cdc0c45d6df0e9ccb6becdba5074735e7033c20e401a6d9d0e2ca0/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94", size = 565790, upload-time = "2025-11-30T20:23:19.029Z" }, + { url = "https://files.pythonhosted.org/packages/1c/5d/15bbf0fb4a3f58a3b1c67855ec1efcc4ceaef4e86644665fff03e1b66d8d/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08", size = 590217, upload-time = "2025-11-30T20:23:20.885Z" }, + { url = "https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" }, + { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" }, + { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = "2025-11-30T20:23:25.908Z" }, + { url = "https://files.pythonhosted.org/packages/86/81/dad16382ebbd3d0e0328776d8fd7ca94220e4fa0798d1dc5e7da48cb3201/rpds_py-0.30.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:68f19c879420aa08f61203801423f6cd5ac5f0ac4ac82a2368a9fcd6a9a075e0", size = 362099, upload-time = "2025-11-30T20:23:27.316Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/60/19f7884db5d5603edf3c6bce35408f45ad3e97e10007df0e17dd57af18f8/rpds_py-0.30.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ec7c4490c672c1a0389d319b3a9cfcd098dcdc4783991553c332a15acf7249be", size = 353192, upload-time = "2025-11-30T20:23:29.151Z" }, + { url = "https://files.pythonhosted.org/packages/bf/c4/76eb0e1e72d1a9c4703c69607cec123c29028bff28ce41588792417098ac/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f251c812357a3fed308d684a5079ddfb9d933860fc6de89f2b7ab00da481e65f", size = 384080, upload-time = "2025-11-30T20:23:30.785Z" }, + { url = "https://files.pythonhosted.org/packages/72/87/87ea665e92f3298d1b26d78814721dc39ed8d2c74b86e83348d6b48a6f31/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac98b175585ecf4c0348fd7b29c3864bda53b805c773cbf7bfdaffc8070c976f", size = 394841, upload-time = "2025-11-30T20:23:32.209Z" }, + { url = "https://files.pythonhosted.org/packages/77/ad/7783a89ca0587c15dcbf139b4a8364a872a25f861bdb88ed99f9b0dec985/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3e62880792319dbeb7eb866547f2e35973289e7d5696c6e295476448f5b63c87", size = 516670, upload-time = "2025-11-30T20:23:33.742Z" }, + { url = "https://files.pythonhosted.org/packages/5b/3c/2882bdac942bd2172f3da574eab16f309ae10a3925644e969536553cb4ee/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4e7fc54e0900ab35d041b0601431b0a0eb495f0851a0639b6ef90f7741b39a18", size = 408005, upload-time = "2025-11-30T20:23:35.253Z" }, + { url = "https://files.pythonhosted.org/packages/ce/81/9a91c0111ce1758c92516a3e44776920b579d9a7c09b2b06b642d4de3f0f/rpds_py-0.30.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47e77dc9822d3ad616c3d5759ea5631a75e5809d5a28707744ef79d7a1bcfcad", size = 382112, upload-time = "2025-11-30T20:23:36.842Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/8e/1da49d4a107027e5fbc64daeab96a0706361a2918da10cb41769244b805d/rpds_py-0.30.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:b4dc1a6ff022ff85ecafef7979a2c6eb423430e05f1165d6688234e62ba99a07", size = 399049, upload-time = "2025-11-30T20:23:38.343Z" }, + { url = "https://files.pythonhosted.org/packages/df/5a/7ee239b1aa48a127570ec03becbb29c9d5a9eb092febbd1699d567cae859/rpds_py-0.30.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4559c972db3a360808309e06a74628b95eaccbf961c335c8fe0d590cf587456f", size = 415661, upload-time = "2025-11-30T20:23:40.263Z" }, + { url = "https://files.pythonhosted.org/packages/70/ea/caa143cf6b772f823bc7929a45da1fa83569ee49b11d18d0ada7f5ee6fd6/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ed177ed9bded28f8deb6ab40c183cd1192aa0de40c12f38be4d59cd33cb5c65", size = 565606, upload-time = "2025-11-30T20:23:42.186Z" }, + { url = "https://files.pythonhosted.org/packages/64/91/ac20ba2d69303f961ad8cf55bf7dbdb4763f627291ba3d0d7d67333cced9/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:ad1fa8db769b76ea911cb4e10f049d80bf518c104f15b3edb2371cc65375c46f", size = 591126, upload-time = "2025-11-30T20:23:44.086Z" }, + { url = "https://files.pythonhosted.org/packages/21/20/7ff5f3c8b00c8a95f75985128c26ba44503fb35b8e0259d812766ea966c7/rpds_py-0.30.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:46e83c697b1f1c72b50e5ee5adb4353eef7406fb3f2043d64c33f20ad1c2fc53", size = 553371, upload-time = "2025-11-30T20:23:46.004Z" }, + { url = "https://files.pythonhosted.org/packages/72/c7/81dadd7b27c8ee391c132a6b192111ca58d866577ce2d9b0ca157552cce0/rpds_py-0.30.0-cp314-cp314-win32.whl", hash = "sha256:ee454b2a007d57363c2dfd5b6ca4a5d7e2c518938f8ed3b706e37e5d470801ed", size = 215298, upload-time = "2025-11-30T20:23:47.696Z" }, + { url = 
"https://files.pythonhosted.org/packages/3e/d2/1aaac33287e8cfb07aab2e6b8ac1deca62f6f65411344f1433c55e6f3eb8/rpds_py-0.30.0-cp314-cp314-win_amd64.whl", hash = "sha256:95f0802447ac2d10bcc69f6dc28fe95fdf17940367b21d34e34c737870758950", size = 228604, upload-time = "2025-11-30T20:23:49.501Z" }, + { url = "https://files.pythonhosted.org/packages/e8/95/ab005315818cc519ad074cb7784dae60d939163108bd2b394e60dc7b5461/rpds_py-0.30.0-cp314-cp314-win_arm64.whl", hash = "sha256:613aa4771c99f03346e54c3f038e4cc574ac09a3ddfb0e8878487335e96dead6", size = 222391, upload-time = "2025-11-30T20:23:50.96Z" }, + { url = "https://files.pythonhosted.org/packages/9e/68/154fe0194d83b973cdedcdcc88947a2752411165930182ae41d983dcefa6/rpds_py-0.30.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7e6ecfcb62edfd632e56983964e6884851786443739dbfe3582947e87274f7cb", size = 364868, upload-time = "2025-11-30T20:23:52.494Z" }, + { url = "https://files.pythonhosted.org/packages/83/69/8bbc8b07ec854d92a8b75668c24d2abcb1719ebf890f5604c61c9369a16f/rpds_py-0.30.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:a1d0bc22a7cdc173fedebb73ef81e07faef93692b8c1ad3733b67e31e1b6e1b8", size = 353747, upload-time = "2025-11-30T20:23:54.036Z" }, + { url = "https://files.pythonhosted.org/packages/ab/00/ba2e50183dbd9abcce9497fa5149c62b4ff3e22d338a30d690f9af970561/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d08f00679177226c4cb8c5265012eea897c8ca3b93f429e546600c971bcbae7", size = 383795, upload-time = "2025-11-30T20:23:55.556Z" }, + { url = "https://files.pythonhosted.org/packages/05/6f/86f0272b84926bcb0e4c972262f54223e8ecc556b3224d281e6598fc9268/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5965af57d5848192c13534f90f9dd16464f3c37aaf166cc1da1cae1fd5a34898", size = 393330, upload-time = "2025-11-30T20:23:57.033Z" }, + { url = 
"https://files.pythonhosted.org/packages/cb/e9/0e02bb2e6dc63d212641da45df2b0bf29699d01715913e0d0f017ee29438/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a4e86e34e9ab6b667c27f3211ca48f73dba7cd3d90f8d5b11be56e5dbc3fb4e", size = 518194, upload-time = "2025-11-30T20:23:58.637Z" }, + { url = "https://files.pythonhosted.org/packages/ee/ca/be7bca14cf21513bdf9c0606aba17d1f389ea2b6987035eb4f62bd923f25/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d3e6b26f2c785d65cc25ef1e5267ccbe1b069c5c21b8cc724efee290554419", size = 408340, upload-time = "2025-11-30T20:24:00.2Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c7/736e00ebf39ed81d75544c0da6ef7b0998f8201b369acf842f9a90dc8fce/rpds_py-0.30.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:626a7433c34566535b6e56a1b39a7b17ba961e97ce3b80ec62e6f1312c025551", size = 383765, upload-time = "2025-11-30T20:24:01.759Z" }, + { url = "https://files.pythonhosted.org/packages/4a/3f/da50dfde9956aaf365c4adc9533b100008ed31aea635f2b8d7b627e25b49/rpds_py-0.30.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:acd7eb3f4471577b9b5a41baf02a978e8bdeb08b4b355273994f8b87032000a8", size = 396834, upload-time = "2025-11-30T20:24:03.687Z" }, + { url = "https://files.pythonhosted.org/packages/4e/00/34bcc2565b6020eab2623349efbdec810676ad571995911f1abdae62a3a0/rpds_py-0.30.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe5fa731a1fa8a0a56b0977413f8cacac1768dad38d16b3a296712709476fbd5", size = 415470, upload-time = "2025-11-30T20:24:05.232Z" }, + { url = "https://files.pythonhosted.org/packages/8c/28/882e72b5b3e6f718d5453bd4d0d9cf8df36fddeb4ddbbab17869d5868616/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:74a3243a411126362712ee1524dfc90c650a503502f135d54d1b352bd01f2404", size = 565630, upload-time = "2025-11-30T20:24:06.878Z" }, + { url = 
"https://files.pythonhosted.org/packages/3b/97/04a65539c17692de5b85c6e293520fd01317fd878ea1995f0367d4532fb1/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:3e8eeb0544f2eb0d2581774be4c3410356eba189529a6b3e36bbbf9696175856", size = 591148, upload-time = "2025-11-30T20:24:08.445Z" }, + { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" }, + { url = "https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" }, + { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" }, +] + +[[package]] +name = "ruamel-yaml" +version = "0.17.40" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d1/d6/eb2833ccba5ea36f8f4de4bcfa0d1a91eb618f832d430b70e3086821f251/ruamel.yaml-0.17.40.tar.gz", hash = "sha256:6024b986f06765d482b5b07e086cc4b4cd05dd22ddcbc758fa23d54873cf313d", size = 137672, upload-time = "2023-10-20T12:53:56.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/35/79/5e2cffa1c77432f11cd93a5351f30732c997a239d3a3090856a72d6d8ba7/ruamel.yaml-0.17.40-py3-none-any.whl", hash = "sha256:b16b6c3816dff0a93dca12acf5e70afd089fa5acb80604afd1ffa8b465b7722c", size = 113666, upload-time = "2023-10-20T12:53:52.628Z" }, +] + +[[package]] +name = "ruff" +version = "0.15.0" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c8/39/5cee96809fbca590abea6b46c6d1c586b49663d1d2830a751cc8fc42c666/ruff-0.15.0.tar.gz", hash = "sha256:6bdea47cdbea30d40f8f8d7d69c0854ba7c15420ec75a26f463290949d7f7e9a", size = 4524893, upload-time = "2026-02-03T17:53:35.357Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/88/3fd1b0aa4b6330d6aaa63a285bc96c9f71970351579152d231ed90914586/ruff-0.15.0-py3-none-linux_armv6l.whl", hash = "sha256:aac4ebaa612a82b23d45964586f24ae9bc23ca101919f5590bdb368d74ad5455", size = 10354332, upload-time = "2026-02-03T17:52:54.892Z" }, + { url = "https://files.pythonhosted.org/packages/72/f6/62e173fbb7eb75cc29fe2576a1e20f0a46f671a2587b5f604bfb0eaf5f6f/ruff-0.15.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:dcd4be7cc75cfbbca24a98d04d0b9b36a270d0833241f776b788d59f4142b14d", size = 10767189, upload-time = "2026-02-03T17:53:19.778Z" }, + { url = "https://files.pythonhosted.org/packages/99/e4/968ae17b676d1d2ff101d56dc69cf333e3a4c985e1ec23803df84fc7bf9e/ruff-0.15.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d747e3319b2bce179c7c1eaad3d884dc0a199b5f4d5187620530adf9105268ce", size = 10075384, upload-time = "2026-02-03T17:53:29.241Z" }, + { url = "https://files.pythonhosted.org/packages/a2/bf/9843c6044ab9e20af879c751487e61333ca79a2c8c3058b15722386b8cae/ruff-0.15.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:650bd9c56ae03102c51a5e4b554d74d825ff3abe4db22b90fd32d816c2e90621", size = 10481363, upload-time = "2026-02-03T17:52:43.332Z" }, + { url = "https://files.pythonhosted.org/packages/55/d9/4ada5ccf4cd1f532db1c8d44b6f664f2208d3d93acbeec18f82315e15193/ruff-0.15.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a6664b7eac559e3048223a2da77769c2f92b43a6dfd4720cef42654299a599c9", size = 10187736, upload-time = "2026-02-03T17:53:00.522Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/e2/f25eaecd446af7bb132af0a1d5b135a62971a41f5366ff41d06d25e77a91/ruff-0.15.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f811f97b0f092b35320d1556f3353bf238763420ade5d9e62ebd2b73f2ff179", size = 10968415, upload-time = "2026-02-03T17:53:15.705Z" }, + { url = "https://files.pythonhosted.org/packages/e7/dc/f06a8558d06333bf79b497d29a50c3a673d9251214e0d7ec78f90b30aa79/ruff-0.15.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:761ec0a66680fab6454236635a39abaf14198818c8cdf691e036f4bc0f406b2d", size = 11809643, upload-time = "2026-02-03T17:53:23.031Z" }, + { url = "https://files.pythonhosted.org/packages/dd/45/0ece8db2c474ad7df13af3a6d50f76e22a09d078af63078f005057ca59eb/ruff-0.15.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:940f11c2604d317e797b289f4f9f3fa5555ffe4fb574b55ed006c3d9b6f0eb78", size = 11234787, upload-time = "2026-02-03T17:52:46.432Z" }, + { url = "https://files.pythonhosted.org/packages/8a/d9/0e3a81467a120fd265658d127db648e4d3acfe3e4f6f5d4ea79fac47e587/ruff-0.15.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcbca3d40558789126da91d7ef9a7c87772ee107033db7191edefa34e2c7f1b4", size = 11112797, upload-time = "2026-02-03T17:52:49.274Z" }, + { url = "https://files.pythonhosted.org/packages/b2/cb/8c0b3b0c692683f8ff31351dfb6241047fa873a4481a76df4335a8bff716/ruff-0.15.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9a121a96db1d75fa3eb39c4539e607f628920dd72ff1f7c5ee4f1b768ac62d6e", size = 11033133, upload-time = "2026-02-03T17:53:33.105Z" }, + { url = "https://files.pythonhosted.org/packages/f8/5e/23b87370cf0f9081a8c89a753e69a4e8778805b8802ccfe175cc410e50b9/ruff-0.15.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:5298d518e493061f2eabd4abd067c7e4fb89e2f63291c94332e35631c07c3662", size = 10442646, upload-time = "2026-02-03T17:53:06.278Z" }, + { url = 
"https://files.pythonhosted.org/packages/e1/9a/3c94de5ce642830167e6d00b5c75aacd73e6347b4c7fc6828699b150a5ee/ruff-0.15.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:afb6e603d6375ff0d6b0cee563fa21ab570fd15e65c852cb24922cef25050cf1", size = 10195750, upload-time = "2026-02-03T17:53:26.084Z" }, + { url = "https://files.pythonhosted.org/packages/30/15/e396325080d600b436acc970848d69df9c13977942fb62bb8722d729bee8/ruff-0.15.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:77e515f6b15f828b94dc17d2b4ace334c9ddb7d9468c54b2f9ed2b9c1593ef16", size = 10676120, upload-time = "2026-02-03T17:53:09.363Z" }, + { url = "https://files.pythonhosted.org/packages/8d/c9/229a23d52a2983de1ad0fb0ee37d36e0257e6f28bfd6b498ee2c76361874/ruff-0.15.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6f6e80850a01eb13b3e42ee0ebdf6e4497151b48c35051aab51c101266d187a3", size = 11201636, upload-time = "2026-02-03T17:52:57.281Z" }, + { url = "https://files.pythonhosted.org/packages/6f/b0/69adf22f4e24f3677208adb715c578266842e6e6a3cc77483f48dd999ede/ruff-0.15.0-py3-none-win32.whl", hash = "sha256:238a717ef803e501b6d51e0bdd0d2c6e8513fe9eec14002445134d3907cd46c3", size = 10465945, upload-time = "2026-02-03T17:53:12.591Z" }, + { url = "https://files.pythonhosted.org/packages/51/ad/f813b6e2c97e9b4598be25e94a9147b9af7e60523b0cb5d94d307c15229d/ruff-0.15.0-py3-none-win_amd64.whl", hash = "sha256:dd5e4d3301dc01de614da3cdffc33d4b1b96fb89e45721f1598e5532ccf78b18", size = 11564657, upload-time = "2026-02-03T17:52:51.893Z" }, + { url = "https://files.pythonhosted.org/packages/f6/b0/2d823f6e77ebe560f4e397d078487e8d52c1516b331e3521bc75db4272ca/ruff-0.15.0-py3-none-win_arm64.whl", hash = "sha256:c480d632cc0ca3f0727acac8b7d053542d9e114a462a145d0b00e7cd658c515a", size = 10865753, upload-time = "2026-02-03T17:53:03.014Z" }, +] + +[[package]] +name = "s3transfer" +version = "0.16.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" }, +] + +[[package]] +name = "semgrep" +version = "1.79.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "boltons" }, + { name = "click" }, + { name = "click-option-group" }, + { name = "colorama" }, + { name = "defusedxml" }, + { name = "exceptiongroup" }, + { name = "glom" }, + { name = "jsonschema" }, + { name = "packaging" }, + { name = "peewee" }, + { name = "requests" }, + { name = "rich" }, + { name = "ruamel-yaml" }, + { name = "tomli" }, + { name = "typing-extensions" }, + { name = "urllib3" }, + { name = "wcmatch" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/45/61/9ee9e601ddc9f9073708d4e6886d0c7021b59b3180b6cb53c0bd01b393d9/semgrep-1.79.0.tar.gz", hash = "sha256:fde15d090b4beb865e12c2c727404c8dee2f41b9d793a7f972b278cdefb22bea", size = 27421587, upload-time = "2024-07-10T10:06:05.122Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/4a/469abc30b134354632d8b8e83249121447fca74a615509a09bed90340813/semgrep-1.79.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-any.whl", hash = "sha256:5a28858c1f5249bf4fed3f180f2db2589ce1832c1058eb6d18a0f086c91dadc1", size = 27832465, upload-time = "2024-07-10T10:05:45.254Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/23/eff37582f900cf742b5fc7709dfd0e63cc2bc0faefe6ec200f150666fa7a/semgrep-1.79.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-macosx_10_14_x86_64.whl", hash = "sha256:4b22b5f4db17204648baf8bc58fcd74c09eb5f41bd407422193253b4de9af18e", size = 28050871, upload-time = "2024-07-10T10:05:52.086Z" }, + { url = "https://files.pythonhosted.org/packages/d6/88/35615a4e1142755cb3d2c86d63160f63ab770c111c82de9318bba27fb888/semgrep-1.79.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-macosx_11_0_arm64.whl", hash = "sha256:fe5cf0ac8afdb786cbd4e6c97c418ca7adc8f4c42584a69dc7c10e15db5f7d9b", size = 33805877, upload-time = "2024-07-10T10:05:56.42Z" }, + { url = "https://files.pythonhosted.org/packages/fd/84/3b6afc829f54b331f47d55c565096ef74a98d96d794612d2e5330062d373/semgrep-1.79.0-cp38.cp39.cp310.cp311.py37.py38.py39.py310.py311-none-musllinux_1_0_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41797931371d05c41a6e09861b3d631136b4fe644d80802b2fd48af650e5fa5a", size = 32479030, upload-time = "2024-07-10T10:06:00.777Z" }, +] + +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + +[[package]] +name = "slack-bolt" +version = "1.27.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "slack-sdk" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/4c/28/50ed0b86e48b48e6ddcc71de93b91c8ac14a55d1249e4bff0586494a2f90/slack_bolt-1.27.0.tar.gz", hash = "sha256:3db91d64e277e176a565c574ae82748aa8554f19e41a4fceadca4d65374ce1e0", size = 129101, upload-time = "2025-11-13T20:17:46.878Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/01/a8/1acb355759747ba4da5f45c1a33d641994b9e04b914908c9434f18bd97e8/slack_bolt-1.27.0-py2.py3-none-any.whl", hash = "sha256:c43c94bf34740f2adeb9b55566c83f1e73fed6ba2878bd346cdfd6fd8ad22360", size = 230428, upload-time = "2025-11-13T20:17:45.465Z" }, +] + +[[package]] +name = "slack-sdk" +version = "3.40.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/45/f7/4a968e1d091569ff7f6da929695728c3eacf10afcda7c424b70df2c3700b/slack_sdk-3.40.0.tar.gz", hash = "sha256:87b9a79d1d6e19a2b1877727a0ec6f016d82d30a6a410389fba87c221c99f10e", size = 249478, upload-time = "2026-02-10T22:12:13.445Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/72/428fb01a1043ddbb3f66297363406d6e69ddff5ad89c4d07945a3753a235/slack_sdk-3.40.0-py2.py3-none-any.whl", hash = "sha256:f2bada5ed3adb10a01e154e90db01d6d8938d0461b5790c12bcb807b2d28bbe2", size = 312786, upload-time = "2026-02-10T22:12:11.258Z" }, +] + +[[package]] +name = "soupsieve" +version = "2.8.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7b/ae/2d9c981590ed9999a0d91755b47fc74f74de286b0f5cee14c9269041e6c4/soupsieve-2.8.3.tar.gz", hash = "sha256:3267f1eeea4251fb42728b6dfb746edc9acaffc4a45b27e19450b676586e8349", size = 118627, upload-time = "2026-01-20T04:27:02.457Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/46/2c/1462b1d0a634697ae9e55b3cecdcb64788e8b7d63f54d923fcd0bb140aed/soupsieve-2.8.3-py3-none-any.whl", hash = "sha256:ed64f2ba4eebeab06cc4962affce381647455978ffc1e36bb79a545b91f45a95", size = 37016, upload-time = "2026-01-20T04:27:01.012Z" 
}, +] + +[[package]] +name = "sse-starlette" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "starlette" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/8d/00d280c03ffd39aaee0e86ec81e2d3b9253036a0f93f51d10503adef0e65/sse_starlette-3.2.0.tar.gz", hash = "sha256:8127594edfb51abe44eac9c49e59b0b01f1039d0c7461c6fd91d4e03b70da422", size = 27253, upload-time = "2026-01-17T13:11:05.62Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/96/7f/832f015020844a8b8f7a9cbc103dd76ba8e3875004c41e08440ea3a2b41a/sse_starlette-3.2.0-py3-none-any.whl", hash = "sha256:5876954bd51920fc2cd51baee47a080eb88a37b5b784e615abb0b283f801cdbf", size = 12763, upload-time = "2026-01-17T13:11:03.775Z" }, +] + +[[package]] +name = "starlette" +version = "0.52.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c4/68/79977123bb7be889ad680d79a40f339082c1978b5cfcf62c2d8d196873ac/starlette-0.52.1.tar.gz", hash = "sha256:834edd1b0a23167694292e94f597773bc3f89f362be6effee198165a35d62933", size = 2653702, upload-time = "2026-01-18T13:34:11.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" }, +] + +[[package]] +name = "stevedore" +version = "5.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/6d/90764092216fa560f6587f83bb70113a8ba510ba436c6476a2b47359057c/stevedore-5.7.0.tar.gz", hash = "sha256:31dd6fe6b3cbe921e21dcefabc9a5f1cf848cf538a1f27543721b8ca09948aa3", size = 516200, upload-time = "2026-02-20T13:27:06.765Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/69/06/36d260a695f383345ab5bbc3fd447249594ae2fa8dfd19c533d5ae23f46b/stevedore-5.7.0-py3-none-any.whl", hash = "sha256:fd25efbb32f1abb4c9e502f385f0018632baac11f9ee5d1b70f88cc5e22ad4ed", size = 54483, upload-time = "2026-02-20T13:27:05.561Z" }, +] + +[[package]] +name = "strands-agents" +version = "1.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "boto3" }, + { name = "botocore" }, + { name = "docstring-parser" }, + { name = "jsonschema" }, + { name = "mcp" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation-threading" }, + { name = "opentelemetry-sdk" }, + { name = "pydantic" }, + { name = "typing-extensions" }, + { name = "watchdog" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/00/95/c7c2b4fc3069bc14ee328e7ec0d314e069866612109985be95fdb8ba452b/strands_agents-1.26.0.tar.gz", hash = "sha256:29a297ba4db53007deba2dd7f1caa6a803a4b7e90ab6b82972cf2c753021479f", size = 707613, upload-time = "2026-02-11T20:03:40.231Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/dd/226a383c03482cb0ec9e6226ec3bf9b1e89c0564867c2e9aeba8b9427d55/strands_agents-1.26.0-py3-none-any.whl", hash = "sha256:23175519ac2285566936a3a248cd67b92c831a3417a1d92b64bc2a677c942ad8", size = 349888, upload-time = "2026-02-11T20:03:37.382Z" }, +] + +[[package]] +name = "strands-agents-tools" +version = "0.2.23" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aiohttp" }, + { name = "aws-requests-auth" }, + { name = "botocore" }, + { name = "dill" }, + { name = "markdownify" }, + { name = "pillow" }, + { name = "prompt-toolkit" }, + { name = "pyjwt" }, + { name = "requests" }, + { name = "rich" }, + { name = "slack-bolt" }, + { name = "strands-agents" }, + { name = "sympy" }, + { name = "tenacity" }, + { name = "typing-extensions" }, + { name = "tzdata", marker = "sys_platform == 'win32'" }, + { name = "watchdog" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a8/0b/95529cd0bbb0f3186adcae0476f506d3f098b95b552621195f190dcda14b/strands_agents_tools-0.2.23.tar.gz", hash = "sha256:40d0c30859f0cf00c62f5efacdb917ae541c7fe9affceee5adb63ba49657f887", size = 474100, upload-time = "2026-03-19T14:29:29.917Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/3c/b5d8586448878e755590bfc59824abc2b137b21b8a8183008e13c8316b13/strands_agents_tools-0.2.23-py3-none-any.whl", hash = "sha256:d98b602002eb978850a32d597f94262dd3a95c6871ab2007fbb0ad165799d8df", size = 312782, upload-time = "2026-03-19T14:29:27.768Z" }, +] + +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + +[[package]] +name = "tenacity" +version = "9.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/47/c6/ee486fd809e357697ee8a44d3d69222b344920433d3b6666ccd9b374630c/tenacity-9.1.4.tar.gz", hash = "sha256:adb31d4c263f2bd041081ab33b498309a57c77f9acf2db65aadf0898179cf93a", size = 49413, upload-time = "2026-02-07T10:45:33.841Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d7/c1/eb8f9debc45d3b7918a32ab756658a0904732f75e555402972246b0b8e71/tenacity-9.1.4-py3-none-any.whl", hash = "sha256:6095a360c919085f28c6527de529e76a06ad89b23659fa881ae0649b867a9d55", size = 28926, 
upload-time = "2026-02-07T10:45:32.24Z" }, +] + +[[package]] +name = "tomli" +version = "2.0.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/b9/de2a5c0144d7d75a57ff355c0c24054f965b2dc3036456ae03a51ea6264b/tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed", size = 16096, upload-time = "2024-10-02T10:46:13.208Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/db/ce8eda256fa131af12e0a76d481711abe4681b6923c27efb9a255c9e4594/tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38", size = 13237, upload-time = "2024-10-02T10:46:11.806Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "typing-inspection" +version = "0.4.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, +] + +[[package]] +name = "uvicorn" +version = "0.40.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, + { name = "h11" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/c3/d1/8f3c683c9561a4e6689dd3b1d345c815f10f86acd044ee1fb9a4dcd0b8c5/uvicorn-0.40.0.tar.gz", hash = "sha256:839676675e87e73694518b5574fd0f24c9d97b46bea16df7b8c05ea1a51071ea", size = 81761, upload-time = "2025-12-21T14:16:22.45Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/d8/2083a1daa7439a66f3a48589a57d576aa117726762618f6bb09fe3798796/uvicorn-0.40.0-py3-none-any.whl", hash = "sha256:c6c8f55bc8bf13eb6fa9ff87ad62308bbbc33d0b67f84293151efe87e0d5f2ee", size = 68502, upload-time = "2025-12-21T14:16:21.041Z" }, +] + +[[package]] +name = "watchdog" +version = "6.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/db/7d/7f3d619e951c88ed75c6037b246ddcf2d322812ee8ea189be89511721d54/watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282", size = 131220, upload-time = "2024-11-01T14:07:13.037Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/98/b0345cabdce2041a01293ba483333582891a3bd5769b08eceb0d406056ef/watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c", size = 96480, upload-time = "2024-11-01T14:06:42.952Z" }, + { url = "https://files.pythonhosted.org/packages/85/83/cdf13902c626b28eedef7ec4f10745c52aad8a8fe7eb04ed7b1f111ca20e/watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134", size = 88451, upload-time = "2024-11-01T14:06:45.084Z" }, + { url = "https://files.pythonhosted.org/packages/fe/c4/225c87bae08c8b9ec99030cd48ae9c4eca050a59bf5c2255853e18c87b50/watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b", size = 89057, upload-time = "2024-11-01T14:06:47.324Z" }, + { url = 
"https://files.pythonhosted.org/packages/a9/c7/ca4bf3e518cb57a686b2feb4f55a1892fd9a3dd13f470fca14e00f80ea36/watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13", size = 79079, upload-time = "2024-11-01T14:06:59.472Z" }, + { url = "https://files.pythonhosted.org/packages/5c/51/d46dc9332f9a647593c947b4b88e2381c8dfc0942d15b8edc0310fa4abb1/watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379", size = 79078, upload-time = "2024-11-01T14:07:01.431Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/04edbf5e169cd318d5f07b4766fee38e825d64b6913ca157ca32d1a42267/watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e", size = 79076, upload-time = "2024-11-01T14:07:02.568Z" }, + { url = "https://files.pythonhosted.org/packages/ab/cc/da8422b300e13cb187d2203f20b9253e91058aaf7db65b74142013478e66/watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f", size = 79077, upload-time = "2024-11-01T14:07:03.893Z" }, + { url = "https://files.pythonhosted.org/packages/2c/3b/b8964e04ae1a025c44ba8e4291f86e97fac443bca31de8bd98d3263d2fcf/watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26", size = 79078, upload-time = "2024-11-01T14:07:05.189Z" }, + { url = "https://files.pythonhosted.org/packages/62/ae/a696eb424bedff7407801c257d4b1afda455fe40821a2be430e173660e81/watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c", size = 79077, upload-time = "2024-11-01T14:07:06.376Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/e8/dbf020b4d98251a9860752a094d09a65e1b436ad181faf929983f697048f/watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2", size = 79078, upload-time = "2024-11-01T14:07:07.547Z" }, + { url = "https://files.pythonhosted.org/packages/07/f6/d0e5b343768e8bcb4cda79f0f2f55051bf26177ecd5651f84c07567461cf/watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a", size = 79065, upload-time = "2024-11-01T14:07:09.525Z" }, + { url = "https://files.pythonhosted.org/packages/db/d9/c495884c6e548fce18a8f40568ff120bc3a4b7b99813081c8ac0c936fa64/watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680", size = 79070, upload-time = "2024-11-01T14:07:10.686Z" }, + { url = "https://files.pythonhosted.org/packages/33/e8/e40370e6d74ddba47f002a32919d91310d6074130fe4e17dabcafc15cbf1/watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f", size = 79067, upload-time = "2024-11-01T14:07:11.845Z" }, +] + +[[package]] +name = "wcmatch" +version = "8.5.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "bracex" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/c4/55e0d36da61d7b8b2a49fd273e6b296fd5e8471c72ebbe438635d1af3968/wcmatch-8.5.2.tar.gz", hash = "sha256:a70222b86dea82fb382dd87b73278c10756c138bd6f8f714e2183128887b9eb2", size = 114983, upload-time = "2024-05-15T12:51:08.054Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/09/78/533ef890536e5ba0fd4f7df37482b5800ecaaceae9afc30978a1a7f88ff1/wcmatch-8.5.2-py3-none-any.whl", hash = "sha256:17d3ad3758f9d0b5b4dedc770b65420d4dac62e680229c287bf24c9db856a478", size = 39397, upload-time = "2024-05-15T12:51:06.2Z" }, +] + +[[package]] +name = "wcwidth" +version = "0.6.0" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/a2/8e3becb46433538a38726c948d3399905a4c7cabd0df578ede5dc51f0ec2/wcwidth-0.6.0.tar.gz", hash = "sha256:cdc4e4262d6ef9a1a57e018384cbeb1208d8abbc64176027e2c2455c81313159", size = 159684, upload-time = "2026-02-06T19:19:40.919Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/5a/199c59e0a824a3db2b89c5d2dade7ab5f9624dbf6448dc291b46d5ec94d3/wcwidth-0.6.0-py3-none-any.whl", hash = "sha256:1a3a1e510b553315f8e146c54764f4fb6264ffad731b3d78088cdb1478ffbdad", size = 94189, upload-time = "2026-02-06T19:19:39.646Z" }, +] + +[[package]] +name = "wrapt" +version = "1.17.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/95/8f/aeb76c5b46e273670962298c23e7ddde79916cb74db802131d49a85e4b7d/wrapt-1.17.3.tar.gz", hash = "sha256:f66eb08feaa410fe4eebd17f2a2c8e2e46d3476e9f8c783daa8e09e0faa666d0", size = 55547, upload-time = "2025-08-12T05:53:21.714Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fc/f6/759ece88472157acb55fc195e5b116e06730f1b651b5b314c66291729193/wrapt-1.17.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a47681378a0439215912ef542c45a783484d4dd82bac412b71e59cf9c0e1cea0", size = 54003, upload-time = "2025-08-12T05:51:48.627Z" }, + { url = "https://files.pythonhosted.org/packages/4f/a9/49940b9dc6d47027dc850c116d79b4155f15c08547d04db0f07121499347/wrapt-1.17.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:54a30837587c6ee3cd1a4d1c2ec5d24e77984d44e2f34547e2323ddb4e22eb77", size = 39025, upload-time = "2025-08-12T05:51:37.156Z" }, + { url = "https://files.pythonhosted.org/packages/45/35/6a08de0f2c96dcdd7fe464d7420ddb9a7655a6561150e5fc4da9356aeaab/wrapt-1.17.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:16ecf15d6af39246fe33e507105d67e4b81d8f8d2c6598ff7e3ca1b8a37213f7", size = 39108, upload-time = "2025-08-12T05:51:58.425Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/37/6faf15cfa41bf1f3dba80cd3f5ccc6622dfccb660ab26ed79f0178c7497f/wrapt-1.17.3-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6fd1ad24dc235e4ab88cda009e19bf347aabb975e44fd5c2fb22a3f6e4141277", size = 88072, upload-time = "2025-08-12T05:52:37.53Z" }, + { url = "https://files.pythonhosted.org/packages/78/f2/efe19ada4a38e4e15b6dff39c3e3f3f73f5decf901f66e6f72fe79623a06/wrapt-1.17.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ed61b7c2d49cee3c027372df5809a59d60cf1b6c2f81ee980a091f3afed6a2d", size = 88214, upload-time = "2025-08-12T05:52:15.886Z" }, + { url = "https://files.pythonhosted.org/packages/40/90/ca86701e9de1622b16e09689fc24b76f69b06bb0150990f6f4e8b0eeb576/wrapt-1.17.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:423ed5420ad5f5529db9ce89eac09c8a2f97da18eb1c870237e84c5a5c2d60aa", size = 87105, upload-time = "2025-08-12T05:52:17.914Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e0/d10bd257c9a3e15cbf5523025252cc14d77468e8ed644aafb2d6f54cb95d/wrapt-1.17.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e01375f275f010fcbf7f643b4279896d04e571889b8a5b3f848423d91bf07050", size = 87766, upload-time = "2025-08-12T05:52:39.243Z" }, + { url = "https://files.pythonhosted.org/packages/e8/cf/7d848740203c7b4b27eb55dbfede11aca974a51c3d894f6cc4b865f42f58/wrapt-1.17.3-cp313-cp313-win32.whl", hash = "sha256:53e5e39ff71b3fc484df8a522c933ea2b7cdd0d5d15ae82e5b23fde87d44cbd8", size = 36711, upload-time = "2025-08-12T05:53:10.074Z" }, + { url = "https://files.pythonhosted.org/packages/57/54/35a84d0a4d23ea675994104e667ceff49227ce473ba6a59ba2c84f250b74/wrapt-1.17.3-cp313-cp313-win_amd64.whl", hash = "sha256:1f0b2f40cf341ee8cc1a97d51ff50dddb9fcc73241b9143ec74b30fc4f44f6cb", size = 38885, upload-time = "2025-08-12T05:53:08.695Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/77/66e54407c59d7b02a3c4e0af3783168fff8e5d61def52cda8728439d86bc/wrapt-1.17.3-cp313-cp313-win_arm64.whl", hash = "sha256:7425ac3c54430f5fc5e7b6f41d41e704db073309acfc09305816bc6a0b26bb16", size = 36896, upload-time = "2025-08-12T05:52:55.34Z" }, + { url = "https://files.pythonhosted.org/packages/02/a2/cd864b2a14f20d14f4c496fab97802001560f9f41554eef6df201cd7f76c/wrapt-1.17.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:cf30f6e3c077c8e6a9a7809c94551203c8843e74ba0c960f4a98cd80d4665d39", size = 54132, upload-time = "2025-08-12T05:51:49.864Z" }, + { url = "https://files.pythonhosted.org/packages/d5/46/d011725b0c89e853dc44cceb738a307cde5d240d023d6d40a82d1b4e1182/wrapt-1.17.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e228514a06843cae89621384cfe3a80418f3c04aadf8a3b14e46a7be704e4235", size = 39091, upload-time = "2025-08-12T05:51:38.935Z" }, + { url = "https://files.pythonhosted.org/packages/2e/9e/3ad852d77c35aae7ddebdbc3b6d35ec8013af7d7dddad0ad911f3d891dae/wrapt-1.17.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:5ea5eb3c0c071862997d6f3e02af1d055f381b1d25b286b9d6644b79db77657c", size = 39172, upload-time = "2025-08-12T05:51:59.365Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f7/c983d2762bcce2326c317c26a6a1e7016f7eb039c27cdf5c4e30f4160f31/wrapt-1.17.3-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:281262213373b6d5e4bb4353bc36d1ba4084e6d6b5d242863721ef2bf2c2930b", size = 87163, upload-time = "2025-08-12T05:52:40.965Z" }, + { url = "https://files.pythonhosted.org/packages/e4/0f/f673f75d489c7f22d17fe0193e84b41540d962f75fce579cf6873167c29b/wrapt-1.17.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dc4a8d2b25efb6681ecacad42fca8859f88092d8732b170de6a5dddd80a1c8fa", size = 87963, upload-time = "2025-08-12T05:52:20.326Z" }, + { url = 
"https://files.pythonhosted.org/packages/df/61/515ad6caca68995da2fac7a6af97faab8f78ebe3bf4f761e1b77efbc47b5/wrapt-1.17.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:373342dd05b1d07d752cecbec0c41817231f29f3a89aa8b8843f7b95992ed0c7", size = 86945, upload-time = "2025-08-12T05:52:21.581Z" }, + { url = "https://files.pythonhosted.org/packages/d3/bd/4e70162ce398462a467bc09e768bee112f1412e563620adc353de9055d33/wrapt-1.17.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:d40770d7c0fd5cbed9d84b2c3f2e156431a12c9a37dc6284060fb4bec0b7ffd4", size = 86857, upload-time = "2025-08-12T05:52:43.043Z" }, + { url = "https://files.pythonhosted.org/packages/2b/b8/da8560695e9284810b8d3df8a19396a6e40e7518059584a1a394a2b35e0a/wrapt-1.17.3-cp314-cp314-win32.whl", hash = "sha256:fbd3c8319de8e1dc79d346929cd71d523622da527cca14e0c1d257e31c2b8b10", size = 37178, upload-time = "2025-08-12T05:53:12.605Z" }, + { url = "https://files.pythonhosted.org/packages/db/c8/b71eeb192c440d67a5a0449aaee2310a1a1e8eca41676046f99ed2487e9f/wrapt-1.17.3-cp314-cp314-win_amd64.whl", hash = "sha256:e1a4120ae5705f673727d3253de3ed0e016f7cd78dc463db1b31e2463e1f3cf6", size = 39310, upload-time = "2025-08-12T05:53:11.106Z" }, + { url = "https://files.pythonhosted.org/packages/45/20/2cda20fd4865fa40f86f6c46ed37a2a8356a7a2fde0773269311f2af56c7/wrapt-1.17.3-cp314-cp314-win_arm64.whl", hash = "sha256:507553480670cab08a800b9463bdb881b2edeed77dc677b0a5915e6106e91a58", size = 37266, upload-time = "2025-08-12T05:52:56.531Z" }, + { url = "https://files.pythonhosted.org/packages/77/ed/dd5cf21aec36c80443c6f900449260b80e2a65cf963668eaef3b9accce36/wrapt-1.17.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:ed7c635ae45cfbc1a7371f708727bf74690daedc49b4dba310590ca0bd28aa8a", size = 56544, upload-time = "2025-08-12T05:51:51.109Z" }, + { url = "https://files.pythonhosted.org/packages/8d/96/450c651cc753877ad100c7949ab4d2e2ecc4d97157e00fa8f45df682456a/wrapt-1.17.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = 
"sha256:249f88ed15503f6492a71f01442abddd73856a0032ae860de6d75ca62eed8067", size = 40283, upload-time = "2025-08-12T05:51:39.912Z" }, + { url = "https://files.pythonhosted.org/packages/d1/86/2fcad95994d9b572db57632acb6f900695a648c3e063f2cd344b3f5c5a37/wrapt-1.17.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:5a03a38adec8066d5a37bea22f2ba6bbf39fcdefbe2d91419ab864c3fb515454", size = 40366, upload-time = "2025-08-12T05:52:00.693Z" }, + { url = "https://files.pythonhosted.org/packages/64/0e/f4472f2fdde2d4617975144311f8800ef73677a159be7fe61fa50997d6c0/wrapt-1.17.3-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5d4478d72eb61c36e5b446e375bbc49ed002430d17cdec3cecb36993398e1a9e", size = 108571, upload-time = "2025-08-12T05:52:44.521Z" }, + { url = "https://files.pythonhosted.org/packages/cc/01/9b85a99996b0a97c8a17484684f206cbb6ba73c1ce6890ac668bcf3838fb/wrapt-1.17.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223db574bb38637e8230eb14b185565023ab624474df94d2af18f1cdb625216f", size = 113094, upload-time = "2025-08-12T05:52:22.618Z" }, + { url = "https://files.pythonhosted.org/packages/25/02/78926c1efddcc7b3aa0bc3d6b33a822f7d898059f7cd9ace8c8318e559ef/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:e405adefb53a435f01efa7ccdec012c016b5a1d3f35459990afc39b6be4d5056", size = 110659, upload-time = "2025-08-12T05:52:24.057Z" }, + { url = "https://files.pythonhosted.org/packages/dc/ee/c414501ad518ac3e6fe184753632fe5e5ecacdcf0effc23f31c1e4f7bfcf/wrapt-1.17.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:88547535b787a6c9ce4086917b6e1d291aa8ed914fdd3a838b3539dc95c12804", size = 106946, upload-time = "2025-08-12T05:52:45.976Z" }, + { url = "https://files.pythonhosted.org/packages/be/44/a1bd64b723d13bb151d6cc91b986146a1952385e0392a78567e12149c7b4/wrapt-1.17.3-cp314-cp314t-win32.whl", hash = 
"sha256:41b1d2bc74c2cac6f9074df52b2efbef2b30bdfe5f40cb78f8ca22963bc62977", size = 38717, upload-time = "2025-08-12T05:53:15.214Z" }, + { url = "https://files.pythonhosted.org/packages/79/d9/7cfd5a312760ac4dd8bf0184a6ee9e43c33e47f3dadc303032ce012b8fa3/wrapt-1.17.3-cp314-cp314t-win_amd64.whl", hash = "sha256:73d496de46cd2cdbdbcce4ae4bcdb4afb6a11234a1df9c085249d55166b95116", size = 41334, upload-time = "2025-08-12T05:53:14.178Z" }, + { url = "https://files.pythonhosted.org/packages/46/78/10ad9781128ed2f99dbc474f43283b13fea8ba58723e98844367531c18e9/wrapt-1.17.3-cp314-cp314t-win_arm64.whl", hash = "sha256:f38e60678850c42461d4202739f9bf1e3a737c7ad283638251e79cc49effb6b6", size = 38471, upload-time = "2025-08-12T05:52:57.784Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f6/a933bd70f98e9cf3e08167fc5cd7aaaca49147e48411c0bd5ae701bb2194/wrapt-1.17.3-py3-none-any.whl", hash = "sha256:7171ae35d2c33d326ac19dd8facb1e82e5fd04ef8c6c0e394d7af55a55051c22", size = 23591, upload-time = "2025-08-12T05:53:20.674Z" }, +] + +[[package]] +name = "yarl" +version = "1.22.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "idna" }, + { name = "multidict" }, + { name = "propcache" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/57/63/0c6ebca57330cd313f6102b16dd57ffaf3ec4c83403dcb45dbd15c6f3ea1/yarl-1.22.0.tar.gz", hash = "sha256:bebf8557577d4401ba8bd9ff33906f1376c877aa78d1fe216ad01b4d6745af71", size = 187169, upload-time = "2025-10-06T14:12:55.963Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ea/f3/d67de7260456ee105dc1d162d43a019ecad6b91e2f51809d6cddaa56690e/yarl-1.22.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8dee9c25c74997f6a750cd317b8ca63545169c098faee42c84aa5e506c819b53", size = 139980, upload-time = "2025-10-06T14:10:14.601Z" }, + { url = "https://files.pythonhosted.org/packages/01/88/04d98af0b47e0ef42597b9b28863b9060bb515524da0a65d5f4db160b2d5/yarl-1.22.0-cp313-cp313-macosx_10_13_x86_64.whl", 
hash = "sha256:01e73b85a5434f89fc4fe27dcda2aff08ddf35e4d47bbbea3bdcd25321af538a", size = 93424, upload-time = "2025-10-06T14:10:16.115Z" }, + { url = "https://files.pythonhosted.org/packages/18/91/3274b215fd8442a03975ce6bee5fe6aa57a8326b29b9d3d56234a1dca244/yarl-1.22.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:22965c2af250d20c873cdbee8ff958fb809940aeb2e74ba5f20aaf6b7ac8c70c", size = 93821, upload-time = "2025-10-06T14:10:17.993Z" }, + { url = "https://files.pythonhosted.org/packages/61/3a/caf4e25036db0f2da4ca22a353dfeb3c9d3c95d2761ebe9b14df8fc16eb0/yarl-1.22.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4f15793aa49793ec8d1c708ab7f9eded1aa72edc5174cae703651555ed1b601", size = 373243, upload-time = "2025-10-06T14:10:19.44Z" }, + { url = "https://files.pythonhosted.org/packages/6e/9e/51a77ac7516e8e7803b06e01f74e78649c24ee1021eca3d6a739cb6ea49c/yarl-1.22.0-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5542339dcf2747135c5c85f68680353d5cb9ffd741c0f2e8d832d054d41f35a", size = 342361, upload-time = "2025-10-06T14:10:21.124Z" }, + { url = "https://files.pythonhosted.org/packages/d4/f8/33b92454789dde8407f156c00303e9a891f1f51a0330b0fad7c909f87692/yarl-1.22.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5c401e05ad47a75869c3ab3e35137f8468b846770587e70d71e11de797d113df", size = 387036, upload-time = "2025-10-06T14:10:22.902Z" }, + { url = "https://files.pythonhosted.org/packages/d9/9a/c5db84ea024f76838220280f732970aa4ee154015d7f5c1bfb60a267af6f/yarl-1.22.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:243dda95d901c733f5b59214d28b0120893d91777cb8aa043e6ef059d3cddfe2", size = 397671, upload-time = "2025-10-06T14:10:24.523Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/c9/cd8538dc2e7727095e0c1d867bad1e40c98f37763e6d995c1939f5fdc7b1/yarl-1.22.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bec03d0d388060058f5d291a813f21c011041938a441c593374da6077fe21b1b", size = 377059, upload-time = "2025-10-06T14:10:26.406Z" }, + { url = "https://files.pythonhosted.org/packages/a1/b9/ab437b261702ced75122ed78a876a6dec0a1b0f5e17a4ac7a9a2482d8abe/yarl-1.22.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0748275abb8c1e1e09301ee3cf90c8a99678a4e92e4373705f2a2570d581273", size = 365356, upload-time = "2025-10-06T14:10:28.461Z" }, + { url = "https://files.pythonhosted.org/packages/b2/9d/8e1ae6d1d008a9567877b08f0ce4077a29974c04c062dabdb923ed98e6fe/yarl-1.22.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:47fdb18187e2a4e18fda2c25c05d8251a9e4a521edaed757fef033e7d8498d9a", size = 361331, upload-time = "2025-10-06T14:10:30.541Z" }, + { url = "https://files.pythonhosted.org/packages/ca/5a/09b7be3905962f145b73beb468cdd53db8aa171cf18c80400a54c5b82846/yarl-1.22.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c7044802eec4524fde550afc28edda0dd5784c4c45f0be151a2d3ba017daca7d", size = 382590, upload-time = "2025-10-06T14:10:33.352Z" }, + { url = "https://files.pythonhosted.org/packages/aa/7f/59ec509abf90eda5048b0bc3e2d7b5099dffdb3e6b127019895ab9d5ef44/yarl-1.22.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:139718f35149ff544caba20fce6e8a2f71f1e39b92c700d8438a0b1d2a631a02", size = 385316, upload-time = "2025-10-06T14:10:35.034Z" }, + { url = "https://files.pythonhosted.org/packages/e5/84/891158426bc8036bfdfd862fabd0e0fa25df4176ec793e447f4b85cf1be4/yarl-1.22.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e1b51bebd221006d3d2f95fbe124b22b247136647ae5dcc8c7acafba66e5ee67", size = 374431, upload-time = "2025-10-06T14:10:37.76Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/49/03da1580665baa8bef5e8ed34c6df2c2aca0a2f28bf397ed238cc1bbc6f2/yarl-1.22.0-cp313-cp313-win32.whl", hash = "sha256:d3e32536234a95f513bd374e93d717cf6b2231a791758de6c509e3653f234c95", size = 81555, upload-time = "2025-10-06T14:10:39.649Z" }, + { url = "https://files.pythonhosted.org/packages/9a/ee/450914ae11b419eadd067c6183ae08381cfdfcb9798b90b2b713bbebddda/yarl-1.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:47743b82b76d89a1d20b83e60d5c20314cbd5ba2befc9cda8f28300c4a08ed4d", size = 86965, upload-time = "2025-10-06T14:10:41.313Z" }, + { url = "https://files.pythonhosted.org/packages/98/4d/264a01eae03b6cf629ad69bae94e3b0e5344741e929073678e84bf7a3e3b/yarl-1.22.0-cp313-cp313-win_arm64.whl", hash = "sha256:5d0fcda9608875f7d052eff120c7a5da474a6796fe4d83e152e0e4d42f6d1a9b", size = 81205, upload-time = "2025-10-06T14:10:43.167Z" }, + { url = "https://files.pythonhosted.org/packages/88/fc/6908f062a2f77b5f9f6d69cecb1747260831ff206adcbc5b510aff88df91/yarl-1.22.0-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:719ae08b6972befcba4310e49edb1161a88cdd331e3a694b84466bd938a6ab10", size = 146209, upload-time = "2025-10-06T14:10:44.643Z" }, + { url = "https://files.pythonhosted.org/packages/65/47/76594ae8eab26210b4867be6f49129861ad33da1f1ebdf7051e98492bf62/yarl-1.22.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:47d8a5c446df1c4db9d21b49619ffdba90e77c89ec6e283f453856c74b50b9e3", size = 95966, upload-time = "2025-10-06T14:10:46.554Z" }, + { url = "https://files.pythonhosted.org/packages/ab/ce/05e9828a49271ba6b5b038b15b3934e996980dd78abdfeb52a04cfb9467e/yarl-1.22.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cfebc0ac8333520d2d0423cbbe43ae43c8838862ddb898f5ca68565e395516e9", size = 97312, upload-time = "2025-10-06T14:10:48.007Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/c5/7dffad5e4f2265b29c9d7ec869c369e4223166e4f9206fc2243ee9eea727/yarl-1.22.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4398557cbf484207df000309235979c79c4356518fd5c99158c7d38203c4da4f", size = 361967, upload-time = "2025-10-06T14:10:49.997Z" }, + { url = "https://files.pythonhosted.org/packages/50/b2/375b933c93a54bff7fc041e1a6ad2c0f6f733ffb0c6e642ce56ee3b39970/yarl-1.22.0-cp313-cp313t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:2ca6fd72a8cd803be290d42f2dec5cdcd5299eeb93c2d929bf060ad9efaf5de0", size = 323949, upload-time = "2025-10-06T14:10:52.004Z" }, + { url = "https://files.pythonhosted.org/packages/66/50/bfc2a29a1d78644c5a7220ce2f304f38248dc94124a326794e677634b6cf/yarl-1.22.0-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca1f59c4e1ab6e72f0a23c13fca5430f889634166be85dbf1013683e49e3278e", size = 361818, upload-time = "2025-10-06T14:10:54.078Z" }, + { url = "https://files.pythonhosted.org/packages/46/96/f3941a46af7d5d0f0498f86d71275696800ddcdd20426298e572b19b91ff/yarl-1.22.0-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c5010a52015e7c70f86eb967db0f37f3c8bd503a695a49f8d45700144667708", size = 372626, upload-time = "2025-10-06T14:10:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/c1/42/8b27c83bb875cd89448e42cd627e0fb971fa1675c9ec546393d18826cb50/yarl-1.22.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d7672ecf7557476642c88497c2f8d8542f8e36596e928e9bcba0e42e1e7d71f", size = 341129, upload-time = "2025-10-06T14:10:57.985Z" }, + { url = "https://files.pythonhosted.org/packages/49/36/99ca3122201b382a3cf7cc937b95235b0ac944f7e9f2d5331d50821ed352/yarl-1.22.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = 
"sha256:3b7c88eeef021579d600e50363e0b6ee4f7f6f728cd3486b9d0f3ee7b946398d", size = 346776, upload-time = "2025-10-06T14:10:59.633Z" }, + { url = "https://files.pythonhosted.org/packages/85/b4/47328bf996acd01a4c16ef9dcd2f59c969f495073616586f78cd5f2efb99/yarl-1.22.0-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:f4afb5c34f2c6fecdcc182dfcfc6af6cccf1aa923eed4d6a12e9d96904e1a0d8", size = 334879, upload-time = "2025-10-06T14:11:01.454Z" }, + { url = "https://files.pythonhosted.org/packages/c2/ad/b77d7b3f14a4283bffb8e92c6026496f6de49751c2f97d4352242bba3990/yarl-1.22.0-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:59c189e3e99a59cf8d83cbb31d4db02d66cda5a1a4374e8a012b51255341abf5", size = 350996, upload-time = "2025-10-06T14:11:03.452Z" }, + { url = "https://files.pythonhosted.org/packages/81/c8/06e1d69295792ba54d556f06686cbd6a7ce39c22307100e3fb4a2c0b0a1d/yarl-1.22.0-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:5a3bf7f62a289fa90f1990422dc8dff5a458469ea71d1624585ec3a4c8d6960f", size = 356047, upload-time = "2025-10-06T14:11:05.115Z" }, + { url = "https://files.pythonhosted.org/packages/4b/b8/4c0e9e9f597074b208d18cef227d83aac36184bfbc6eab204ea55783dbc5/yarl-1.22.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:de6b9a04c606978fdfe72666fa216ffcf2d1a9f6a381058d4378f8d7b1e5de62", size = 342947, upload-time = "2025-10-06T14:11:08.137Z" }, + { url = "https://files.pythonhosted.org/packages/e0/e5/11f140a58bf4c6ad7aca69a892bff0ee638c31bea4206748fc0df4ebcb3a/yarl-1.22.0-cp313-cp313t-win32.whl", hash = "sha256:1834bb90991cc2999f10f97f5f01317f99b143284766d197e43cd5b45eb18d03", size = 86943, upload-time = "2025-10-06T14:11:10.284Z" }, + { url = "https://files.pythonhosted.org/packages/31/74/8b74bae38ed7fe6793d0c15a0c8207bbb819cf287788459e5ed230996cdd/yarl-1.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:ff86011bd159a9d2dfc89c34cfd8aff12875980e3bd6a39ff097887520e60249", size = 93715, upload-time = "2025-10-06T14:11:11.739Z" }, + { url = 
"https://files.pythonhosted.org/packages/69/66/991858aa4b5892d57aef7ee1ba6b4d01ec3b7eb3060795d34090a3ca3278/yarl-1.22.0-cp313-cp313t-win_arm64.whl", hash = "sha256:7861058d0582b847bc4e3a4a4c46828a410bca738673f35a29ba3ca5db0b473b", size = 83857, upload-time = "2025-10-06T14:11:13.586Z" }, + { url = "https://files.pythonhosted.org/packages/46/b3/e20ef504049f1a1c54a814b4b9bed96d1ac0e0610c3b4da178f87209db05/yarl-1.22.0-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:34b36c2c57124530884d89d50ed2c1478697ad7473efd59cfd479945c95650e4", size = 140520, upload-time = "2025-10-06T14:11:15.465Z" }, + { url = "https://files.pythonhosted.org/packages/e4/04/3532d990fdbab02e5ede063676b5c4260e7f3abea2151099c2aa745acc4c/yarl-1.22.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:0dd9a702591ca2e543631c2a017e4a547e38a5c0f29eece37d9097e04a7ac683", size = 93504, upload-time = "2025-10-06T14:11:17.106Z" }, + { url = "https://files.pythonhosted.org/packages/11/63/ff458113c5c2dac9a9719ac68ee7c947cb621432bcf28c9972b1c0e83938/yarl-1.22.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:594fcab1032e2d2cc3321bb2e51271e7cd2b516c7d9aee780ece81b07ff8244b", size = 94282, upload-time = "2025-10-06T14:11:19.064Z" }, + { url = "https://files.pythonhosted.org/packages/a7/bc/315a56aca762d44a6aaaf7ad253f04d996cb6b27bad34410f82d76ea8038/yarl-1.22.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3d7a87a78d46a2e3d5b72587ac14b4c16952dd0887dbb051451eceac774411e", size = 372080, upload-time = "2025-10-06T14:11:20.996Z" }, + { url = "https://files.pythonhosted.org/packages/3f/3f/08e9b826ec2e099ea6e7c69a61272f4f6da62cb5b1b63590bb80ca2e4a40/yarl-1.22.0-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:852863707010316c973162e703bddabec35e8757e67fcb8ad58829de1ebc8590", size = 338696, upload-time = "2025-10-06T14:11:22.847Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/9f/90360108e3b32bd76789088e99538febfea24a102380ae73827f62073543/yarl-1.22.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:131a085a53bfe839a477c0845acf21efc77457ba2bcf5899618136d64f3303a2", size = 387121, upload-time = "2025-10-06T14:11:24.889Z" }, + { url = "https://files.pythonhosted.org/packages/98/92/ab8d4657bd5b46a38094cfaea498f18bb70ce6b63508fd7e909bd1f93066/yarl-1.22.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:078a8aefd263f4d4f923a9677b942b445a2be970ca24548a8102689a3a8ab8da", size = 394080, upload-time = "2025-10-06T14:11:27.307Z" }, + { url = "https://files.pythonhosted.org/packages/f5/e7/d8c5a7752fef68205296201f8ec2bf718f5c805a7a7e9880576c67600658/yarl-1.22.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bca03b91c323036913993ff5c738d0842fc9c60c4648e5c8d98331526df89784", size = 372661, upload-time = "2025-10-06T14:11:29.387Z" }, + { url = "https://files.pythonhosted.org/packages/b6/2e/f4d26183c8db0bb82d491b072f3127fb8c381a6206a3a56332714b79b751/yarl-1.22.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:68986a61557d37bb90d3051a45b91fa3d5c516d177dfc6dd6f2f436a07ff2b6b", size = 364645, upload-time = "2025-10-06T14:11:31.423Z" }, + { url = "https://files.pythonhosted.org/packages/80/7c/428e5812e6b87cd00ee8e898328a62c95825bf37c7fa87f0b6bb2ad31304/yarl-1.22.0-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:4792b262d585ff0dff6bcb787f8492e40698443ec982a3568c2096433660c694", size = 355361, upload-time = "2025-10-06T14:11:33.055Z" }, + { url = "https://files.pythonhosted.org/packages/ec/2a/249405fd26776f8b13c067378ef4d7dd49c9098d1b6457cdd152a99e96a9/yarl-1.22.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ebd4549b108d732dba1d4ace67614b9545b21ece30937a63a65dd34efa19732d", size = 381451, upload-time = "2025-10-06T14:11:35.136Z" }, + { url = 
"https://files.pythonhosted.org/packages/67/a8/fb6b1adbe98cf1e2dd9fad71003d3a63a1bc22459c6e15f5714eb9323b93/yarl-1.22.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f87ac53513d22240c7d59203f25cc3beac1e574c6cd681bbfd321987b69f95fd", size = 383814, upload-time = "2025-10-06T14:11:37.094Z" }, + { url = "https://files.pythonhosted.org/packages/d9/f9/3aa2c0e480fb73e872ae2814c43bc1e734740bb0d54e8cb2a95925f98131/yarl-1.22.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:22b029f2881599e2f1b06f8f1db2ee63bd309e2293ba2d566e008ba12778b8da", size = 370799, upload-time = "2025-10-06T14:11:38.83Z" }, + { url = "https://files.pythonhosted.org/packages/50/3c/af9dba3b8b5eeb302f36f16f92791f3ea62e3f47763406abf6d5a4a3333b/yarl-1.22.0-cp314-cp314-win32.whl", hash = "sha256:6a635ea45ba4ea8238463b4f7d0e721bad669f80878b7bfd1f89266e2ae63da2", size = 82990, upload-time = "2025-10-06T14:11:40.624Z" }, + { url = "https://files.pythonhosted.org/packages/ac/30/ac3a0c5bdc1d6efd1b41fa24d4897a4329b3b1e98de9449679dd327af4f0/yarl-1.22.0-cp314-cp314-win_amd64.whl", hash = "sha256:0d6e6885777af0f110b0e5d7e5dda8b704efed3894da26220b7f3d887b839a79", size = 88292, upload-time = "2025-10-06T14:11:42.578Z" }, + { url = "https://files.pythonhosted.org/packages/df/0a/227ab4ff5b998a1b7410abc7b46c9b7a26b0ca9e86c34ba4b8d8bc7c63d5/yarl-1.22.0-cp314-cp314-win_arm64.whl", hash = "sha256:8218f4e98d3c10d683584cb40f0424f4b9fd6e95610232dd75e13743b070ee33", size = 82888, upload-time = "2025-10-06T14:11:44.863Z" }, + { url = "https://files.pythonhosted.org/packages/06/5e/a15eb13db90abd87dfbefb9760c0f3f257ac42a5cac7e75dbc23bed97a9f/yarl-1.22.0-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:45c2842ff0e0d1b35a6bf1cd6c690939dacb617a70827f715232b2e0494d55d1", size = 146223, upload-time = "2025-10-06T14:11:46.796Z" }, + { url = "https://files.pythonhosted.org/packages/18/82/9665c61910d4d84f41a5bf6837597c89e665fa88aa4941080704645932a9/yarl-1.22.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = 
"sha256:d947071e6ebcf2e2bee8fce76e10faca8f7a14808ca36a910263acaacef08eca", size = 95981, upload-time = "2025-10-06T14:11:48.845Z" }, + { url = "https://files.pythonhosted.org/packages/5d/9a/2f65743589809af4d0a6d3aa749343c4b5f4c380cc24a8e94a3c6625a808/yarl-1.22.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:334b8721303e61b00019474cc103bdac3d7b1f65e91f0bfedeec2d56dfe74b53", size = 97303, upload-time = "2025-10-06T14:11:50.897Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ab/5b13d3e157505c43c3b43b5a776cbf7b24a02bc4cccc40314771197e3508/yarl-1.22.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1e7ce67c34138a058fd092f67d07a72b8e31ff0c9236e751957465a24b28910c", size = 361820, upload-time = "2025-10-06T14:11:52.549Z" }, + { url = "https://files.pythonhosted.org/packages/fb/76/242a5ef4677615cf95330cfc1b4610e78184400699bdda0acb897ef5e49a/yarl-1.22.0-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d77e1b2c6d04711478cb1c4ab90db07f1609ccf06a287d5607fcd90dc9863acf", size = 323203, upload-time = "2025-10-06T14:11:54.225Z" }, + { url = "https://files.pythonhosted.org/packages/8c/96/475509110d3f0153b43d06164cf4195c64d16999e0c7e2d8a099adcd6907/yarl-1.22.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4647674b6150d2cae088fc07de2738a84b8bcedebef29802cf0b0a82ab6face", size = 363173, upload-time = "2025-10-06T14:11:56.069Z" }, + { url = "https://files.pythonhosted.org/packages/c9/66/59db471aecfbd559a1fd48aedd954435558cd98c7d0da8b03cc6c140a32c/yarl-1.22.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efb07073be061c8f79d03d04139a80ba33cbd390ca8f0297aae9cce6411e4c6b", size = 373562, upload-time = "2025-10-06T14:11:58.783Z" }, + { url = 
"https://files.pythonhosted.org/packages/03/1f/c5d94abc91557384719da10ff166b916107c1b45e4d0423a88457071dd88/yarl-1.22.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e51ac5435758ba97ad69617e13233da53908beccc6cfcd6c34bbed8dcbede486", size = 339828, upload-time = "2025-10-06T14:12:00.686Z" }, + { url = "https://files.pythonhosted.org/packages/5f/97/aa6a143d3afba17b6465733681c70cf175af89f76ec8d9286e08437a7454/yarl-1.22.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33e32a0dd0c8205efa8e83d04fc9f19313772b78522d1bdc7d9aed706bfd6138", size = 347551, upload-time = "2025-10-06T14:12:02.628Z" }, + { url = "https://files.pythonhosted.org/packages/43/3c/45a2b6d80195959239a7b2a8810506d4eea5487dce61c2a3393e7fc3c52e/yarl-1.22.0-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:bf4a21e58b9cde0e401e683ebd00f6ed30a06d14e93f7c8fd059f8b6e8f87b6a", size = 334512, upload-time = "2025-10-06T14:12:04.871Z" }, + { url = "https://files.pythonhosted.org/packages/86/a0/c2ab48d74599c7c84cb104ebd799c5813de252bea0f360ffc29d270c2caa/yarl-1.22.0-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:e4b582bab49ac33c8deb97e058cd67c2c50dac0dd134874106d9c774fd272529", size = 352400, upload-time = "2025-10-06T14:12:06.624Z" }, + { url = "https://files.pythonhosted.org/packages/32/75/f8919b2eafc929567d3d8411f72bdb1a2109c01caaab4ebfa5f8ffadc15b/yarl-1.22.0-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:0b5bcc1a9c4839e7e30b7b30dd47fe5e7e44fb7054ec29b5bb8d526aa1041093", size = 357140, upload-time = "2025-10-06T14:12:08.362Z" }, + { url = "https://files.pythonhosted.org/packages/cf/72/6a85bba382f22cf78add705d8c3731748397d986e197e53ecc7835e76de7/yarl-1.22.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:c0232bce2170103ec23c454e54a57008a9a72b5d1c3105dc2496750da8cfa47c", size = 341473, upload-time = "2025-10-06T14:12:10.994Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/18/55e6011f7c044dc80b98893060773cefcfdbf60dfefb8cb2f58b9bacbd83/yarl-1.22.0-cp314-cp314t-win32.whl", hash = "sha256:8009b3173bcd637be650922ac455946197d858b3630b6d8787aa9e5c4564533e", size = 89056, upload-time = "2025-10-06T14:12:13.317Z" }, + { url = "https://files.pythonhosted.org/packages/f9/86/0f0dccb6e59a9e7f122c5afd43568b1d31b8ab7dda5f1b01fb5c7025c9a9/yarl-1.22.0-cp314-cp314t-win_amd64.whl", hash = "sha256:9fb17ea16e972c63d25d4a97f016d235c78dd2344820eb35bc034bc32012ee27", size = 96292, upload-time = "2025-10-06T14:12:15.398Z" }, + { url = "https://files.pythonhosted.org/packages/48/b7/503c98092fb3b344a179579f55814b613c1fbb1c23b3ec14a7b008a66a6e/yarl-1.22.0-cp314-cp314t-win_arm64.whl", hash = "sha256:9f6d73c1436b934e3f01df1e1b21ff765cd1d28c77dfb9ace207f746d4610ee1", size = 85171, upload-time = "2025-10-06T14:12:16.935Z" }, + { url = "https://files.pythonhosted.org/packages/73/ae/b48f95715333080afb75a4504487cbe142cae1268afc482d06692d605ae6/yarl-1.22.0-py3-none-any.whl", hash = "sha256:1380560bdba02b6b6c90de54133c81c9f2a453dee9912fe58c1dcced1edb7cff", size = 46814, upload-time = "2025-10-06T14:12:53.872Z" }, +] + +[[package]] +name = "zipp" +version = "3.23.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, +]