Add eval 002011 (add a checkExistence/HEAD operation to the Employee resource)
to the azure-typespec-author skill evaluation set.

Files touched:
  * evaluate/.vally.yaml            - appends evals/002011.eval.yaml to the
                                      armtemplate suite's eval list
  * evaluate/evals/002011.eval.yaml - new standalone eval definition
  * evaluate/evals/eval.yaml        - adds the same stimulus, tagged
                                      suite: armtemplate, before 003001

NOTE(review): the two copies of the stimulus are not identical. max_tokens is
100000 in 002011.eval.yaml but 1000000 in eval.yaml, and only
002011.eval.yaml carries a skill-invocation grader. Confirm this is intended.
NOTE(review): top-level sequence items are flush with their parent key; the
deeper nesting shown below was reconstructed from structure - verify against
the working tree before applying.

diff --git a/.github/skills/azure-typespec-author/evaluate/.vally.yaml b/.github/skills/azure-typespec-author/evaluate/.vally.yaml
index e0308b86996..faecc0dbfee 100644
--- a/.github/skills/azure-typespec-author/evaluate/.vally.yaml
+++ b/.github/skills/azure-typespec-author/evaluate/.vally.yaml
@@ -36,7 +36,7 @@ suites:
   version-evolution:
     evals: ["evals/001005.eval.yaml","evals/001006.eval.yaml","evals/001007.eval.yaml","evals/001008.eval.yaml"]
   armtemplate:
-    evals: ["evals/002001.eval.yaml","evals/002002.eval.yaml","evals/002003.eval.yaml","evals/002004.eval.yaml","evals/002005.eval.yaml","evals/002006.eval.yaml","evals/002007.eval.yaml","evals/002008.eval.yaml","evals/002009.eval.yaml","evals/002010.eval.yaml"]
+    evals: ["evals/002001.eval.yaml","evals/002002.eval.yaml","evals/002003.eval.yaml","evals/002004.eval.yaml","evals/002005.eval.yaml","evals/002006.eval.yaml","evals/002007.eval.yaml","evals/002008.eval.yaml","evals/002009.eval.yaml","evals/002010.eval.yaml","evals/002011.eval.yaml"]
   longrunningoperation:
     evals: ["evals/003001.eval.yaml","evals/003002.eval.yaml"]
   decorators:
diff --git a/.github/skills/azure-typespec-author/evaluate/evals/002011.eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/002011.eval.yaml
new file mode 100644
index 00000000000..d2545d3a1b8
--- /dev/null
+++ b/.github/skills/azure-typespec-author/evaluate/evals/002011.eval.yaml
@@ -0,0 +1,69 @@
+name: azure-typespec-author-eval
+description: Evaluation suite for azure-typespec-author.
+version: "1.0"
+type: capability  # capability | regression
+
+# Root-level environment
+environment: azsdk-mcp
+
+# Execution configuration
+config:
+  runs: 1  # Trials per stimulus
+  timeout: "660s"  # Seconds per trial
+  model: claude-opus-4.6  #gpt-4o #claude-sonnet-4.6 # Model for agent execution
+  executor: copilot-sdk  # Which executor to use
+
+# Test cases
+stimuli:
+- prompt: |
+    In the Widget project, add a checkExistence (HEAD) operation to the existing Employee resource.
+  name: 002011-arm-add-check-existence-operation
+  constraints:
+    max_turns: 5
+    max_tokens: 100000
+  environment:
+    files:
+    - src: ../fixtures/Microsoft.Widget/Widget/employee.tsp
+      dest: employee.tsp
+    - src: ../fixtures/Microsoft.Widget/Widget/main.tsp
+      dest: main.tsp
+    - src: ../fixtures/Microsoft.Widget/Widget/package-lock.json
+      dest: package-lock.json
+    - src: ../fixtures/Microsoft.Widget/Widget/package.json
+      dest: package.json
+    - src: ../fixtures/Microsoft.Widget/Widget/readme.md
+      dest: readme.md
+    - src: ../fixtures/Microsoft.Widget/Widget/shared.tsp
+      dest: shared.tsp
+    - src: ../fixtures/Microsoft.Widget/Widget/tspconfig.yaml
+      dest: tspconfig.yaml
+    commands:
+    - node -e "const fs=require('fs');const t=process.env.FIXTURE_NODE_MODULES;if(t&&fs.existsSync(t)&&!fs.existsSync('node_modules')){fs.symlinkSync(t,'node_modules','junction')}"
+  graders:
+  - type: tool-calls
+    config:
+      required:
+      - edit
+      - azure-sdk-mcp-azsdk_typespec_generate_authoring_plan
+      - azure-sdk-mcp-azsdk_run_typespec_validation
+  - type: skill-invocation
+    config:
+      required:
+      - azure-typespec-author
+  - type: file-matches
+    config:
+      path: employee.tsp
+      pattern: 'checkExistence\s+is\s+ArmResourceCheckExistence\s*<\s*Employee\s*>'
+  - type: prompt
+    config:
+      scoring: scale_1_5
+      threshold: 1.0
+      prompt: 'Verify change scope and quality only: no unrelated edits, and the implemented result is semantically consistent with the task intent.'
+      model: claude-opus-4.6
+scoring:
+  weights:
+    tool-calls: 1
+    skill-invocation: 1
+    prompt: 1
+    file-matches: 1
+  threshold: 1.0
diff --git a/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml b/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml
index 686217fde5c..2522ee60459 100644
--- a/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml
+++ b/.github/skills/azure-typespec-author/evaluate/evals/eval.yaml
@@ -1355,6 +1355,49 @@ stimuli:
       threshold: 1
       prompt: 'Verify change scope and quality only: no unrelated edits, and the implemented result is semantically consistent with the task intent.'
       model: claude-opus-4.6
+- name: 002011-arm-add-check-existence-operation
+  tags:
+    suite: armtemplate
+  prompt: |
+    In the Widget project, add a checkExistence (HEAD) operation to the existing Employee resource.
+  constraints:
+    max_turns: 5
+    max_tokens: 1000000
+  environment:
+    files:
+    - src: ../fixtures/Microsoft.Widget/Widget/employee.tsp
+      dest: employee.tsp
+    - src: ../fixtures/Microsoft.Widget/Widget/main.tsp
+      dest: main.tsp
+    - src: ../fixtures/Microsoft.Widget/Widget/package-lock.json
+      dest: package-lock.json
+    - src: ../fixtures/Microsoft.Widget/Widget/package.json
+      dest: package.json
+    - src: ../fixtures/Microsoft.Widget/Widget/readme.md
+      dest: readme.md
+    - src: ../fixtures/Microsoft.Widget/Widget/shared.tsp
+      dest: shared.tsp
+    - src: ../fixtures/Microsoft.Widget/Widget/tspconfig.yaml
+      dest: tspconfig.yaml
+    commands:
+    - node -e "const fs=require('fs');const t=process.env.FIXTURE_NODE_MODULES;if(t&&fs.existsSync(t)&&!fs.existsSync('node_modules')){fs.symlinkSync(t,'node_modules','junction')}"
+  graders:
+  - type: tool-calls
+    config:
+      required:
+      - edit
+      - azure-sdk-mcp-azsdk_typespec_generate_authoring_plan
+      - azure-sdk-mcp-azsdk_run_typespec_validation
+  - type: file-matches
+    config:
+      path: employee.tsp
+      pattern: 'checkExistence\s+is\s+ArmResourceCheckExistence\s*<\s*Employee\s*>'
+  - type: prompt
+    config:
+      scoring: scale_1_5
+      threshold: 1
+      prompt: 'Verify change scope and quality only: no unrelated edits, and the implemented result is semantically consistent with the task intent.'
+      model: claude-opus-4.6
 - name: 003001-arm-action-lro
   prompt: |
     Add a move async operation to move employee, operation's request is MoveRequest, response is MoveResponse