From d205ffa8bf55459a53502b0f9d63dc26c1ea1dab Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 03:17:58 +0000 Subject: [PATCH 1/9] Add eval command --- cmd/eval/eval.go | 443 ++++++++++++++++++++++++++++++++++++ cmd/eval/eval_test.go | 281 +++++++++++++++++++++++ cmd/eval/similarity.go | 84 +++++++ cmd/eval/similarity_test.go | 145 ++++++++++++ cmd/root.go | 2 + cmd/root_test.go | 1 + cmd/run/run.go | 40 ++-- failing_test_prompt.yml | 23 ++ pkg/prompt/prompt.go | 99 ++++++++ pkg/prompt/prompt_test.go | 94 ++++++++ sample_prompt.yml | 22 ++ 11 files changed, 1208 insertions(+), 26 deletions(-) create mode 100644 cmd/eval/eval.go create mode 100644 cmd/eval/eval_test.go create mode 100644 cmd/eval/similarity.go create mode 100644 cmd/eval/similarity_test.go create mode 100644 failing_test_prompt.yml create mode 100644 pkg/prompt/prompt.go create mode 100644 pkg/prompt/prompt_test.go create mode 100644 sample_prompt.yml diff --git a/cmd/eval/eval.go b/cmd/eval/eval.go new file mode 100644 index 00000000..1e4cd9f7 --- /dev/null +++ b/cmd/eval/eval.go @@ -0,0 +1,443 @@ +// Package eval provides a gh command to evaluate prompts against GitHub models. +package eval + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/MakeNowJust/heredoc" + "github.com/github/gh-models/internal/azuremodels" + "github.com/github/gh-models/pkg/command" + "github.com/github/gh-models/pkg/prompt" + "github.com/github/gh-models/pkg/util" + "github.com/spf13/cobra" +) + +// EvaluationPromptFile represents the structure of a prompt.yml file for evaluation +// It extends the base prompt.File with evaluation-specific fields +type EvaluationPromptFile = prompt.File + +// TestResult represents the result of running a test case +type TestResult struct { + TestCase map[string]interface{} `json:"testCase"` + ModelResponse string `json:"modelResponse"` + EvaluationResults []EvaluationResult `json:"evaluationResults"` +} + +// EvaluationResult represents the result of a single evaluator +type EvaluationResult struct { + EvaluatorName string `json:"evaluatorName"` + Score float64 `json:"score"` + Passed bool `json:"passed"` + Details string `json:"details,omitempty"` +} + +// NewEvalCommand returns a new command to evaluate prompts against models +func NewEvalCommand(cfg *command.Config) *cobra.Command { + cmd := &cobra.Command{ + Use: "eval", + Short: "Evaluate prompts using test data and evaluators", + Long: heredoc.Docf(` + Runs evaluation tests against a model using a prompt.yml file. + + The prompt.yml file should contain: + - Model configuration and parameters + - Test data with input variables + - Messages with templated content + - Evaluators to assess model responses + + Example prompt.yml structure: + name: My Evaluation + model: gpt-4o + testData: + - input: "Hello world" + expected: "Hello there" + messages: + - role: user + content: "Respond to: {{input}}" + evaluators: + - name: contains-hello + string: + contains: "hello" + `, "`"), + Example: "gh models eval prompt.yml", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + promptFilePath := args[0] + + // Load the evaluation prompt file + evalFile, err := loadEvaluationPromptFile(promptFilePath) + if err != nil { + return fmt.Errorf("failed to load prompt file: %w", err) + } + + // Run evaluation + handler := &evalCommandHandler{ + cfg: cfg, + client: cfg.Client, + evalFile: evalFile, + similarityEvaluator: NewSimilarityEvaluator(), + } + + return handler.runEvaluation(cmd.Context()) + }, + } + + return cmd +} + +type evalCommandHandler struct { + cfg *command.Config + client azuremodels.Client + evalFile *EvaluationPromptFile + similarityEvaluator *SimilarityEvaluator +} + +func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) { + evalFile, err := prompt.LoadFromFile(filePath) + if err != nil { + return nil, fmt.Errorf("failed to load prompt file: %w", err) + } + + // Debug output + fmt.Printf("DEBUG: Loaded file with name='%s', model='%s', testData count=%d\n", + evalFile.Name, evalFile.Model, len(evalFile.TestData)) + + return evalFile, nil +} + +func (h *evalCommandHandler) runEvaluation(ctx context.Context) error { + h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name)) + h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description)) + h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model)) + h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData))) + h.cfg.WriteToOut("\n") + + var allResults []TestResult + passedTests := 0 + totalTests := len(h.evalFile.TestData) + + for i, testCase := range h.evalFile.TestData { + h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests)) + + result, err := h.runTestCase(ctx, testCase) + if err != nil { + return fmt.Errorf("test case %d failed: %w", i+1, err) + } + + allResults = append(allResults, result) + + // Check if all evaluators passed + testPassed := true + for _, evalResult := range result.EvaluationResults { + if !evalResult.Passed { + testPassed = false + break + } + } + + if testPassed { + passedTests++ + h.cfg.WriteToOut(" ✓ PASSED\n") + } else { + h.cfg.WriteToOut(" ✗ FAILED\n") + // Show model response when test fails + h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", result.ModelResponse)) + } + + // Show evaluation details + for _, evalResult := range result.EvaluationResults { + status := "✓" + if !evalResult.Passed { + status = "✗" + } + h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n", + status, evalResult.EvaluatorName, evalResult.Score)) + if evalResult.Details != "" { + h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details)) + } + } + h.cfg.WriteToOut("\n") + } + + // Summary + h.cfg.WriteToOut("Evaluation Summary:\n") + h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n", + passedTests, totalTests, float64(passedTests)/float64(totalTests)*100)) + + if passedTests == totalTests { + h.cfg.WriteToOut("🎉 All tests passed!\n") + } else { + h.cfg.WriteToOut("❌ Some tests failed.\n") + } + + return nil +} + +func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) { + // Template the messages with test case data + messages, err := h.templateMessages(testCase) + if err != nil { + return TestResult{}, fmt.Errorf("failed to template messages: %w", err) + } + + // Call the model + response, err := h.callModel(ctx, messages) + if err != nil { + return TestResult{}, fmt.Errorf("failed to call model: %w", err) + } + + // Run evaluators + evalResults, err := h.runEvaluators(ctx, testCase, response) + if err != nil { + return TestResult{}, fmt.Errorf("failed to run evaluators: %w", err) + } + + return TestResult{ + TestCase: testCase, + ModelResponse: response, + EvaluationResults: evalResults, + }, nil +} + +func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) ([]azuremodels.ChatMessage, error) { + var messages []azuremodels.ChatMessage + + for _, msg := range h.evalFile.Messages { + content, err := h.templateString(msg.Content, testCase) + if err != nil { + return nil, fmt.Errorf("failed to template message content: %w", err) + } + + var role azuremodels.ChatMessageRole + switch strings.ToLower(msg.Role) { + case "system": + role = azuremodels.ChatMessageRoleSystem + case "user": + role = azuremodels.ChatMessageRoleUser + case "assistant": + role = azuremodels.ChatMessageRoleAssistant + default: + return nil, fmt.Errorf("unknown message role: %s", msg.Role) + } + + messages = append(messages, azuremodels.ChatMessage{ + Role: role, + Content: util.Ptr(content), + }) + } + + return messages, nil +} + +func (h *evalCommandHandler) templateString(templateStr string, data map[string]interface{}) (string, error) { + return prompt.TemplateString(templateStr, data) +} + +func (h *evalCommandHandler) callModel(ctx context.Context, messages []azuremodels.ChatMessage) (string, error) { + req := azuremodels.ChatCompletionOptions{ + Messages: messages, + Model: h.evalFile.Model, + Stream: false, + } + + // Apply model parameters + if h.evalFile.ModelParameters.MaxTokens != nil { + req.MaxTokens = h.evalFile.ModelParameters.MaxTokens + } + if h.evalFile.ModelParameters.Temperature != nil { + req.Temperature = h.evalFile.ModelParameters.Temperature + } + if h.evalFile.ModelParameters.TopP != nil { + req.TopP = h.evalFile.ModelParameters.TopP + } + + resp, err := h.client.GetChatCompletionStream(ctx, req) + if err != nil { + return "", err + } + + // For non-streaming requests, we should get a single response + var content strings.Builder + for { + completion, err := resp.Reader.Read() + if err != nil { + if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "EOF") { + break + } + return "", err + } + + for _, choice := range completion.Choices { + if choice.Delta != nil && choice.Delta.Content != nil { + content.WriteString(*choice.Delta.Content) + } + if choice.Message != nil && choice.Message.Content != nil { + content.WriteString(*choice.Message.Content) + } + } + } + + return strings.TrimSpace(content.String()), nil +} + +func (h *evalCommandHandler) runEvaluators(ctx context.Context, testCase map[string]interface{}, response string) ([]EvaluationResult, error) { + var results []EvaluationResult + + for _, evaluator := range h.evalFile.Evaluators { + result, err := h.runSingleEvaluator(ctx, evaluator, testCase, response) + if err != nil { + return nil, fmt.Errorf("evaluator %s failed: %w", evaluator.Name, err) + } + results = append(results, result) + } + + return results, nil +} + +func (h *evalCommandHandler) runSingleEvaluator(ctx context.Context, evaluator prompt.Evaluator, testCase map[string]interface{}, response string) (EvaluationResult, error) { + switch { + case evaluator.String != nil: + return h.runStringEvaluator(evaluator.Name, *evaluator.String, response) + case evaluator.LLM != nil: + return h.runLLMEvaluator(ctx, evaluator.Name, *evaluator.LLM, testCase, response) + case evaluator.Uses != "": + return h.runPluginEvaluator(ctx, evaluator.Name, evaluator.Uses, testCase, response) + default: + return EvaluationResult{}, fmt.Errorf("no evaluation method specified for evaluator %s", evaluator.Name) + } +} + +func (h *evalCommandHandler) runStringEvaluator(name string, eval prompt.StringEvaluator, response string) (EvaluationResult, error) { + var passed bool + var details string + + switch { + case eval.Equals != "": + passed = response == eval.Equals + details = fmt.Sprintf("Expected exact match: '%s'", eval.Equals) + case eval.Contains != "": + passed = strings.Contains(strings.ToLower(response), strings.ToLower(eval.Contains)) + details = fmt.Sprintf("Expected to contain: '%s'", eval.Contains) + case eval.StartsWith != "": + passed = strings.HasPrefix(strings.ToLower(response), strings.ToLower(eval.StartsWith)) + details = fmt.Sprintf("Expected to start with: '%s'", eval.StartsWith) + case eval.EndsWith != "": + passed = strings.HasSuffix(strings.ToLower(response), strings.ToLower(eval.EndsWith)) + details = fmt.Sprintf("Expected to end with: '%s'", eval.EndsWith) + default: + return EvaluationResult{}, errors.New("no string evaluation criteria specified") + } + + score := 0.0 + if passed { + score = 1.0 + } + + return EvaluationResult{ + EvaluatorName: name, + Score: score, + Passed: passed, + Details: details, + }, nil +} + +func (h *evalCommandHandler) runLLMEvaluator(ctx context.Context, name string, eval prompt.LLMEvaluator, testCase map[string]interface{}, response string) (EvaluationResult, error) { + // Template the evaluation prompt + evalData := make(map[string]interface{}) + for k, v := range testCase { + evalData[k] = v + } + evalData["completion"] = response + + promptContent, err := h.templateString(eval.Prompt, evalData) + if err != nil { + return EvaluationResult{}, fmt.Errorf("failed to template evaluation prompt: %w", err) + } + + // Prepare messages for evaluation + var messages []azuremodels.ChatMessage + if eval.SystemPrompt != "" { + messages = append(messages, azuremodels.ChatMessage{ + Role: azuremodels.ChatMessageRoleSystem, + Content: util.Ptr(eval.SystemPrompt), + }) + } + messages = append(messages, azuremodels.ChatMessage{ + Role: azuremodels.ChatMessageRoleUser, + Content: util.Ptr(promptContent), + }) + + // Call the evaluation model + req := azuremodels.ChatCompletionOptions{ + Messages: messages, + Model: eval.ModelID, + Stream: false, + } + + resp, err := h.client.GetChatCompletionStream(ctx, req) + if err != nil { + return EvaluationResult{}, fmt.Errorf("failed to call evaluation model: %w", err) + } + + var evalResponse strings.Builder + for { + completion, err := resp.Reader.Read() + if err != nil { + if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "EOF") { + break + } + return EvaluationResult{}, err + } + + for _, choice := range completion.Choices { + if choice.Delta != nil && choice.Delta.Content != nil { + evalResponse.WriteString(*choice.Delta.Content) + } + if choice.Message != nil && choice.Message.Content != nil { + evalResponse.WriteString(*choice.Message.Content) + } + } + } + + // Match response to choices + evalResponseText := strings.TrimSpace(strings.ToLower(evalResponse.String())) + for _, choice := range eval.Choices { + if strings.Contains(evalResponseText, strings.ToLower(choice.Choice)) { + return EvaluationResult{ + EvaluatorName: name, + Score: choice.Score, + Passed: choice.Score > 0, + Details: fmt.Sprintf("LLM evaluation matched choice: '%s'", choice.Choice), + }, nil + } + } + + // No match found + return EvaluationResult{ + EvaluatorName: name, + Score: 0.0, + Passed: false, + Details: fmt.Sprintf("LLM evaluation response '%s' did not match any defined choices", evalResponseText), + }, nil +} + +func (h *evalCommandHandler) runPluginEvaluator(ctx context.Context, name, plugin string, testCase map[string]interface{}, response string) (EvaluationResult, error) { + // For now, we'll implement basic support for github/similarity + if plugin == "github/similarity" { + return h.runSimilarityEvaluator(name, testCase, response) + } + + return EvaluationResult{ + EvaluatorName: name, + Score: 0.0, + Passed: false, + Details: fmt.Sprintf("Plugin evaluator '%s' not yet implemented", plugin), + }, nil +} + +func (h *evalCommandHandler) runSimilarityEvaluator(name string, testCase map[string]interface{}, response string) (EvaluationResult, error) { + return h.similarityEvaluator.Evaluate(name, testCase, response) +} diff --git a/cmd/eval/eval_test.go b/cmd/eval/eval_test.go new file mode 100644 index 00000000..fbc0cd42 --- /dev/null +++ b/cmd/eval/eval_test.go @@ -0,0 +1,281 @@ +package eval + +import ( + "bytes" + "context" + "os" + "path/filepath" + "testing" + + "github.com/github/gh-models/internal/azuremodels" + "github.com/github/gh-models/internal/sse" + "github.com/github/gh-models/pkg/command" + "github.com/github/gh-models/pkg/prompt" + "github.com/stretchr/testify/require" +) + +func TestEval(t *testing.T) { + t.Run("loads and parses evaluation prompt file", func(t *testing.T) { + const yamlBody = ` +name: Test Evaluation +description: A test evaluation +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 100 +testData: + - input: "hello" + expected: "hello world" + - input: "goodbye" + expected: "goodbye world" +messages: + - role: system + content: You are a helpful assistant. + - role: user + content: "Please respond to: {{input}}" +evaluators: + - name: contains-world + string: + contains: "world" + - name: similarity-check + uses: github/similarity +` + + tmpDir := t.TempDir() + promptFile := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFile, []byte(yamlBody), 0644) + require.NoError(t, err) + + evalFile, err := prompt.LoadFromFile(promptFile) + require.NoError(t, err) + require.Equal(t, "Test Evaluation", evalFile.Name) + require.Equal(t, "A test evaluation", evalFile.Description) + require.Equal(t, "openai/gpt-4o", evalFile.Model) + require.Equal(t, 0.5, *evalFile.ModelParameters.Temperature) + require.Equal(t, 100, *evalFile.ModelParameters.MaxTokens) + require.Len(t, evalFile.TestData, 2) + require.Len(t, evalFile.Messages, 2) + require.Len(t, evalFile.Evaluators, 2) + }) + + t.Run("templates messages correctly", func(t *testing.T) { + evalFile := &prompt.File{ + Messages: []prompt.Message{ + {Role: "system", Content: "You are helpful."}, + {Role: "user", Content: "Process {{.input}} and return {{.expected}}"}, + }, + } + + handler := &evalCommandHandler{evalFile: evalFile} + testCase := map[string]interface{}{ + "input": "hello", + "expected": "world", + } + + messages, err := handler.templateMessages(testCase) + require.NoError(t, err) + require.Len(t, messages, 2) + require.Equal(t, "You are helpful.", *messages[0].Content) + require.Equal(t, "Process hello and return world", *messages[1].Content) + }) + + t.Run("string evaluator works correctly", func(t *testing.T) { + handler := &evalCommandHandler{} + + tests := []struct { + name string + evaluator prompt.StringEvaluator + response string + expected bool + }{ + { + name: "contains match", + evaluator: prompt.StringEvaluator{Contains: "world"}, + response: "hello world", + expected: true, + }, + { + name: "contains no match", + evaluator: prompt.StringEvaluator{Contains: "world"}, + response: "hello there", + expected: false, + }, + { + name: "equals match", + evaluator: prompt.StringEvaluator{Equals: "exact"}, + response: "exact", + expected: true, + }, + { + name: "equals no match", + evaluator: prompt.StringEvaluator{Equals: "exact"}, + response: "not exact", + expected: false, + }, + { + name: "starts with match", + evaluator: prompt.StringEvaluator{StartsWith: "hello"}, + response: "hello world", + expected: true, + }, + { + name: "ends with match", + evaluator: prompt.StringEvaluator{EndsWith: "world"}, + response: "hello world", + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := handler.runStringEvaluator("test", tt.evaluator, tt.response) + require.NoError(t, err) + require.Equal(t, tt.expected, result.Passed) + if tt.expected { + require.Equal(t, 1.0, result.Score) + } else { + require.Equal(t, 0.0, result.Score) + } + }) + } + }) + + t.Run("similarity evaluator works", func(t *testing.T) { + handler := &evalCommandHandler{} + testCase := map[string]interface{}{ + "expected": "hello world", + } + + result, err := handler.runSimilarityEvaluator("similarity", testCase, "hello world") + require.NoError(t, err) + require.True(t, result.Passed) + require.Equal(t, 1.0, result.Score) + + result, err = handler.runSimilarityEvaluator("similarity", testCase, "completely different text") + require.NoError(t, err) + require.False(t, result.Passed) + require.True(t, result.Score < 0.7) + }) + + t.Run("command creation works", func(t *testing.T) { + out := new(bytes.Buffer) + client := azuremodels.NewMockClient() + cfg := command.NewConfig(out, out, client, true, 100) + + cmd := NewEvalCommand(cfg) + require.Equal(t, "eval", cmd.Use) + require.Contains(t, cmd.Short, "Evaluate prompts") + }) + + t.Run("integration test with mock client", func(t *testing.T) { + const yamlBody = ` +name: Mock Test +description: Test with mock client +model: openai/test-model +testData: + - input: "test input" + expected: "test response" +messages: + - role: user + content: "{{.input}}" +evaluators: + - name: contains-test + string: + contains: "test" +` + + tmpDir := t.TempDir() + promptFile := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFile, []byte(yamlBody), 0644) + require.NoError(t, err) + + client := azuremodels.NewMockClient() + + // Mock a simple response + client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) { + // Create a mock reader that returns "test response" + reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{ + { + Choices: []azuremodels.ChatChoice{ + { + Message: &azuremodels.ChatChoiceMessage{ + Content: func() *string { s := "test response"; return &s }(), + }, + }, + }, + }, + }) + return &azuremodels.ChatCompletionResponse{Reader: reader}, nil + } + + out := new(bytes.Buffer) + cfg := command.NewConfig(out, out, client, true, 100) + + cmd := NewEvalCommand(cfg) + cmd.SetArgs([]string{promptFile}) + + err = cmd.Execute() + require.NoError(t, err) + + output := out.String() + require.Contains(t, output, "Mock Test") + require.Contains(t, output, "Running test case") + require.Contains(t, output, "PASSED") + }) + + t.Run("logs model response when test fails", func(t *testing.T) { + const yamlBody = ` +name: Failing Test +description: Test that fails to check model response logging +model: openai/test-model +testData: + - input: "test input" + expected: "expected but not returned" +messages: + - role: user + content: "{{.input}}" +evaluators: + - name: contains-nonexistent + string: + contains: "nonexistent text" +` + + tmpDir := t.TempDir() + promptFile := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFile, []byte(yamlBody), 0644) + require.NoError(t, err) + + client := azuremodels.NewMockClient() + + // Mock a response that will fail the evaluator + client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) { + reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{ + { + Choices: []azuremodels.ChatChoice{ + { + Message: &azuremodels.ChatChoiceMessage{ + Content: func() *string { s := "actual model response"; return &s }(), + }, + }, + }, + }, + }) + return &azuremodels.ChatCompletionResponse{Reader: reader}, nil + } + + out := new(bytes.Buffer) + cfg := command.NewConfig(out, out, client, true, 100) + + cmd := NewEvalCommand(cfg) + cmd.SetArgs([]string{promptFile}) + + err = cmd.Execute() + require.NoError(t, err) + + output := out.String() + require.Contains(t, output, "Failing Test") + require.Contains(t, output, "Running test case") + require.Contains(t, output, "FAILED") + require.Contains(t, output, "Model Response: actual model response") + }) +} diff --git a/cmd/eval/similarity.go b/cmd/eval/similarity.go new file mode 100644 index 00000000..d2d2195b --- /dev/null +++ b/cmd/eval/similarity.go @@ -0,0 +1,84 @@ +package eval + +import ( + "fmt" + "strings" +) + +// SimilarityEvaluator handles similarity-based evaluation +type SimilarityEvaluator struct{} + +// NewSimilarityEvaluator creates a new similarity evaluator +func NewSimilarityEvaluator() *SimilarityEvaluator { + return &SimilarityEvaluator{} +} + +// Evaluate runs similarity evaluation between expected and actual values +func (s *SimilarityEvaluator) Evaluate(name string, testCase map[string]interface{}, response string) (EvaluationResult, error) { + // Simple similarity check using expected value if present + expected, ok := testCase["expected"] + if !ok { + return EvaluationResult{ + EvaluatorName: name, + Score: 0.0, + Passed: false, + Details: "No 'expected' value found in test case for similarity evaluation", + }, nil + } + + expectedStr, ok := expected.(string) + if !ok { + return EvaluationResult{ + EvaluatorName: name, + Score: 0.0, + Passed: false, + Details: "Expected value is not a string", + }, nil + } + + // Simple similarity metric (could be enhanced with more sophisticated algorithms) + similarity := s.calculateSimpleSimilarity(expectedStr, response) + passed := similarity > 0.4 // 40% similarity threshold + + return EvaluationResult{ + EvaluatorName: name, + Score: similarity, + Passed: passed, + Details: fmt.Sprintf("Similarity score: %.2f (threshold: 0.4)", similarity), + }, nil +} + +// calculateSimpleSimilarity computes a simple word-based similarity score +func (s *SimilarityEvaluator) calculateSimpleSimilarity(expected, actual string) float64 { + // Simple word-based similarity + expectedWords := strings.Fields(strings.ToLower(expected)) + actualWords := strings.Fields(strings.ToLower(actual)) + + if len(expectedWords) == 0 && len(actualWords) == 0 { + return 1.0 + } + if len(expectedWords) == 0 || len(actualWords) == 0 { + return 0.0 + } + + // Count matching words + expectedWordSet := make(map[string]bool) + for _, word := range expectedWords { + expectedWordSet[word] = true + } + + matchingWords := 0 + for _, word := range actualWords { + if expectedWordSet[word] { + matchingWords++ + } + } + + // Jaccard similarity + totalWords := len(expectedWords) + len(actualWords) - matchingWords + if totalWords == 0 { + return 1.0 + } + + return float64(matchingWords) / float64(totalWords) +} diff --git a/cmd/eval/similarity_test.go b/cmd/eval/similarity_test.go new file mode 100644 index 00000000..d841f0d8 --- /dev/null +++ b/cmd/eval/similarity_test.go @@ -0,0 +1,145 @@ +package eval + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestSimilarityEvaluator(t *testing.T) { + evaluator := NewSimilarityEvaluator() + + t.Run("exact match", func(t *testing.T) { + testCase := map[string]interface{}{ + "expected": "hello world", + } + + result, err := evaluator.Evaluate("similarity", testCase, "hello world") + require.NoError(t, err) + require.True(t, result.Passed) + require.Equal(t, 1.0, result.Score) + require.Contains(t, result.Details, "1.00") + }) + + t.Run("high similarity", func(t *testing.T) { + testCase := map[string]interface{}{ + "expected": "hello world test", + } + + result, err := evaluator.Evaluate("similarity", testCase, "hello world example") + require.NoError(t, err) + require.True(t, result.Passed) + require.True(t, result.Score > 0.4) + }) + + t.Run("low similarity", func(t *testing.T) { + testCase := map[string]interface{}{ + "expected": "hello world", + } + + result, err := evaluator.Evaluate("similarity", testCase, "completely different text") + require.NoError(t, err) + require.False(t, result.Passed) + require.True(t, result.Score < 0.7) + }) + + t.Run("missing expected value", func(t *testing.T) { + testCase := map[string]interface{}{} + + result, err := evaluator.Evaluate("similarity", testCase, "hello world") + require.NoError(t, err) + require.False(t, result.Passed) + require.Equal(t, 0.0, result.Score) + require.Contains(t, result.Details, "No 'expected' value found") + }) + + t.Run("non-string expected value", func(t *testing.T) { + testCase := map[string]interface{}{ + "expected": 123, + } + + result, err := evaluator.Evaluate("similarity", testCase, "hello world") + require.NoError(t, err) + require.False(t, result.Passed) + require.Equal(t, 0.0, result.Score) + require.Contains(t, result.Details, "Expected value is not a string") + }) + + t.Run("empty strings", func(t *testing.T) { + testCase := map[string]interface{}{ + "expected": "", + } + + result, err := evaluator.Evaluate("similarity", testCase, "") + require.NoError(t, err) + require.True(t, result.Passed) + require.Equal(t, 1.0, result.Score) + }) + + t.Run("case insensitive", func(t *testing.T) { + testCase := map[string]interface{}{ + "expected": "Hello World", + } + + result, err := evaluator.Evaluate("similarity", testCase, "hello world") + require.NoError(t, err) + require.True(t, result.Passed) + require.Equal(t, 1.0, result.Score) + }) +} + +func TestCalculateSimpleSimilarity(t *testing.T) { + evaluator := NewSimilarityEvaluator() + + tests := []struct { + name string + expected string + actual string + minScore float64 + maxScore float64 + }{ + { + name: "identical strings", + expected: "hello world", + actual: "hello world", + minScore: 1.0, + maxScore: 1.0, + }, + { + name: "partial overlap", + expected: "hello world test", + actual: "hello world example", + minScore: 0.4, + maxScore: 0.6, + }, + { + name: "no overlap", + expected: "hello world", + actual: "foo bar", + minScore: 0.0, + maxScore: 0.0, + }, + { + name: "empty strings", + expected: "", + actual: "", + minScore: 1.0, + maxScore: 1.0, + }, + { + name: "one empty", + expected: "hello", + actual: "", + minScore: 0.0, + maxScore: 0.0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + score := evaluator.calculateSimpleSimilarity(tt.expected, tt.actual) + require.GreaterOrEqual(t, score, tt.minScore) + require.LessOrEqual(t, score, tt.maxScore) + }) + } +} diff --git a/cmd/root.go b/cmd/root.go index 9e9b94ff..b27dd305 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -8,6 +8,7 @@ import ( "github.com/MakeNowJust/heredoc" "github.com/cli/go-gh/v2/pkg/auth" "github.com/cli/go-gh/v2/pkg/term" + "github.com/github/gh-models/cmd/eval" "github.com/github/gh-models/cmd/list" "github.com/github/gh-models/cmd/run" "github.com/github/gh-models/cmd/view" @@ -54,6 +55,7 @@ func NewRootCommand() *cobra.Command { cfg := command.NewConfigWithTerminal(terminal, client) + cmd.AddCommand(eval.NewEvalCommand(cfg)) cmd.AddCommand(list.NewListCommand(cfg)) cmd.AddCommand(run.NewRunCommand(cfg)) cmd.AddCommand(view.NewViewCommand(cfg)) diff --git a/cmd/root_test.go b/cmd/root_test.go index d05b1cdd..817701af 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -19,6 +19,7 @@ func TestRoot(t *testing.T) { require.NoError(t, err) output := buf.String() require.Regexp(t, regexp.MustCompile(`Usage:\n\s+gh models \[command\]`), output) + require.Regexp(t, regexp.MustCompile(`eval\s+Evaluate prompts using test data and evaluators`), output) require.Regexp(t, regexp.MustCompile(`list\s+List available models`), output) require.Regexp(t, regexp.MustCompile(`run\s+Run inference with the specified model`), output) require.Regexp(t, regexp.MustCompile(`view\s+View details about a model`), output) diff --git a/cmd/run/run.go b/cmd/run/run.go index 7a3a885c..fca6faad 100644 --- a/cmd/run/run.go +++ b/cmd/run/run.go @@ -18,10 +18,10 @@ import ( "github.com/github/gh-models/internal/azuremodels" "github.com/github/gh-models/internal/sse" "github.com/github/gh-models/pkg/command" + "github.com/github/gh-models/pkg/prompt" "github.com/github/gh-models/pkg/util" "github.com/spf13/cobra" "github.com/spf13/pflag" - "gopkg.in/yaml.v3" ) // ModelParameters represents the parameters that can be set for a model run. @@ -189,22 +189,6 @@ func isPipe(r io.Reader) bool { return false } -// promptFile mirrors the format of .prompt.yml -type promptFile struct { - Name string `yaml:"name"` - Description string `yaml:"description"` - Model string `yaml:"model"` - ModelParameters struct { - MaxTokens *int `yaml:"maxTokens"` - Temperature *float64 `yaml:"temperature"` - TopP *float64 `yaml:"topP"` - } `yaml:"modelParameters"` - Messages []struct { - Role string `yaml:"role"` - Content string `yaml:"content"` - } `yaml:"messages"` -} - // NewRunCommand returns a new gh command for running a model. func NewRunCommand(cfg *command.Config) *cobra.Command { cmd := &cobra.Command{ @@ -226,17 +210,13 @@ func NewRunCommand(cfg *command.Config) *cobra.Command { Args: cobra.ArbitraryArgs, RunE: func(cmd *cobra.Command, args []string) error { filePath, _ := cmd.Flags().GetString("file") - var pf *promptFile + var pf *prompt.File if filePath != "" { - b, err := os.ReadFile(filePath) + var err error + pf, err = prompt.LoadFromFile(filePath) if err != nil { return err } - p := promptFile{} - if err := yaml.Unmarshal(b, &p); err != nil { - return err - } - pf = &p // Inject model name as the first positional arg if user didn't supply one if pf.Model != "" && len(args) == 0 { args = append([]string{pf.Model}, args...) @@ -297,13 +277,21 @@ func NewRunCommand(cfg *command.Config) *cobra.Command { } else { interactiveMode = false + // Template the messages with the input + templateData := map[string]interface{}{ + "input": initialPrompt, + } + for _, m := range pf.Messages { - content := m.Content + content, err := prompt.TemplateString(m.Content, templateData) + if err != nil { + return err + } + switch strings.ToLower(m.Role) { case "system": conversation.systemPrompt = content case "user": - content = strings.ReplaceAll(content, "{{input}}", initialPrompt) conversation.AddMessage(azuremodels.ChatMessageRoleUser, content) case "assistant": conversation.AddMessage(azuremodels.ChatMessageRoleAssistant, content) diff --git a/failing_test_prompt.yml b/failing_test_prompt.yml new file mode 100644 index 00000000..9ae76d6a --- /dev/null +++ b/failing_test_prompt.yml @@ -0,0 +1,23 @@ +name: Failing Evaluation Test +description: Test that will fail to demonstrate model response logging +model: openai/gpt-4o +modelParameters: + temperature: 0.7 + maxTokens: 150 +testData: + - input: "What is the capital of France?" + expected: "Paris" + - input: "What is 2 + 2?" + expected: "4" +messages: + - role: system + content: You are a helpful assistant. + - role: user + content: "{{.input}}" +evaluators: + - name: contains-impossible + string: + contains: "this-text-will-never-appear-in-any-response" + - name: starts-with-wrong + string: + startsWith: "ZZZZZ" diff --git a/pkg/prompt/prompt.go b/pkg/prompt/prompt.go new file mode 100644 index 00000000..0fba1c3d --- /dev/null +++ b/pkg/prompt/prompt.go @@ -0,0 +1,99 @@ +// Package prompt provides shared types and utilities for working with .prompt.yml files +package prompt + +import ( + "os" + "strings" + "text/template" + + "gopkg.in/yaml.v3" +) + +// File represents the structure of a .prompt.yml file +type File struct { + Name string `yaml:"name"` + Description string `yaml:"description"` + Model string `yaml:"model"` + ModelParameters ModelParameters `yaml:"modelParameters"` + Messages []Message `yaml:"messages"` + // TestData and Evaluators are only used by eval command + TestData []map[string]interface{} `yaml:"testData,omitempty"` + Evaluators []Evaluator `yaml:"evaluators,omitempty"` +} + +// ModelParameters represents model configuration parameters +type ModelParameters struct { + MaxTokens *int `yaml:"maxTokens"` + Temperature *float64 `yaml:"temperature"` + TopP *float64 `yaml:"topP"` +} + +// Message represents a conversation message +type Message struct { + Role string `yaml:"role"` + Content string `yaml:"content"` +} + +// Evaluator represents an evaluation method (only used by eval command) +type Evaluator struct { + Name string `yaml:"name"` + String *StringEvaluator `yaml:"string,omitempty"` + LLM *LLMEvaluator `yaml:"llm,omitempty"` + Uses string `yaml:"uses,omitempty"` +} + +// StringEvaluator represents string-based evaluation +type StringEvaluator struct { + EndsWith string `yaml:"endsWith,omitempty"` + StartsWith string `yaml:"startsWith,omitempty"` + Contains string `yaml:"contains,omitempty"` + Equals string `yaml:"equals,omitempty"` +} + +// LLMEvaluator represents LLM-based evaluation +type LLMEvaluator struct { + ModelID string `yaml:"modelId"` + Prompt string `yaml:"prompt"` + Choices []Choice `yaml:"choices"` + SystemPrompt string `yaml:"systemPrompt,omitempty"` +} + +// Choice represents a scoring choice for LLM evaluation +type Choice struct { + Choice string `yaml:"choice"` + Score float64 `yaml:"score"` +} + +// LoadFromFile loads and parses a prompt file from the given path +func LoadFromFile(filePath string) (*File, error) { + data, err := os.ReadFile(filePath) + if err != nil { + return nil, err + } + + var promptFile File + if err := yaml.Unmarshal(data, &promptFile); err != nil { + return nil, err + } + + return &promptFile, nil +} + +// TemplateString templates a string with the given data using Go's text/template +func TemplateString(templateStr string, data interface{}) (string, error) { + tmpl, err := template.New("template").Option("missingkey=zero").Parse(templateStr) + if err != nil { + return "", err + } + + var result strings.Builder + if err := tmpl.Execute(&result, data); err != nil { + return "", err + } + + // Replace "" with empty string for missing template variables + resultStr := result.String() + resultStr = strings.ReplaceAll(resultStr, "", "") + + return resultStr, nil +} diff --git a/pkg/prompt/prompt_test.go b/pkg/prompt/prompt_test.go new file mode 100644 index 00000000..37c419c3 --- /dev/null +++ b/pkg/prompt/prompt_test.go @@ -0,0 +1,94 @@ +package prompt + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPromptFile(t *testing.T) { + t.Run("loads and parses prompt file", func(t *testing.T) { + const yamlBody = ` +name: Test Prompt +description: A test prompt file +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 100 +messages: + - role: system + content: You are a helpful assistant. + - role: user + content: "Hello {{.name}}" +testData: + - name: "Alice" + - name: "Bob" +evaluators: + - name: contains-greeting + string: + contains: "hello" +` + + tmpDir := t.TempDir() + promptFilePath := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFilePath, []byte(yamlBody), 0644) + require.NoError(t, err) + + promptFile, err := LoadFromFile(promptFilePath) + require.NoError(t, err) + require.Equal(t, "Test Prompt", promptFile.Name) + require.Equal(t, "A test prompt file", promptFile.Description) + require.Equal(t, "openai/gpt-4o", promptFile.Model) + require.Equal(t, 0.5, *promptFile.ModelParameters.Temperature) + require.Equal(t, 100, *promptFile.ModelParameters.MaxTokens) + require.Len(t, promptFile.Messages, 2) + require.Equal(t, "system", promptFile.Messages[0].Role) + require.Equal(t, "You are a helpful assistant.", promptFile.Messages[0].Content) + require.Equal(t, "user", promptFile.Messages[1].Role) + require.Equal(t, "Hello {{.name}}", promptFile.Messages[1].Content) + require.Len(t, promptFile.TestData, 2) + require.Equal(t, "Alice", promptFile.TestData[0]["name"]) + require.Equal(t, "Bob", promptFile.TestData[1]["name"]) + require.Len(t, promptFile.Evaluators, 1) + require.Equal(t, "contains-greeting", promptFile.Evaluators[0].Name) + require.Equal(t, "hello", promptFile.Evaluators[0].String.Contains) + }) + + t.Run("templates messages correctly", func(t *testing.T) { + testData := map[string]interface{}{ + "name": "World", + "age": 25, + } + + result, err := TemplateString("Hello {{.name}}, you are {{.age}} years old", testData) + require.NoError(t, err) + require.Equal(t, "Hello World, you are 25 years old", result) + }) + + t.Run("handles missing template variables", func(t *testing.T) { + testData := map[string]interface{}{ + "name": "World", + } + + result, err := TemplateString("Hello {{.name}}, you are {{.missing}} years old", testData) + require.NoError(t, err) + require.Equal(t, "Hello World, you are years old", result) + }) + + t.Run("handles file not found", func(t *testing.T) { + _, err := LoadFromFile("/nonexistent/file.yml") + require.Error(t, err) + }) + + t.Run("handles invalid YAML", func(t *testing.T) { + tmpDir := t.TempDir() + promptFilePath := filepath.Join(tmpDir, "invalid.prompt.yml") + err := os.WriteFile(promptFilePath, []byte("invalid: yaml: content: ["), 0644) + require.NoError(t, err) + + _, err = LoadFromFile(promptFilePath) + require.Error(t, err) + }) +} diff --git a/sample_prompt.yml b/sample_prompt.yml new file mode 100644 index 00000000..f17b1afc --- /dev/null +++ b/sample_prompt.yml @@ -0,0 +1,22 @@ +name: Sample Evaluation +description: A sample evaluation for testing the eval command +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 50 +testData: + - input: 'hello world' + expected: 'greeting response' + - input: 'goodbye world' + expected: 'farewell response' +messages: + - role: system + content: You are a helpful assistant that responds to greetings and farewells. + - role: user + content: 'Please respond to this message appropriately: {{.input}}' +evaluators: + - name: string evaluator + string: + contains: world + - name: similarity check + uses: github/similarity From 223d72412a8bb633a93783fd480f59c26bc85c12 Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 07:28:23 +0000 Subject: [PATCH 2/9] Fixup tests --- cmd/run/run_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/run/run_test.go b/cmd/run/run_test.go index 7395e7cd..b400f4bc 100644 --- a/cmd/run/run_test.go +++ b/cmd/run/run_test.go @@ -149,7 +149,7 @@ messages: require.Contains(t, out.String(), reply) // response streamed to output }) - t.Run("--file with {{input}} placeholder is substituted with initial prompt and stdin", func(t *testing.T) { + t.Run("--file with {{.input}} placeholder is substituted with initial prompt and stdin", func(t *testing.T) { const yamlBody = ` name: Summarizer description: Summarizes input text @@ -158,7 +158,7 @@ messages: - role: system content: You are a text summarizer. - role: user - content: "{{input}}" + content: "{{.input}}" ` tmp, err := os.CreateTemp(t.TempDir(), "*.prompt.yml") @@ -222,7 +222,7 @@ messages: require.Len(t, capturedReq.Messages, 2) require.Equal(t, "You are a text summarizer.", *capturedReq.Messages[0].Content) - require.Equal(t, initialPrompt+"\n"+piped, *capturedReq.Messages[1].Content) // {{input}} -> "Please summarize the provided text.\nHello there!" + require.Equal(t, initialPrompt+"\n"+piped, *capturedReq.Messages[1].Content) // {{.input}} -> "Please summarize the provided text.\nHello there!" require.Contains(t, out.String(), reply) }) From b0fd466f3715624c6e1686c9f7d21943bb70796c Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 07:53:30 +0000 Subject: [PATCH 3/9] Revert change to templating syntax --- cmd/eval/eval_test.go | 6 +++--- cmd/run/run_test.go | 6 +++--- pkg/prompt/prompt.go | 40 ++++++++++++++++++++++++++------------- pkg/prompt/prompt_test.go | 6 +++--- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/cmd/eval/eval_test.go b/cmd/eval/eval_test.go index fbc0cd42..c043d762 100644 --- a/cmd/eval/eval_test.go +++ b/cmd/eval/eval_test.go @@ -62,7 +62,7 @@ evaluators: evalFile := &prompt.File{ Messages: []prompt.Message{ {Role: "system", Content: "You are helpful."}, - {Role: "user", Content: "Process {{.input}} and return {{.expected}}"}, + {Role: "user", Content: "Process {{input}} and return {{expected}}"}, }, } @@ -177,7 +177,7 @@ testData: expected: "test response" messages: - role: user - content: "{{.input}}" + content: "{{input}}" evaluators: - name: contains-test string: @@ -233,7 +233,7 @@ testData: expected: "expected but not returned" messages: - role: user - content: "{{.input}}" + content: "{{input}}" evaluators: - name: contains-nonexistent string: diff --git a/cmd/run/run_test.go b/cmd/run/run_test.go index b400f4bc..7395e7cd 100644 --- a/cmd/run/run_test.go +++ b/cmd/run/run_test.go @@ -149,7 +149,7 @@ messages: require.Contains(t, out.String(), reply) // response streamed to output }) - t.Run("--file with {{.input}} placeholder is substituted with initial prompt and stdin", func(t *testing.T) { + t.Run("--file with {{input}} placeholder is substituted with initial prompt and stdin", func(t *testing.T) { const yamlBody = ` name: Summarizer description: Summarizes input text @@ -158,7 +158,7 @@ messages: - role: system content: You are a text summarizer. - role: user - content: "{{.input}}" + content: "{{input}}" ` tmp, err := os.CreateTemp(t.TempDir(), "*.prompt.yml") @@ -222,7 +222,7 @@ messages: require.Len(t, capturedReq.Messages, 2) require.Equal(t, "You are a text summarizer.", *capturedReq.Messages[0].Content) - require.Equal(t, initialPrompt+"\n"+piped, *capturedReq.Messages[1].Content) // {{.input}} -> "Please summarize the provided text.\nHello there!" + require.Equal(t, initialPrompt+"\n"+piped, *capturedReq.Messages[1].Content) // {{input}} -> "Please summarize the provided text.\nHello there!" require.Contains(t, out.String(), reply) }) diff --git a/pkg/prompt/prompt.go b/pkg/prompt/prompt.go index 0fba1c3d..f13d1968 100644 --- a/pkg/prompt/prompt.go +++ b/pkg/prompt/prompt.go @@ -2,9 +2,9 @@ package prompt import ( + "fmt" "os" "strings" - "text/template" "gopkg.in/yaml.v3" ) @@ -79,21 +79,35 @@ func LoadFromFile(filePath string) (*File, error) { return &promptFile, nil } -// TemplateString templates a string with the given data using Go's text/template +// TemplateString templates a string with the given data using simple {{variable}} replacement func TemplateString(templateStr string, data interface{}) (string, error) { - tmpl, err := template.New("template").Option("missingkey=zero").Parse(templateStr) - if err != nil { - return "", err + result := templateStr + + // Convert data to map[string]interface{} if it's not already + var dataMap map[string]interface{} + switch d := data.(type) { + case map[string]interface{}: + dataMap = d + case map[string]string: + dataMap = make(map[string]interface{}) + for k, v := range d { + dataMap[k] = v + } + default: + // If it's not a map, we can't template it + return result, nil } - var result strings.Builder - if err := tmpl.Execute(&result, data); err != nil { - return "", err + // Replace all {{variable}} patterns with values from the data map + for key, value := range dataMap { + placeholder := "{{" + key + "}}" + if valueStr, ok := value.(string); ok { + result = strings.ReplaceAll(result, placeholder, valueStr) + } else { + // Convert non-string values to string + result = strings.ReplaceAll(result, placeholder, fmt.Sprintf("%v", value)) + } } - // Replace "" with empty string for missing template variables - resultStr := result.String() - resultStr = strings.ReplaceAll(resultStr, "", "") - - return resultStr, nil + return result, nil } diff --git a/pkg/prompt/prompt_test.go b/pkg/prompt/prompt_test.go index 37c419c3..2ae7db33 100644 --- a/pkg/prompt/prompt_test.go +++ b/pkg/prompt/prompt_test.go @@ -62,7 +62,7 @@ evaluators: "age": 25, } - result, err := TemplateString("Hello {{.name}}, you are {{.age}} years old", testData) + result, err := TemplateString("Hello {{name}}, you are {{age}} years old", testData) require.NoError(t, err) require.Equal(t, "Hello World, you are 25 years old", result) }) @@ -72,9 +72,9 @@ evaluators: "name": "World", } - result, err := TemplateString("Hello {{.name}}, you are {{.missing}} years old", testData) + result, err := TemplateString("Hello {{name}}, you are {{missing}} years old", testData) require.NoError(t, err) - require.Equal(t, "Hello World, you are years old", result) + require.Equal(t, "Hello World, you are {{missing}} years old", result) }) t.Run("handles file not found", func(t *testing.T) { From 498dcddac548c1e15ef535cebf7f2bfa52a6db2e Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 08:02:06 +0000 Subject: [PATCH 4/9] Fixup syntax --- .../failing_test_prompt.yml | 2 +- sample_prompt.yml => fixtures/sample_prompt.yml | 2 +- pkg/prompt/prompt_test.go | 4 ++-- s.prompt.yml | 14 -------------- 4 files changed, 4 insertions(+), 18 deletions(-) rename failing_test_prompt.yml => fixtures/failing_test_prompt.yml (95%) rename sample_prompt.yml => fixtures/sample_prompt.yml (88%) delete mode 100644 s.prompt.yml diff --git a/failing_test_prompt.yml b/fixtures/failing_test_prompt.yml similarity index 95% rename from failing_test_prompt.yml rename to fixtures/failing_test_prompt.yml index 9ae76d6a..652f599c 100644 --- a/failing_test_prompt.yml +++ b/fixtures/failing_test_prompt.yml @@ -13,7 +13,7 @@ messages: - role: system content: You are a helpful assistant. - role: user - content: "{{.input}}" + content: "{{input}}" evaluators: - name: contains-impossible string: diff --git a/sample_prompt.yml b/fixtures/sample_prompt.yml similarity index 88% rename from sample_prompt.yml rename to fixtures/sample_prompt.yml index f17b1afc..342b4c81 100644 --- a/sample_prompt.yml +++ b/fixtures/sample_prompt.yml @@ -13,7 +13,7 @@ messages: - role: system content: You are a helpful assistant that responds to greetings and farewells. - role: user - content: 'Please respond to this message appropriately: {{.input}}' + content: 'Please respond to this message appropriately: {{input}}' evaluators: - name: string evaluator string: diff --git a/pkg/prompt/prompt_test.go b/pkg/prompt/prompt_test.go index 2ae7db33..a6ef1264 100644 --- a/pkg/prompt/prompt_test.go +++ b/pkg/prompt/prompt_test.go @@ -21,7 +21,7 @@ messages: - role: system content: You are a helpful assistant. - role: user - content: "Hello {{.name}}" + content: "Hello {{name}}" testData: - name: "Alice" - name: "Bob" @@ -47,7 +47,7 @@ evaluators: require.Equal(t, "system", promptFile.Messages[0].Role) require.Equal(t, "You are a helpful assistant.", promptFile.Messages[0].Content) require.Equal(t, "user", promptFile.Messages[1].Role) - require.Equal(t, "Hello {{.name}}", promptFile.Messages[1].Content) + require.Equal(t, "Hello {{name}}", promptFile.Messages[1].Content) require.Len(t, promptFile.TestData, 2) require.Equal(t, "Alice", promptFile.TestData[0]["name"]) require.Equal(t, "Bob", promptFile.TestData[1]["name"]) diff --git a/s.prompt.yml b/s.prompt.yml deleted file mode 100644 index b8b577f2..00000000 --- a/s.prompt.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: Text Summarizer -description: Summarizes input text concisely -model: openai/gpt-4o-mini -modelParameters: - temperature: 0.5 -messages: - - role: system - content: You are a text summarizer. Your only job is to summarize text given to you. - - role: user - content: | - Summarize the given text, beginning with "Summary -": - - {{input}} - \ No newline at end of file From f936d04cb27a32615d46d582e92ed2614fab5e04 Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 08:09:20 +0000 Subject: [PATCH 5/9] Guard against nil test cases --- cmd/eval/eval.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cmd/eval/eval.go b/cmd/eval/eval.go index 1e4cd9f7..b26b6ea8 100644 --- a/cmd/eval/eval.go +++ b/cmd/eval/eval.go @@ -101,8 +101,7 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) { return nil, fmt.Errorf("failed to load prompt file: %w", err) } - // Debug output - fmt.Printf("DEBUG: Loaded file with name='%s', model='%s', testData count=%d\n", + fmt.Printf("Loaded file with name='%s', model='%s', testData count=%d\n", evalFile.Name, evalFile.Model, len(evalFile.TestData)) return evalFile, nil @@ -164,8 +163,12 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error { // Summary h.cfg.WriteToOut("Evaluation Summary:\n") - h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n", - passedTests, totalTests, float64(passedTests)/float64(totalTests)*100)) + if totalTests == 0 { + h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n") + } else { + h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n", + passedTests, totalTests, float64(passedTests)/float64(totalTests)*100)) + } if passedTests == totalTests { h.cfg.WriteToOut("🎉 All tests passed!\n") From cbad2a07ab756458857ec59d83d5f5a1afbe0f05 Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 08:37:04 +0000 Subject: [PATCH 6/9] Implement proper builtin evaluators --- cmd/eval/builtins.go | 386 ++++++++++++++++++++ cmd/eval/eval.go | 31 +- cmd/eval/eval_test.go | 40 +- cmd/eval/similarity.go | 84 ----- cmd/eval/similarity_test.go | 145 -------- evaluators.tmp | 565 +++++++++++++++++++++++++++++ fixtures/test_builtins.yml | 25 ++ fixtures/test_single_evaluator.yml | 12 + test_builtins.yml | 25 ++ test_evaluators.go | 23 ++ 10 files changed, 1080 insertions(+), 256 deletions(-) create mode 100644 cmd/eval/builtins.go delete mode 100644 cmd/eval/similarity.go delete mode 100644 cmd/eval/similarity_test.go create mode 100644 evaluators.tmp create mode 100644 fixtures/test_builtins.yml create mode 100644 fixtures/test_single_evaluator.yml create mode 100644 test_builtins.yml create mode 100644 test_evaluators.go diff --git a/cmd/eval/builtins.go b/cmd/eval/builtins.go new file mode 100644 index 00000000..0ee566d2 --- /dev/null +++ b/cmd/eval/builtins.go @@ -0,0 +1,386 @@ +package eval + +import "github.com/github/gh-models/pkg/prompt" + +// BuiltInEvaluators contains pre-configured LLM-based evaluators, taken from https://github.com/microsoft/promptflow +var BuiltInEvaluators = map[string]prompt.LLMEvaluator{ + "similarity": { + ModelID: "openai/gpt-4o", + SystemPrompt: "You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.", + Prompt: `Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +One star: the predicted answer is not at all similar to the correct answer +Two stars: the predicted answer is mostly not similar to the correct answer +Three stars: the predicted answer is somewhat similar to the correct answer +Four stars: the predicted answer is mostly similar to the correct answer +Five stars: the predicted answer is completely similar to the correct answer + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. + +question: What is the role of ribosomes? +correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. +predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. +stars: 1 + +question: Why did the Titanic sink? +correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. +predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. +stars: 2 + +question: What causes seasons on Earth? +correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. +predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. +stars: 3 + +question: How does photosynthesis work? +correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. +predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. +stars: 4 + +question: What are the health benefits of regular exercise? +correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. +predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. +stars: 5 + +question: {{input}} +correct answer: {{expected}} +predicted answer: {{completion}} +stars:`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "coherence": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include a QUERY and a RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas. + +# Ratings +## [Coherence: 1] (Incoherent Response) +**Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible. + +**Examples:** + **Query:** What are the benefits of renewable energy? + **Response:** Wind sun green jump apple silence over. + + **Query:** Explain the process of photosynthesis. + **Response:** Plants light water flying blue music. + +## [Coherence: 2] (Poorly Coherent Response) +**Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand. + +**Examples:** + **Query:** How does vaccination work? + **Response:** Vaccines protect disease. Immune system fight. Health better. + + **Query:** Describe how a bill becomes a law. + **Response:** Idea proposed. Congress discuss vote. President signs. + +## [Coherence: 3] (Partially Coherent Response) +**Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order. + +**Examples:** + **Query:** What causes earthquakes? + **Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage. + + **Query:** Explain the importance of the water cycle. + **Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water. + +## [Coherence: 4] (Coherent Response) +**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow. + +**Examples:** + **Query:** What is the water cycle and how does it work? + **Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally. + + **Query:** Describe the role of mitochondria in cellular function. + **Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival. + +## [Coherence: 5] (Highly Coherent Response) +**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision. + +**Examples:** + **Query:** Analyze the economic impacts of climate change on coastal cities. + **Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth. + + **Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy. + **Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs. + +# Data +QUERY: {{input}} +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "fluency": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include a RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Fluency** refers to the effectiveness and clarity of written communication, focusing on grammatical accuracy, vocabulary range, sentence complexity, coherence, and overall readability. It assesses how smoothly ideas are conveyed and how easily the text can be understood by the reader. + +# Ratings +## [Fluency: 1] (Emergent Fluency) +**Definition:** The response shows minimal command of the language. It contains pervasive grammatical errors, extremely limited vocabulary, and fragmented or incoherent sentences. The message is largely incomprehensible, making understanding very difficult. + +**Examples:** + **Response:** Free time I. Go park. Not fun. Alone. + + **Response:** Like food pizza. Good cheese eat. + +## [Fluency: 2] (Basic Fluency) +**Definition:** The response communicates simple ideas but has frequent grammatical errors and limited vocabulary. Sentences are short and may be improperly constructed, leading to partial understanding. Repetition and awkward phrasing are common. + +**Examples:** + **Response:** I like play soccer. I watch movie. It fun. + + **Response:** My town small. Many people. We have market. + +## [Fluency: 3] (Competent Fluency) +**Definition:** The response clearly conveys ideas with occasional grammatical errors. Vocabulary is adequate but not extensive. Sentences are generally correct but may lack complexity and variety. The text is coherent, and the message is easily understood with minimal effort. + +**Examples:** + **Response:** I'm planning to visit friends and maybe see a movie together. + + **Response:** I try to eat healthy food and exercise regularly by jogging. + +## [Fluency: 4] (Proficient Fluency) +**Definition:** The response is well-articulated with good control of grammar and a varied vocabulary. Sentences are complex and well-structured, demonstrating coherence and cohesion. Minor errors may occur but do not affect overall understanding. The text flows smoothly, and ideas are connected logically. + +**Examples:** + **Response:** My interest in mathematics and problem-solving inspired me to become an engineer, as I enjoy designing solutions that improve people's lives. + + **Response:** Environmental conservation is crucial because it protects ecosystems, preserves biodiversity, and ensures natural resources are available for future generations. + +## [Fluency: 5] (Exceptional Fluency) +**Definition:** The response demonstrates an exceptional command of language with sophisticated vocabulary and complex, varied sentence structures. It is coherent, cohesive, and engaging, with precise and nuanced expression. Grammar is flawless, and the text reflects a high level of eloquence and style. + +**Examples:** + **Response:** Globalization exerts a profound influence on cultural diversity by facilitating unprecedented cultural exchange while simultaneously risking the homogenization of distinct cultural identities, which can diminish the richness of global heritage. + + **Response:** Technology revolutionizes modern education by providing interactive learning platforms, enabling personalized learning experiences, and connecting students worldwide, thereby transforming how knowledge is acquired and shared. + +# Data +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "relevance": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include QUERY and RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information. + +# Ratings +## [Relevance: 1] (Irrelevant Response) +**Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed. + +**Examples:** + **Query:** What is the team preparing for? + **Response:** I went grocery shopping yesterday evening. + + **Query:** When will the company's new product line launch? + **Response:** International travel can be very rewarding and educational. + +## [Relevance: 2] (Incorrect Response) +**Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information. + +**Examples:** + **Query:** When was the merger between the two firms finalized? + **Response:** The merger was finalized on April 10th. + + **Query:** Where and when will the solar eclipse be visible? + **Response:** The solar eclipse will be visible in Asia on December 14th. + +## [Relevance: 3] (Incomplete Response) +**Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The restaurant offers Italian food like pasta. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy and climate change. + +## [Relevance: 4] (Complete Response) +**Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy, climate change, and sustainability practices. + +## [Relevance: 5] (Comprehensive Response with Insights) +**Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues. + +# Data +QUERY: {{input}} +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "groundedness": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include CONTEXT, QUERY, and RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information. The scale ranges from 1 to 5, with higher numbers indicating greater groundedness. + +# Ratings +## [Groundedness: 1] (Completely Unrelated Response) +**Definition:** An answer that does not relate to the question or the context in any way. It fails to address the topic, provides irrelevant information, or introduces completely unrelated subjects. + +**Examples:** + **Context:** The company's annual meeting will be held next Thursday. + **Query:** When is the company's annual meeting? + **Response:** I enjoy hiking in the mountains during summer. + + **Context:** The new policy aims to reduce carbon emissions by 20% over the next five years. + **Query:** What is the goal of the new policy? + **Response:** My favorite color is blue. + +## [Groundedness: 2] (Related Topic but Does Not Respond to the Query) +**Definition:** An answer that relates to the general topic of the context but does not answer the specific question asked. It may mention concepts from the context but fails to provide a direct or relevant response. + +**Examples:** + **Context:** The museum will exhibit modern art pieces from various local artists. + **Query:** What kind of art will be exhibited at the museum? + **Response:** Museums are important cultural institutions. + + **Context:** The new software update improves battery life and performance. + **Query:** What does the new software update improve? + **Response:** Software updates can sometimes fix bugs. + +## [Groundedness: 3] (Attempts to Respond but Contains Incorrect Information) +**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. + +**Examples:** + **Context:** The festival starts on June 5th and features international musicians. + **Query:** When does the festival start? + **Response:** The festival starts on July 5th and features local artists. + + **Context:** The recipe requires two eggs and one cup of milk. + **Query:** How many eggs are needed for the recipe? + **Response:** You need three eggs for the recipe. + +## [Groundedness: 4] (Partially Correct Response) +**Definition:** An answer that provides a correct response to the question but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding. + +**Examples:** + **Context:** The bookstore offers a 15% discount to students and a 10% discount to senior citizens. + **Query:** What discount does the bookstore offer to students? + **Response:** Students get a discount at the bookstore. + + **Context:** The company's headquarters are located in Berlin, Germany. + **Query:** Where are the company's headquarters? + **Response:** The company's headquarters are in Germany. + +## [Groundedness: 5] (Fully Correct and Complete Response) +**Definition:** An answer that thoroughly and accurately responds to the question, including all relevant details from the context. It directly addresses the question with precise information, demonstrating complete understanding without adding extraneous information. + +**Examples:** + **Context:** The author released her latest novel, 'The Silent Echo', on September 1st. + **Query:** When was 'The Silent Echo' released? + **Response:** 'The Silent Echo' was released on September 1st. + + **Context:** Participants must register by May 31st to be eligible for early bird pricing. + **Query:** By what date must participants register to receive early bird pricing? + **Response:** Participants must register by May 31st to receive early bird pricing. + +# Data +CONTEXT: {{expected}} +QUERY: {{input}} +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, +} diff --git a/cmd/eval/eval.go b/cmd/eval/eval.go index b26b6ea8..22f6816f 100644 --- a/cmd/eval/eval.go +++ b/cmd/eval/eval.go @@ -61,7 +61,7 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command { - name: contains-hello string: contains: "hello" - `, "`"), + `), Example: "gh models eval prompt.yml", Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { @@ -75,10 +75,9 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command { // Run evaluation handler := &evalCommandHandler{ - cfg: cfg, - client: cfg.Client, - evalFile: evalFile, - similarityEvaluator: NewSimilarityEvaluator(), + cfg: cfg, + client: cfg.Client, + evalFile: evalFile, } return handler.runEvaluation(cmd.Context()) @@ -89,10 +88,9 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command { } type evalCommandHandler struct { - cfg *command.Config - client azuremodels.Client - evalFile *EvaluationPromptFile - similarityEvaluator *SimilarityEvaluator + cfg *command.Config + client azuremodels.Client + evalFile *EvaluationPromptFile } func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) { @@ -428,19 +426,18 @@ func (h *evalCommandHandler) runLLMEvaluator(ctx context.Context, name string, e } func (h *evalCommandHandler) runPluginEvaluator(ctx context.Context, name, plugin string, testCase map[string]interface{}, response string) (EvaluationResult, error) { - // For now, we'll implement basic support for github/similarity - if plugin == "github/similarity" { - return h.runSimilarityEvaluator(name, testCase, response) + // Handle built-in evaluators like github/similarity, github/coherence, etc. + if strings.HasPrefix(plugin, "github/") { + evaluatorName := strings.TrimPrefix(plugin, "github/") + if builtinEvaluator, exists := BuiltInEvaluators[evaluatorName]; exists { + return h.runLLMEvaluator(ctx, name, builtinEvaluator, testCase, response) + } } return EvaluationResult{ EvaluatorName: name, Score: 0.0, Passed: false, - Details: fmt.Sprintf("Plugin evaluator '%s' not yet implemented", plugin), + Details: fmt.Sprintf("Plugin evaluator '%s' not found", plugin), }, nil } - -func (h *evalCommandHandler) runSimilarityEvaluator(name string, testCase map[string]interface{}, response string) (EvaluationResult, error) { - return h.similarityEvaluator.Evaluate(name, testCase, response) -} diff --git a/cmd/eval/eval_test.go b/cmd/eval/eval_test.go index c043d762..caca2d04 100644 --- a/cmd/eval/eval_test.go +++ b/cmd/eval/eval_test.go @@ -140,21 +140,41 @@ evaluators: } }) - t.Run("similarity evaluator works", func(t *testing.T) { - handler := &evalCommandHandler{} + t.Run("plugin evaluator works with github/similarity", func(t *testing.T) { + out := new(bytes.Buffer) + client := azuremodels.NewMockClient() + cfg := command.NewConfig(out, out, client, true, 100) + + // Mock a response that returns "4" for the LLM evaluator + client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) { + reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{ + { + Choices: []azuremodels.ChatChoice{ + { + Message: &azuremodels.ChatChoiceMessage{ + Content: func() *string { s := "4"; return &s }(), + }, + }, + }, + }, + }) + return &azuremodels.ChatCompletionResponse{Reader: reader}, nil + } + + handler := &evalCommandHandler{ + cfg: cfg, + client: client, + } testCase := map[string]interface{}{ - "expected": "hello world", + "input": "test question", + "expected": "test answer", } - result, err := handler.runSimilarityEvaluator("similarity", testCase, "hello world") + result, err := handler.runPluginEvaluator(context.Background(), "similarity", "github/similarity", testCase, "test response") require.NoError(t, err) + require.Equal(t, "similarity", result.EvaluatorName) + require.Equal(t, 0.75, result.Score) // Score for choice "4" require.True(t, result.Passed) - require.Equal(t, 1.0, result.Score) - - result, err = handler.runSimilarityEvaluator("similarity", testCase, "completely different text") - require.NoError(t, err) - require.False(t, result.Passed) - require.True(t, result.Score < 0.7) }) t.Run("command creation works", func(t *testing.T) { diff --git a/cmd/eval/similarity.go b/cmd/eval/similarity.go deleted file mode 100644 index d2d2195b..00000000 --- a/cmd/eval/similarity.go +++ /dev/null @@ -1,84 +0,0 @@ -package eval - -import ( - "fmt" - "strings" -) - -// SimilarityEvaluator handles similarity-based evaluation -type SimilarityEvaluator struct{} - -// NewSimilarityEvaluator creates a new similarity evaluator -func NewSimilarityEvaluator() *SimilarityEvaluator { - return &SimilarityEvaluator{} -} - -// Evaluate runs similarity evaluation between expected and actual values -func (s *SimilarityEvaluator) Evaluate(name string, testCase map[string]interface{}, response string) (EvaluationResult, error) { - // Simple similarity check using expected value if present - expected, ok := testCase["expected"] - if !ok { - return EvaluationResult{ - EvaluatorName: name, - Score: 0.0, - Passed: false, - Details: "No 'expected' value found in test case for similarity evaluation", - }, nil - } - - expectedStr, ok := expected.(string) - if !ok { - return EvaluationResult{ - EvaluatorName: name, - Score: 0.0, - Passed: false, - Details: "Expected value is not a string", - }, nil - } - - // Simple similarity metric (could be enhanced with more sophisticated algorithms) - similarity := s.calculateSimpleSimilarity(expectedStr, response) - passed := similarity > 0.4 // 40% similarity threshold - - return EvaluationResult{ - EvaluatorName: name, - Score: similarity, - Passed: passed, - Details: fmt.Sprintf("Similarity score: %.2f (threshold: 0.4)", similarity), - }, nil -} - -// calculateSimpleSimilarity computes a simple word-based similarity score -func (s *SimilarityEvaluator) calculateSimpleSimilarity(expected, actual string) float64 { - // Simple word-based similarity - expectedWords := strings.Fields(strings.ToLower(expected)) - actualWords := strings.Fields(strings.ToLower(actual)) - - if len(expectedWords) == 0 && len(actualWords) == 0 { - return 1.0 - } - if len(expectedWords) == 0 || len(actualWords) == 0 { - return 0.0 - } - - // Count matching words - expectedWordSet := make(map[string]bool) - for _, word := range expectedWords { - expectedWordSet[word] = true - } - - matchingWords := 0 - for _, word := range actualWords { - if expectedWordSet[word] { - matchingWords++ - } - } - - // Jaccard similarity - totalWords := len(expectedWords) + len(actualWords) - matchingWords - if totalWords == 0 { - return 1.0 - } - - return float64(matchingWords) / float64(totalWords) -} diff --git a/cmd/eval/similarity_test.go b/cmd/eval/similarity_test.go deleted file mode 100644 index d841f0d8..00000000 --- a/cmd/eval/similarity_test.go +++ /dev/null @@ -1,145 +0,0 @@ -package eval - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestSimilarityEvaluator(t *testing.T) { - evaluator := NewSimilarityEvaluator() - - t.Run("exact match", func(t *testing.T) { - testCase := map[string]interface{}{ - "expected": "hello world", - } - - result, err := evaluator.Evaluate("similarity", testCase, "hello world") - require.NoError(t, err) - require.True(t, result.Passed) - require.Equal(t, 1.0, result.Score) - require.Contains(t, result.Details, "1.00") - }) - - t.Run("high similarity", func(t *testing.T) { - testCase := map[string]interface{}{ - "expected": "hello world test", - } - - result, err := evaluator.Evaluate("similarity", testCase, "hello world example") - require.NoError(t, err) - require.True(t, result.Passed) - require.True(t, result.Score > 0.4) - }) - - t.Run("low similarity", func(t *testing.T) { - testCase := map[string]interface{}{ - "expected": "hello world", - } - - result, err := evaluator.Evaluate("similarity", testCase, "completely different text") - require.NoError(t, err) - require.False(t, result.Passed) - require.True(t, result.Score < 0.7) - }) - - t.Run("missing expected value", func(t *testing.T) { - testCase := map[string]interface{}{} - - result, err := evaluator.Evaluate("similarity", testCase, "hello world") - require.NoError(t, err) - require.False(t, result.Passed) - require.Equal(t, 0.0, result.Score) - require.Contains(t, result.Details, "No 'expected' value found") - }) - - t.Run("non-string expected value", func(t *testing.T) { - testCase := map[string]interface{}{ - "expected": 123, - } - - result, err := evaluator.Evaluate("similarity", testCase, "hello world") - require.NoError(t, err) - require.False(t, result.Passed) - require.Equal(t, 0.0, result.Score) - require.Contains(t, result.Details, "Expected value is not a string") - }) - - t.Run("empty strings", func(t *testing.T) { - testCase := map[string]interface{}{ - "expected": "", - } - - result, err := evaluator.Evaluate("similarity", testCase, "") - require.NoError(t, err) - require.True(t, result.Passed) - require.Equal(t, 1.0, result.Score) - }) - - t.Run("case insensitive", func(t *testing.T) { - testCase := map[string]interface{}{ - "expected": "Hello World", - } - - result, err := evaluator.Evaluate("similarity", testCase, "hello world") - require.NoError(t, err) - require.True(t, result.Passed) - require.Equal(t, 1.0, result.Score) - }) -} - -func TestCalculateSimpleSimilarity(t *testing.T) { - evaluator := NewSimilarityEvaluator() - - tests := []struct { - name string - expected string - actual string - minScore float64 - maxScore float64 - }{ - { - name: "identical strings", - expected: "hello world", - actual: "hello world", - minScore: 1.0, - maxScore: 1.0, - }, - { - name: "partial overlap", - expected: "hello world test", - actual: "hello world example", - minScore: 0.4, - maxScore: 0.6, - }, - { - name: "no overlap", - expected: "hello world", - actual: "foo bar", - minScore: 0.0, - maxScore: 0.0, - }, - { - name: "empty strings", - expected: "", - actual: "", - minScore: 1.0, - maxScore: 1.0, - }, - { - name: "one empty", - expected: "hello", - actual: "", - minScore: 0.0, - maxScore: 0.0, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - score := evaluator.calculateSimpleSimilarity(tt.expected, tt.actual) - require.GreaterOrEqual(t, score, tt.minScore) - require.LessOrEqual(t, score, tt.maxScore) - }) - } -} diff --git a/evaluators.tmp b/evaluators.tmp new file mode 100644 index 00000000..1b64fdb8 --- /dev/null +++ b/evaluators.tmp @@ -0,0 +1,565 @@ + +import {VariableCompletion, VariablePrompt} from '../../../variables' +import type {EvaluatorCfg} from '../../config' + +// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty +export const RelevanceEvaluator: EvaluatorCfg = { + name: 'Relevance', + llm: { + model: 'gpt-4o', + modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', + modelParameters: { + temperature: 0.0, + max_tokens: 800, + top_p: 1.0, + presence_penalty: 0, + frequency_penalty: 0, + response_format: 'text', + }, + systemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include QUERY and RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + // The prompt is slightly modified, originally this also included reason and score: + // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. + // # Output`, + // + prompt: `# Definition +**Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information. + +# Ratings +## [Relevance: 1] (Irrelevant Response) +**Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed. + +**Examples:** + **Query:** What is the team preparing for? + **Response:** I went grocery shopping yesterday evening. + + **Query:** When will the company's new product line launch? + **Response:** International travel can be very rewarding and educational. + +## [Relevance: 2] (Incorrect Response) +**Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information. + +**Examples:** + **Query:** When was the merger between the two firms finalized? + **Response:** The merger was finalized on April 10th. + + **Query:** Where and when will the solar eclipse be visible? + **Response:** The solar eclipse will be visible in Asia on December 14th. + +## [Relevance: 3] (Incomplete Response) +**Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The restaurant offers Italian food like pasta. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy and climate change. + +## [Relevance: 4] (Complete Response) +**Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy, climate change, and sustainability practices. + +## [Relevance: 5] (Comprehensive Response with Insights) +**Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues. + + + +# Data +QUERY: {{${VariablePrompt}}} +RESPONSE: {{${VariableCompletion}}} + + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + choices: [ + { + choice: '1', + score: 0, + }, + { + choice: '2', + score: 0.25, + }, + { + choice: '3', + score: 0.5, + }, + { + choice: '4', + score: 0.75, + }, + { + choice: '5', + score: 1.0, + }, + ], + }, +} + + + +import {VariableCompletion, VariableExpected, VariablePrompt} from '../../../variables' +import type {EvaluatorCfg} from '../../config' + +// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty#L0-L1 +export const GroundednessEvaluator: EvaluatorCfg = { + name: 'Groundedness', + llm: { + model: 'gpt-4o', + modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', + modelParameters: { + temperature: 0.0, + max_tokens: 800, + top_p: 1.0, + presence_penalty: 0, + frequency_penalty: 0, + response_format: 'text', + }, + systemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include CONTEXT, QUERY, and RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + // The prompt is slightly modified, originally this also included reason and score: + // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. + // # Output + prompt: `# Definition +**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information. The scale ranges from 1 to 5, with higher numbers indicating greater groundedness. + +# Ratings +## [Groundedness: 1] (Completely Unrelated Response) +**Definition:** An answer that does not relate to the question or the context in any way. It fails to address the topic, provides irrelevant information, or introduces completely unrelated subjects. + +**Examples:** + **Context:** The company's annual meeting will be held next Thursday. + **Query:** When is the company's annual meeting? + **Response:** I enjoy hiking in the mountains during summer. + + **Context:** The new policy aims to reduce carbon emissions by 20% over the next five years. + **Query:** What is the goal of the new policy? + **Response:** My favorite color is blue. + +## [Groundedness: 2] (Related Topic but Does Not Respond to the Query) +**Definition:** An answer that relates to the general topic of the context but does not answer the specific question asked. It may mention concepts from the context but fails to provide a direct or relevant response. + +**Examples:** + **Context:** The museum will exhibit modern art pieces from various local artists. + **Query:** What kind of art will be exhibited at the museum? + **Response:** Museums are important cultural institutions. + + **Context:** The new software update improves battery life and performance. + **Query:** What does the new software update improve? + **Response:** Software updates can sometimes fix bugs. + +## [Groundedness: 3] (Attempts to Respond but Contains Incorrect Information) +**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. + +**Examples:** + **Context:** The festival starts on June 5th and features international musicians. + **Query:** When does the festival start? + **Response:** The festival starts on July 5th and features local artists. + + **Context:** The recipe requires two eggs and one cup of milk. + **Query:** How many eggs are needed for the recipe? + **Response:** You need three eggs for the recipe. + +## [Groundedness: 4] (Partially Correct Response) +**Definition:** An answer that provides a correct response to the question but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding. + +**Examples:** + **Context:** The bookstore offers a 15% discount to students and a 10% discount to senior citizens. + **Query:** What discount does the bookstore offer to students? + **Response:** Students get a discount at the bookstore. + + **Context:** The company's headquarters are located in Berlin, Germany. + **Query:** Where are the company's headquarters? + **Response:** The company's headquarters are in Germany. + +## [Groundedness: 5] (Fully Correct and Complete Response) +**Definition:** An answer that thoroughly and accurately responds to the question, including all relevant details from the context. It directly addresses the question with precise information, demonstrating complete understanding without adding extraneous information. + +**Examples:** + **Context:** The author released her latest novel, 'The Silent Echo', on September 1st. + **Query:** When was 'The Silent Echo' released? + **Response:** 'The Silent Echo' was released on September 1st. + + **Context:** Participants must register by May 31st to be eligible for early bird pricing. + **Query:** By what date must participants register to receive early bird pricing? + **Response:** Participants must register by May 31st to receive early bird pricing. + + +# Data +CONTEXT: {{${VariableExpected}}} +QUERY: {{${VariablePrompt}}} +RESPONSE: {{${VariableCompletion}}} + + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + choices: [ + { + choice: '1', + score: 0, + }, + { + choice: '2', + score: 0.25, + }, + { + choice: '3', + score: 0.5, + }, + { + choice: '4', + score: 0.75, + }, + { + choice: '5', + score: 1.0, + }, + ], + }, +} + +import {VariableCompletion, VariableExpected, VariableInput} from '../../../variables' +import type {EvaluatorCfg} from '../../config' + +// From https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty +export const SimilarityEvaluator: EvaluatorCfg = { + name: 'Similarity', + llm: { + model: 'gpt-4o', + modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', + modelParameters: { + temperature: 0.0, + max_tokens: 1, + top_p: 1.0, + presence_penalty: 0, + frequency_penalty: 0, + response_format: 'text', + }, + systemPrompt: `You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.`, + prompt: `Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +One star: the predicted answer is not at all similar to the correct answer +Two stars: the predicted answer is mostly not similar to the correct answer +Three stars: the predicted answer is somewhat similar to the correct answer +Four stars: the predicted answer is mostly similar to the correct answer +Five stars: the predicted answer is completely similar to the correct answer + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. + +question: What is the role of ribosomes? +correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. +predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. +stars: 1 + +question: Why did the Titanic sink? +correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. +predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. +stars: 2 + +question: What causes seasons on Earth? +correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. +predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. +stars: 3 + +question: How does photosynthesis work? +correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. +predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. +stars: 4 + +question: What are the health benefits of regular exercise? +correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. +predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. +stars: 5 + +question: {{${VariableInput}}} +correct answer: {{${VariableExpected}}} +predicted answer: {{${VariableCompletion}}} +stars:`, + choices: [ + { + choice: '1', + score: 0, + }, + { + choice: '2', + score: 0.25, + }, + { + choice: '3', + score: 0.5, + }, + { + choice: '4', + score: 0.75, + }, + { + choice: '5', + score: 1.0, + }, + ], + }, +} + +import {VariableCompletion} from '../../../variables' +import type {EvaluatorCfg} from '../../config' + +// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +export const FluencyEvaluator: EvaluatorCfg = { + name: 'Fluency', + llm: { + model: 'gpt-4o', + modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', + modelParameters: { + temperature: 0.0, + max_tokens: 800, + top_p: 1.0, + presence_penalty: 0, + frequency_penalty: 0, + response_format: 'text', + }, + systemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include a RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + // The prompt is slightly modified, originally this also included reason and score: + // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. + // # Output + prompt: `# Definition +**Fluency** refers to the effectiveness and clarity of written communication, focusing on grammatical accuracy, vocabulary range, sentence complexity, coherence, and overall readability. It assesses how smoothly ideas are conveyed and how easily the text can be understood by the reader. + +# Ratings +## [Fluency: 1] (Emergent Fluency) +**Definition:** The response shows minimal command of the language. It contains pervasive grammatical errors, extremely limited vocabulary, and fragmented or incoherent sentences. The message is largely incomprehensible, making understanding very difficult. + +**Examples:** + **Response:** Free time I. Go park. Not fun. Alone. + + **Response:** Like food pizza. Good cheese eat. + +## [Fluency: 2] (Basic Fluency) +**Definition:** The response communicates simple ideas but has frequent grammatical errors and limited vocabulary. Sentences are short and may be improperly constructed, leading to partial understanding. Repetition and awkward phrasing are common. + +**Examples:** + **Response:** I like play soccer. I watch movie. It fun. + + **Response:** My town small. Many people. We have market. + +## [Fluency: 3] (Competent Fluency) +**Definition:** The response clearly conveys ideas with occasional grammatical errors. Vocabulary is adequate but not extensive. Sentences are generally correct but may lack complexity and variety. The text is coherent, and the message is easily understood with minimal effort. + +**Examples:** + **Response:** I'm planning to visit friends and maybe see a movie together. + + **Response:** I try to eat healthy food and exercise regularly by jogging. + +## [Fluency: 4] (Proficient Fluency) +**Definition:** The response is well-articulated with good control of grammar and a varied vocabulary. Sentences are complex and well-structured, demonstrating coherence and cohesion. Minor errors may occur but do not affect overall understanding. The text flows smoothly, and ideas are connected logically. + +**Examples:** + **Response:** My interest in mathematics and problem-solving inspired me to become an engineer, as I enjoy designing solutions that improve people's lives. + + **Response:** Environmental conservation is crucial because it protects ecosystems, preserves biodiversity, and ensures natural resources are available for future generations. + +## [Fluency: 5] (Exceptional Fluency) +**Definition:** The response demonstrates an exceptional command of language with sophisticated vocabulary and complex, varied sentence structures. It is coherent, cohesive, and engaging, with precise and nuanced expression. Grammar is flawless, and the text reflects a high level of eloquence and style. + +**Examples:** + **Response:** Globalization exerts a profound influence on cultural diversity by facilitating unprecedented cultural exchange while simultaneously risking the homogenization of distinct cultural identities, which can diminish the richness of global heritage. + + **Response:** Technology revolutionizes modern education by providing interactive learning platforms, enabling personalized learning experiences, and connecting students worldwide, thereby transforming how knowledge is acquired and shared. + + +# Data +RESPONSE: {{${VariableCompletion}}} + + +# Tasks +## Please provide your assessment Score for the previous RESPONSE based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + + +## Please provide only your Score as the last output on a new line. +# Output`, + choices: [ + { + choice: '1', + score: 0, + }, + { + choice: '2', + score: 0.25, + }, + { + choice: '3', + score: 0.5, + }, + { + choice: '4', + score: 0.75, + }, + { + choice: '5', + score: 1.0, + }, + ], + }, +} + + +import {VariableCompletion, VariableInput} from '../../../variables' +import type {EvaluatorCfg} from '../../config' + +// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty +export const CoherenceEvaluator: EvaluatorCfg = { + name: 'Coherence', + llm: { + model: 'gpt-4o', + modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', + modelParameters: { + temperature: 0.0, + max_tokens: 800, + top_p: 1.0, + presence_penalty: 0, + frequency_penalty: 0, + response_format: 'text', + }, + systemPrompt: `# Instruction + ## Goal + ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. + - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. + - **Data**: Your input data include a QUERY and a RESPONSE. + - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + // The prompt is slightly modified, originally this also included reason and score: + // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. + // # Output + prompt: `# Definition +**Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas. + +# Ratings +## [Coherence: 1] (Incoherent Response) +**Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible. + +**Examples:** + **Query:** What are the benefits of renewable energy? + **Response:** Wind sun green jump apple silence over. + + **Query:** Explain the process of photosynthesis. + **Response:** Plants light water flying blue music. + +## [Coherence: 2] (Poorly Coherent Response) +**Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand. + +**Examples:** + **Query:** How does vaccination work? + **Response:** Vaccines protect disease. Immune system fight. Health better. + + **Query:** Describe how a bill becomes a law. + **Response:** Idea proposed. Congress discuss vote. President signs. + +## [Coherence: 3] (Partially Coherent Response) +**Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order. + +**Examples:** + **Query:** What causes earthquakes? + **Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage. + + **Query:** Explain the importance of the water cycle. + **Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water. + +## [Coherence: 4] (Coherent Response) +**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow. + +**Examples:** + **Query:** What is the water cycle and how does it work? + **Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally. + + **Query:** Describe the role of mitochondria in cellular function. + **Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival. + +## [Coherence: 5] (Highly Coherent Response) +**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision. + +**Examples:** + **Query:** Analyze the economic impacts of climate change on coastal cities. + **Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth. + + **Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy. + **Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs. + + +# Data +QUERY: {{${VariableInput}}} +RESPONSE: {{${VariableCompletion}}} + + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + choices: [ + { + choice: '1', + score: 0, + }, + { + choice: '2', + score: 0.25, + }, + { + choice: '3', + score: 0.5, + }, + { + choice: '4', + score: 0.75, + }, + { + choice: '5', + score: 1.0, + }, + ], + }, +} diff --git a/fixtures/test_builtins.yml b/fixtures/test_builtins.yml new file mode 100644 index 00000000..1e8717b2 --- /dev/null +++ b/fixtures/test_builtins.yml @@ -0,0 +1,25 @@ +name: Test Built-in Evaluators +description: Testing the new LLM-based built-in evaluators +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 100 +testData: + - input: 'What is photosynthesis?' + expected: 'Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll, converting carbon dioxide and water into glucose and oxygen.' +messages: + - role: system + content: You are a helpful assistant that provides accurate scientific information. + - role: user + content: '{{input}}' +evaluators: + - name: similarity test + uses: github/similarity + - name: coherence test + uses: github/coherence + - name: fluency test + uses: github/fluency + - name: relevance test + uses: github/relevance + - name: groundedness test + uses: github/groundedness diff --git a/fixtures/test_single_evaluator.yml b/fixtures/test_single_evaluator.yml new file mode 100644 index 00000000..34f2d414 --- /dev/null +++ b/fixtures/test_single_evaluator.yml @@ -0,0 +1,12 @@ +name: "Test Single Evaluator" +description: "Testing a single built-in evaluator" +model: "openai/gpt-4o" +testData: + - input: "What is machine learning?" + expected: "Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions from data without being explicitly programmed." +messages: + - role: user + content: "{{input}}" +evaluators: + - name: "fluency-test" + uses: "github/fluency" diff --git a/test_builtins.yml b/test_builtins.yml new file mode 100644 index 00000000..1e8717b2 --- /dev/null +++ b/test_builtins.yml @@ -0,0 +1,25 @@ +name: Test Built-in Evaluators +description: Testing the new LLM-based built-in evaluators +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 100 +testData: + - input: 'What is photosynthesis?' + expected: 'Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll, converting carbon dioxide and water into glucose and oxygen.' +messages: + - role: system + content: You are a helpful assistant that provides accurate scientific information. + - role: user + content: '{{input}}' +evaluators: + - name: similarity test + uses: github/similarity + - name: coherence test + uses: github/coherence + - name: fluency test + uses: github/fluency + - name: relevance test + uses: github/relevance + - name: groundedness test + uses: github/groundedness diff --git a/test_evaluators.go b/test_evaluators.go new file mode 100644 index 00000000..3b1edd8a --- /dev/null +++ b/test_evaluators.go @@ -0,0 +1,23 @@ +package main + +import ( + "fmt" + "github.com/github/gh-models/cmd/eval" +) + +func main() { + fmt.Println("Testing built-in evaluators...") + + // Test that all expected evaluators exist + evaluators := []string{"similarity", "coherence", "fluency", "relevance", "groundedness"} + + for _, name := range evaluators { + if evaluator, exists := eval.BuiltInEvaluators[name]; exists { + fmt.Printf("✓ %s evaluator exists with model: %s\n", name, evaluator.ModelID) + } else { + fmt.Printf("✗ %s evaluator not found\n", name) + } + } + + fmt.Println("Built-in evaluators test completed!") +} From 3bdb250fe0fa5c0910bfade1acf1a4200463979b Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 08:38:37 +0000 Subject: [PATCH 7/9] Truncate to first 100 chars --- cmd/eval/eval.go | 8 ++++++-- test_evaluators.go | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cmd/eval/eval.go b/cmd/eval/eval.go index 22f6816f..6e59a887 100644 --- a/cmd/eval/eval.go +++ b/cmd/eval/eval.go @@ -140,8 +140,12 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error { h.cfg.WriteToOut(" ✓ PASSED\n") } else { h.cfg.WriteToOut(" ✗ FAILED\n") - // Show model response when test fails - h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", result.ModelResponse)) + // Show the first 100 characters of the model response when test fails + preview := result.ModelResponse + if len(preview) > 100 { + preview = preview[:100] + "..." + } + h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview)) } // Show evaluation details diff --git a/test_evaluators.go b/test_evaluators.go index 3b1edd8a..6f5345e6 100644 --- a/test_evaluators.go +++ b/test_evaluators.go @@ -7,10 +7,10 @@ import ( func main() { fmt.Println("Testing built-in evaluators...") - + // Test that all expected evaluators exist evaluators := []string{"similarity", "coherence", "fluency", "relevance", "groundedness"} - + for _, name := range evaluators { if evaluator, exists := eval.BuiltInEvaluators[name]; exists { fmt.Printf("✓ %s evaluator exists with model: %s\n", name, evaluator.ModelID) @@ -18,6 +18,6 @@ func main() { fmt.Printf("✗ %s evaluator not found\n", name) } } - + fmt.Println("Built-in evaluators test completed!") } From e527146b0d23fb3f94c380641f7ffb493e81a580 Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 08:41:44 +0000 Subject: [PATCH 8/9] Remove extraneous files --- evaluators.tmp | 565 --------------------------------------------- test_builtins.yml | 25 -- test_evaluators.go | 23 -- 3 files changed, 613 deletions(-) delete mode 100644 evaluators.tmp delete mode 100644 test_builtins.yml delete mode 100644 test_evaluators.go diff --git a/evaluators.tmp b/evaluators.tmp deleted file mode 100644 index 1b64fdb8..00000000 --- a/evaluators.tmp +++ /dev/null @@ -1,565 +0,0 @@ - -import {VariableCompletion, VariablePrompt} from '../../../variables' -import type {EvaluatorCfg} from '../../config' - -// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_relevance/relevance.prompty -export const RelevanceEvaluator: EvaluatorCfg = { - name: 'Relevance', - llm: { - model: 'gpt-4o', - modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', - modelParameters: { - temperature: 0.0, - max_tokens: 800, - top_p: 1.0, - presence_penalty: 0, - frequency_penalty: 0, - response_format: 'text', - }, - systemPrompt: `# Instruction -## Goal -### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. -- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. -- **Data**: Your input data include QUERY and RESPONSE. -- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, - // The prompt is slightly modified, originally this also included reason and score: - // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. - // # Output`, - // - prompt: `# Definition -**Relevance** refers to how effectively a response addresses a question. It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information. - -# Ratings -## [Relevance: 1] (Irrelevant Response) -**Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed. - -**Examples:** - **Query:** What is the team preparing for? - **Response:** I went grocery shopping yesterday evening. - - **Query:** When will the company's new product line launch? - **Response:** International travel can be very rewarding and educational. - -## [Relevance: 2] (Incorrect Response) -**Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information. - -**Examples:** - **Query:** When was the merger between the two firms finalized? - **Response:** The merger was finalized on April 10th. - - **Query:** Where and when will the solar eclipse be visible? - **Response:** The solar eclipse will be visible in Asia on December 14th. - -## [Relevance: 3] (Incomplete Response) -**Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information. - -**Examples:** - **Query:** What type of food does the new restaurant offer? - **Response:** The restaurant offers Italian food like pasta. - - **Query:** What topics will the conference cover? - **Response:** The conference will cover renewable energy and climate change. - -## [Relevance: 4] (Complete Response) -**Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information. - -**Examples:** - **Query:** What type of food does the new restaurant offer? - **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto. - - **Query:** What topics will the conference cover? - **Response:** The conference will cover renewable energy, climate change, and sustainability practices. - -## [Relevance: 5] (Comprehensive Response with Insights) -**Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding. - -**Examples:** - **Query:** What type of food does the new restaurant offer? - **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience. - - **Query:** What topics will the conference cover? - **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues. - - - -# Data -QUERY: {{${VariablePrompt}}} -RESPONSE: {{${VariableCompletion}}} - - -# Tasks -## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: -- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". -- **Explanation**: a very short explanation of why you think the input Data should get that Score. -- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. - -## Please provide only your Score as the last output on a new line. -# Output`, - choices: [ - { - choice: '1', - score: 0, - }, - { - choice: '2', - score: 0.25, - }, - { - choice: '3', - score: 0.5, - }, - { - choice: '4', - score: 0.75, - }, - { - choice: '5', - score: 1.0, - }, - ], - }, -} - - - -import {VariableCompletion, VariableExpected, VariablePrompt} from '../../../variables' -import type {EvaluatorCfg} from '../../config' - -// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_groundedness/groundedness_with_query.prompty#L0-L1 -export const GroundednessEvaluator: EvaluatorCfg = { - name: 'Groundedness', - llm: { - model: 'gpt-4o', - modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', - modelParameters: { - temperature: 0.0, - max_tokens: 800, - top_p: 1.0, - presence_penalty: 0, - frequency_penalty: 0, - response_format: 'text', - }, - systemPrompt: `# Instruction -## Goal -### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. -- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. -- **Data**: Your input data include CONTEXT, QUERY, and RESPONSE. -- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, - // The prompt is slightly modified, originally this also included reason and score: - // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. - // # Output - prompt: `# Definition -**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information. The scale ranges from 1 to 5, with higher numbers indicating greater groundedness. - -# Ratings -## [Groundedness: 1] (Completely Unrelated Response) -**Definition:** An answer that does not relate to the question or the context in any way. It fails to address the topic, provides irrelevant information, or introduces completely unrelated subjects. - -**Examples:** - **Context:** The company's annual meeting will be held next Thursday. - **Query:** When is the company's annual meeting? - **Response:** I enjoy hiking in the mountains during summer. - - **Context:** The new policy aims to reduce carbon emissions by 20% over the next five years. - **Query:** What is the goal of the new policy? - **Response:** My favorite color is blue. - -## [Groundedness: 2] (Related Topic but Does Not Respond to the Query) -**Definition:** An answer that relates to the general topic of the context but does not answer the specific question asked. It may mention concepts from the context but fails to provide a direct or relevant response. - -**Examples:** - **Context:** The museum will exhibit modern art pieces from various local artists. - **Query:** What kind of art will be exhibited at the museum? - **Response:** Museums are important cultural institutions. - - **Context:** The new software update improves battery life and performance. - **Query:** What does the new software update improve? - **Response:** Software updates can sometimes fix bugs. - -## [Groundedness: 3] (Attempts to Respond but Contains Incorrect Information) -**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. - -**Examples:** - **Context:** The festival starts on June 5th and features international musicians. - **Query:** When does the festival start? - **Response:** The festival starts on July 5th and features local artists. - - **Context:** The recipe requires two eggs and one cup of milk. - **Query:** How many eggs are needed for the recipe? - **Response:** You need three eggs for the recipe. - -## [Groundedness: 4] (Partially Correct Response) -**Definition:** An answer that provides a correct response to the question but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding. - -**Examples:** - **Context:** The bookstore offers a 15% discount to students and a 10% discount to senior citizens. - **Query:** What discount does the bookstore offer to students? - **Response:** Students get a discount at the bookstore. - - **Context:** The company's headquarters are located in Berlin, Germany. - **Query:** Where are the company's headquarters? - **Response:** The company's headquarters are in Germany. - -## [Groundedness: 5] (Fully Correct and Complete Response) -**Definition:** An answer that thoroughly and accurately responds to the question, including all relevant details from the context. It directly addresses the question with precise information, demonstrating complete understanding without adding extraneous information. - -**Examples:** - **Context:** The author released her latest novel, 'The Silent Echo', on September 1st. - **Query:** When was 'The Silent Echo' released? - **Response:** 'The Silent Echo' was released on September 1st. - - **Context:** Participants must register by May 31st to be eligible for early bird pricing. - **Query:** By what date must participants register to receive early bird pricing? - **Response:** Participants must register by May 31st to receive early bird pricing. - - -# Data -CONTEXT: {{${VariableExpected}}} -QUERY: {{${VariablePrompt}}} -RESPONSE: {{${VariableCompletion}}} - - -# Tasks -## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output should include the following information: -- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". -- **Explanation**: a very short explanation of why you think the input Data should get that Score. -- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. - -## Please provide only your Score as the last output on a new line. -# Output`, - choices: [ - { - choice: '1', - score: 0, - }, - { - choice: '2', - score: 0.25, - }, - { - choice: '3', - score: 0.5, - }, - { - choice: '4', - score: 0.75, - }, - { - choice: '5', - score: 1.0, - }, - ], - }, -} - -import {VariableCompletion, VariableExpected, VariableInput} from '../../../variables' -import type {EvaluatorCfg} from '../../config' - -// From https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_similarity/similarity.prompty -export const SimilarityEvaluator: EvaluatorCfg = { - name: 'Similarity', - llm: { - model: 'gpt-4o', - modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', - modelParameters: { - temperature: 0.0, - max_tokens: 1, - top_p: 1.0, - presence_penalty: 0, - frequency_penalty: 0, - response_format: 'text', - }, - systemPrompt: `You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.`, - prompt: `Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: -One star: the predicted answer is not at all similar to the correct answer -Two stars: the predicted answer is mostly not similar to the correct answer -Three stars: the predicted answer is somewhat similar to the correct answer -Four stars: the predicted answer is mostly similar to the correct answer -Five stars: the predicted answer is completely similar to the correct answer - -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. - -The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. - -question: What is the role of ribosomes? -correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. -predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. -stars: 1 - -question: Why did the Titanic sink? -correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. -predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. -stars: 2 - -question: What causes seasons on Earth? -correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. -predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. -stars: 3 - -question: How does photosynthesis work? -correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. -predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. -stars: 4 - -question: What are the health benefits of regular exercise? -correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. -predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. -stars: 5 - -question: {{${VariableInput}}} -correct answer: {{${VariableExpected}}} -predicted answer: {{${VariableCompletion}}} -stars:`, - choices: [ - { - choice: '1', - score: 0, - }, - { - choice: '2', - score: 0.25, - }, - { - choice: '3', - score: 0.5, - }, - { - choice: '4', - score: 0.75, - }, - { - choice: '5', - score: 1.0, - }, - ], - }, -} - -import {VariableCompletion} from '../../../variables' -import type {EvaluatorCfg} from '../../config' - -// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty -export const FluencyEvaluator: EvaluatorCfg = { - name: 'Fluency', - llm: { - model: 'gpt-4o', - modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', - modelParameters: { - temperature: 0.0, - max_tokens: 800, - top_p: 1.0, - presence_penalty: 0, - frequency_penalty: 0, - response_format: 'text', - }, - systemPrompt: `# Instruction -## Goal -### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. -- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. -- **Data**: Your input data include a RESPONSE. -- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, - // The prompt is slightly modified, originally this also included reason and score: - // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. - // # Output - prompt: `# Definition -**Fluency** refers to the effectiveness and clarity of written communication, focusing on grammatical accuracy, vocabulary range, sentence complexity, coherence, and overall readability. It assesses how smoothly ideas are conveyed and how easily the text can be understood by the reader. - -# Ratings -## [Fluency: 1] (Emergent Fluency) -**Definition:** The response shows minimal command of the language. It contains pervasive grammatical errors, extremely limited vocabulary, and fragmented or incoherent sentences. The message is largely incomprehensible, making understanding very difficult. - -**Examples:** - **Response:** Free time I. Go park. Not fun. Alone. - - **Response:** Like food pizza. Good cheese eat. - -## [Fluency: 2] (Basic Fluency) -**Definition:** The response communicates simple ideas but has frequent grammatical errors and limited vocabulary. Sentences are short and may be improperly constructed, leading to partial understanding. Repetition and awkward phrasing are common. - -**Examples:** - **Response:** I like play soccer. I watch movie. It fun. - - **Response:** My town small. Many people. We have market. - -## [Fluency: 3] (Competent Fluency) -**Definition:** The response clearly conveys ideas with occasional grammatical errors. Vocabulary is adequate but not extensive. Sentences are generally correct but may lack complexity and variety. The text is coherent, and the message is easily understood with minimal effort. - -**Examples:** - **Response:** I'm planning to visit friends and maybe see a movie together. - - **Response:** I try to eat healthy food and exercise regularly by jogging. - -## [Fluency: 4] (Proficient Fluency) -**Definition:** The response is well-articulated with good control of grammar and a varied vocabulary. Sentences are complex and well-structured, demonstrating coherence and cohesion. Minor errors may occur but do not affect overall understanding. The text flows smoothly, and ideas are connected logically. - -**Examples:** - **Response:** My interest in mathematics and problem-solving inspired me to become an engineer, as I enjoy designing solutions that improve people's lives. - - **Response:** Environmental conservation is crucial because it protects ecosystems, preserves biodiversity, and ensures natural resources are available for future generations. - -## [Fluency: 5] (Exceptional Fluency) -**Definition:** The response demonstrates an exceptional command of language with sophisticated vocabulary and complex, varied sentence structures. It is coherent, cohesive, and engaging, with precise and nuanced expression. Grammar is flawless, and the text reflects a high level of eloquence and style. - -**Examples:** - **Response:** Globalization exerts a profound influence on cultural diversity by facilitating unprecedented cultural exchange while simultaneously risking the homogenization of distinct cultural identities, which can diminish the richness of global heritage. - - **Response:** Technology revolutionizes modern education by providing interactive learning platforms, enabling personalized learning experiences, and connecting students worldwide, thereby transforming how knowledge is acquired and shared. - - -# Data -RESPONSE: {{${VariableCompletion}}} - - -# Tasks -## Please provide your assessment Score for the previous RESPONSE based on the Definitions above. Your output should include the following information: -- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". -- **Explanation**: a very short explanation of why you think the input Data should get that Score. -- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. - - -## Please provide only your Score as the last output on a new line. -# Output`, - choices: [ - { - choice: '1', - score: 0, - }, - { - choice: '2', - score: 0.25, - }, - { - choice: '3', - score: 0.5, - }, - { - choice: '4', - score: 0.75, - }, - { - choice: '5', - score: 1.0, - }, - ], - }, -} - - -import {VariableCompletion, VariableInput} from '../../../variables' -import type {EvaluatorCfg} from '../../config' - -// From: https://github.com/Azure/azure-sdk-for-python//blob/b577491b088944ccd074746079d429bf79edc970/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_coherence/coherence.prompty -export const CoherenceEvaluator: EvaluatorCfg = { - name: 'Coherence', - llm: { - model: 'gpt-4o', - modelId: 'azureml://registries/azure-openai/models/gpt-4o/versions/2024-08-06', - modelParameters: { - temperature: 0.0, - max_tokens: 800, - top_p: 1.0, - presence_penalty: 0, - frequency_penalty: 0, - response_format: 'text', - }, - systemPrompt: `# Instruction - ## Goal - ### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. - - **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. - - **Data**: Your input data include a QUERY and a RESPONSE. - - **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, - // The prompt is slightly modified, originally this also included reason and score: - // ## Please provide your answers between the tags: your chain of thoughts, your explanation, your Score. - // # Output - prompt: `# Definition -**Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas. - -# Ratings -## [Coherence: 1] (Incoherent Response) -**Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible. - -**Examples:** - **Query:** What are the benefits of renewable energy? - **Response:** Wind sun green jump apple silence over. - - **Query:** Explain the process of photosynthesis. - **Response:** Plants light water flying blue music. - -## [Coherence: 2] (Poorly Coherent Response) -**Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand. - -**Examples:** - **Query:** How does vaccination work? - **Response:** Vaccines protect disease. Immune system fight. Health better. - - **Query:** Describe how a bill becomes a law. - **Response:** Idea proposed. Congress discuss vote. President signs. - -## [Coherence: 3] (Partially Coherent Response) -**Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order. - -**Examples:** - **Query:** What causes earthquakes? - **Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage. - - **Query:** Explain the importance of the water cycle. - **Response:** The water cycle moves water around Earth. Evaporation, then precipitation occurs. It supports life by distributing water. - -## [Coherence: 4] (Coherent Response) -**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow. - -**Examples:** - **Query:** What is the water cycle and how does it work? - **Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally. - - **Query:** Describe the role of mitochondria in cellular function. - **Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival. - -## [Coherence: 5] (Highly Coherent Response) -**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision. - -**Examples:** - **Query:** Analyze the economic impacts of climate change on coastal cities. - **Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth. - - **Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy. - **Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs. - - -# Data -QUERY: {{${VariableInput}}} -RESPONSE: {{${VariableCompletion}}} - - -# Tasks -## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: -- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". -- **Explanation**: a very short explanation of why you think the input Data should get that Score. -- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. - -## Please provide only your Score as the last output on a new line. -# Output`, - choices: [ - { - choice: '1', - score: 0, - }, - { - choice: '2', - score: 0.25, - }, - { - choice: '3', - score: 0.5, - }, - { - choice: '4', - score: 0.75, - }, - { - choice: '5', - score: 1.0, - }, - ], - }, -} diff --git a/test_builtins.yml b/test_builtins.yml deleted file mode 100644 index 1e8717b2..00000000 --- a/test_builtins.yml +++ /dev/null @@ -1,25 +0,0 @@ -name: Test Built-in Evaluators -description: Testing the new LLM-based built-in evaluators -model: openai/gpt-4o -modelParameters: - temperature: 0.5 - maxTokens: 100 -testData: - - input: 'What is photosynthesis?' - expected: 'Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll, converting carbon dioxide and water into glucose and oxygen.' -messages: - - role: system - content: You are a helpful assistant that provides accurate scientific information. - - role: user - content: '{{input}}' -evaluators: - - name: similarity test - uses: github/similarity - - name: coherence test - uses: github/coherence - - name: fluency test - uses: github/fluency - - name: relevance test - uses: github/relevance - - name: groundedness test - uses: github/groundedness diff --git a/test_evaluators.go b/test_evaluators.go deleted file mode 100644 index 6f5345e6..00000000 --- a/test_evaluators.go +++ /dev/null @@ -1,23 +0,0 @@ -package main - -import ( - "fmt" - "github.com/github/gh-models/cmd/eval" -) - -func main() { - fmt.Println("Testing built-in evaluators...") - - // Test that all expected evaluators exist - evaluators := []string{"similarity", "coherence", "fluency", "relevance", "groundedness"} - - for _, name := range evaluators { - if evaluator, exists := eval.BuiltInEvaluators[name]; exists { - fmt.Printf("✓ %s evaluator exists with model: %s\n", name, evaluator.ModelID) - } else { - fmt.Printf("✗ %s evaluator not found\n", name) - } - } - - fmt.Println("Built-in evaluators test completed!") -} From f70f858682b8c20440804f20da7aa402f939806a Mon Sep 17 00:00:00 2001 From: Sean Goedecke Date: Wed, 4 Jun 2025 08:54:00 +0000 Subject: [PATCH 9/9] Remove unused allResults var --- cmd/eval/eval.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cmd/eval/eval.go b/cmd/eval/eval.go index 6e59a887..1d5ebaec 100644 --- a/cmd/eval/eval.go +++ b/cmd/eval/eval.go @@ -62,7 +62,7 @@ func NewEvalCommand(cfg *command.Config) *cobra.Command { string: contains: "hello" `), - Example: "gh models eval prompt.yml", + Example: "gh models eval my_prompt.prompt.yml", Args: cobra.ExactArgs(1), RunE: func(cmd *cobra.Command, args []string) error { promptFilePath := args[0] @@ -99,9 +99,6 @@ func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) { return nil, fmt.Errorf("failed to load prompt file: %w", err) } - fmt.Printf("Loaded file with name='%s', model='%s', testData count=%d\n", - evalFile.Name, evalFile.Model, len(evalFile.TestData)) - return evalFile, nil } @@ -112,7 +109,6 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error { h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData))) h.cfg.WriteToOut("\n") - var allResults []TestResult passedTests := 0 totalTests := len(h.evalFile.TestData) @@ -124,8 +120,6 @@ func (h *evalCommandHandler) runEvaluation(ctx context.Context) error { return fmt.Errorf("test case %d failed: %w", i+1, err) } - allResults = append(allResults, result) - // Check if all evaluators passed testPassed := true for _, evalResult := range result.EvaluationResults {