diff --git a/cmd/eval/builtins.go b/cmd/eval/builtins.go new file mode 100644 index 00000000..0ee566d2 --- /dev/null +++ b/cmd/eval/builtins.go @@ -0,0 +1,386 @@ +package eval + +import "github.com/github/gh-models/pkg/prompt" + +// BuiltInEvaluators contains pre-configured LLM-based evaluators, taken from https://github.com/microsoft/promptflow +var BuiltInEvaluators = map[string]prompt.LLMEvaluator{ + "similarity": { + ModelID: "openai/gpt-4o", + SystemPrompt: "You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. You should return a single integer value between 1 to 5 representing the evaluation metric. You will include no other text or information.", + Prompt: `Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +One star: the predicted answer is not at all similar to the correct answer +Two stars: the predicted answer is mostly not similar to the correct answer +Three stars: the predicted answer is somewhat similar to the correct answer +Four stars: the predicted answer is mostly similar to the correct answer +Five stars: the predicted answer is completely similar to the correct answer + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. + +The examples below show the Equivalence score for a question, a correct answer, and a predicted answer. + +question: What is the role of ribosomes? 
+correct answer: Ribosomes are cellular structures responsible for protein synthesis. They interpret the genetic information carried by messenger RNA (mRNA) and use it to assemble amino acids into proteins. +predicted answer: Ribosomes participate in carbohydrate breakdown by removing nutrients from complex sugar molecules. +stars: 1 + +question: Why did the Titanic sink? +correct answer: The Titanic sank after it struck an iceberg during its maiden voyage in 1912. The impact caused the ship's hull to breach, allowing water to flood into the vessel. The ship's design, lifeboat shortage, and lack of timely rescue efforts contributed to the tragic loss of life. +predicted answer: The sinking of the Titanic was a result of a large iceberg collision. This caused the ship to take on water and eventually sink, leading to the death of many passengers due to a shortage of lifeboats and insufficient rescue attempts. +stars: 2 + +question: What causes seasons on Earth? +correct answer: Seasons on Earth are caused by the tilt of the Earth's axis and its revolution around the Sun. As the Earth orbits the Sun, the tilt causes different parts of the planet to receive varying amounts of sunlight, resulting in changes in temperature and weather patterns. +predicted answer: Seasons occur because of the Earth's rotation and its elliptical orbit around the Sun. The tilt of the Earth's axis causes regions to be subjected to different sunlight intensities, which leads to temperature fluctuations and alternating weather conditions. +stars: 3 + +question: How does photosynthesis work? +correct answer: Photosynthesis is a process by which green plants and some other organisms convert light energy into chemical energy. This occurs as light is absorbed by chlorophyll molecules, and then carbon dioxide and water are converted into glucose and oxygen through a series of reactions. 
+predicted answer: In photosynthesis, sunlight is transformed into nutrients by plants and certain microorganisms. Light is captured by chlorophyll molecules, followed by the conversion of carbon dioxide and water into sugar and oxygen through multiple reactions. +stars: 4 + +question: What are the health benefits of regular exercise? +correct answer: Regular exercise can help maintain a healthy weight, increase muscle and bone strength, and reduce the risk of chronic diseases. It also promotes mental well-being by reducing stress and improving overall mood. +predicted answer: Routine physical activity can contribute to maintaining ideal body weight, enhancing muscle and bone strength, and preventing chronic illnesses. In addition, it supports mental health by alleviating stress and augmenting general mood. +stars: 5 + +question: {{input}} +correct answer: {{expected}} +predicted answer: {{completion}} +stars:`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "coherence": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include a QUERY and a RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Coherence** refers to the logical and orderly presentation of ideas in a response, allowing the reader to easily follow and understand the writer's train of thought. 
A coherent answer directly addresses the question with clear connections between sentences and paragraphs, using appropriate transitions and a logical sequence of ideas. + +# Ratings +## [Coherence: 1] (Incoherent Response) +**Definition:** The response lacks coherence entirely. It consists of disjointed words or phrases that do not form complete or meaningful sentences. There is no logical connection to the question, making the response incomprehensible. + +**Examples:** + **Query:** What are the benefits of renewable energy? + **Response:** Wind sun green jump apple silence over. + + **Query:** Explain the process of photosynthesis. + **Response:** Plants light water flying blue music. + +## [Coherence: 2] (Poorly Coherent Response) +**Definition:** The response shows minimal coherence with fragmented sentences and limited connection to the question. It contains some relevant keywords but lacks logical structure and clear relationships between ideas, making the overall message difficult to understand. + +**Examples:** + **Query:** How does vaccination work? + **Response:** Vaccines protect disease. Immune system fight. Health better. + + **Query:** Describe how a bill becomes a law. + **Response:** Idea proposed. Congress discuss vote. President signs. + +## [Coherence: 3] (Partially Coherent Response) +**Definition:** The response partially addresses the question with some relevant information but exhibits issues in the logical flow and organization of ideas. Connections between sentences may be unclear or abrupt, requiring the reader to infer the links. The response may lack smooth transitions and may present ideas out of order. + +**Examples:** + **Query:** What causes earthquakes? + **Response:** Earthquakes happen when tectonic plates move suddenly. Energy builds up then releases. Ground shakes and can cause damage. + + **Query:** Explain the importance of the water cycle. + **Response:** The water cycle moves water around Earth. 
Evaporation, then precipitation occurs. It supports life by distributing water. + +## [Coherence: 4] (Coherent Response) +**Definition:** The response is coherent and effectively addresses the question. Ideas are logically organized with clear connections between sentences and paragraphs. Appropriate transitions are used to guide the reader through the response, which flows smoothly and is easy to follow. + +**Examples:** + **Query:** What is the water cycle and how does it work? + **Response:** The water cycle is the continuous movement of water on Earth through processes like evaporation, condensation, and precipitation. Water evaporates from bodies of water, forms clouds through condensation, and returns to the surface as precipitation. This cycle is essential for distributing water resources globally. + + **Query:** Describe the role of mitochondria in cellular function. + **Response:** Mitochondria are organelles that produce energy for the cell. They convert nutrients into ATP through cellular respiration. This energy powers various cellular activities, making mitochondria vital for cell survival. + +## [Coherence: 5] (Highly Coherent Response) +**Definition:** The response is exceptionally coherent, demonstrating sophisticated organization and flow. Ideas are presented in a logical and seamless manner, with excellent use of transitional phrases and cohesive devices. The connections between concepts are clear and enhance the reader's understanding. The response thoroughly addresses the question with clarity and precision. + +**Examples:** + **Query:** Analyze the economic impacts of climate change on coastal cities. + **Response:** Climate change significantly affects the economies of coastal cities through rising sea levels, increased flooding, and more intense storms. These environmental changes can damage infrastructure, disrupt businesses, and lead to costly repairs. 
For instance, frequent flooding can hinder transportation and commerce, while the threat of severe weather may deter investment and tourism. Consequently, cities may face increased expenses for disaster preparedness and mitigation efforts, straining municipal budgets and impacting economic growth. + + **Query:** Discuss the significance of the Monroe Doctrine in shaping U.S. foreign policy. + **Response:** The Monroe Doctrine was a pivotal policy declared in 1823 that asserted U.S. opposition to European colonization in the Americas. By stating that any intervention by external powers in the Western Hemisphere would be viewed as a hostile act, it established the U.S. as a protector of the region. This doctrine shaped U.S. foreign policy by promoting isolation from European conflicts while justifying American influence and expansion in the hemisphere. Its long-term significance lies in its enduring influence on international relations and its role in defining the U.S. position in global affairs. + +# Data +QUERY: {{input}} +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. 
+# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "fluency": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include a RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Fluency** refers to the effectiveness and clarity of written communication, focusing on grammatical accuracy, vocabulary range, sentence complexity, coherence, and overall readability. It assesses how smoothly ideas are conveyed and how easily the text can be understood by the reader. + +# Ratings +## [Fluency: 1] (Emergent Fluency) +**Definition:** The response shows minimal command of the language. It contains pervasive grammatical errors, extremely limited vocabulary, and fragmented or incoherent sentences. The message is largely incomprehensible, making understanding very difficult. + +**Examples:** + **Response:** Free time I. Go park. Not fun. Alone. + + **Response:** Like food pizza. Good cheese eat. + +## [Fluency: 2] (Basic Fluency) +**Definition:** The response communicates simple ideas but has frequent grammatical errors and limited vocabulary. Sentences are short and may be improperly constructed, leading to partial understanding. Repetition and awkward phrasing are common. + +**Examples:** + **Response:** I like play soccer. I watch movie. It fun. + + **Response:** My town small. Many people. We have market. 
+ +## [Fluency: 3] (Competent Fluency) +**Definition:** The response clearly conveys ideas with occasional grammatical errors. Vocabulary is adequate but not extensive. Sentences are generally correct but may lack complexity and variety. The text is coherent, and the message is easily understood with minimal effort. + +**Examples:** + **Response:** I'm planning to visit friends and maybe see a movie together. + + **Response:** I try to eat healthy food and exercise regularly by jogging. + +## [Fluency: 4] (Proficient Fluency) +**Definition:** The response is well-articulated with good control of grammar and a varied vocabulary. Sentences are complex and well-structured, demonstrating coherence and cohesion. Minor errors may occur but do not affect overall understanding. The text flows smoothly, and ideas are connected logically. + +**Examples:** + **Response:** My interest in mathematics and problem-solving inspired me to become an engineer, as I enjoy designing solutions that improve people's lives. + + **Response:** Environmental conservation is crucial because it protects ecosystems, preserves biodiversity, and ensures natural resources are available for future generations. + +## [Fluency: 5] (Exceptional Fluency) +**Definition:** The response demonstrates an exceptional command of language with sophisticated vocabulary and complex, varied sentence structures. It is coherent, cohesive, and engaging, with precise and nuanced expression. Grammar is flawless, and the text reflects a high level of eloquence and style. + +**Examples:** + **Response:** Globalization exerts a profound influence on cultural diversity by facilitating unprecedented cultural exchange while simultaneously risking the homogenization of distinct cultural identities, which can diminish the richness of global heritage. 
+ + **Response:** Technology revolutionizes modern education by providing interactive learning platforms, enabling personalized learning experiences, and connecting students worldwide, thereby transforming how knowledge is acquired and shared. + +# Data +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. +# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "relevance": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include QUERY and RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Relevance** refers to how effectively a response addresses a question. 
It assesses the accuracy, completeness, and direct relevance of the response based solely on the given information. + +# Ratings +## [Relevance: 1] (Irrelevant Response) +**Definition:** The response is unrelated to the question. It provides information that is off-topic and does not attempt to address the question posed. + +**Examples:** + **Query:** What is the team preparing for? + **Response:** I went grocery shopping yesterday evening. + + **Query:** When will the company's new product line launch? + **Response:** International travel can be very rewarding and educational. + +## [Relevance: 2] (Incorrect Response) +**Definition:** The response attempts to address the question but includes incorrect information. It provides a response that is factually wrong based on the provided information. + +**Examples:** + **Query:** When was the merger between the two firms finalized? + **Response:** The merger was finalized on April 10th. + + **Query:** Where and when will the solar eclipse be visible? + **Response:** The solar eclipse will be visible in Asia on December 14th. + +## [Relevance: 3] (Incomplete Response) +**Definition:** The response addresses the question but omits key details necessary for a full understanding. It provides a partial response that lacks essential information. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The restaurant offers Italian food like pasta. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy and climate change. + +## [Relevance: 4] (Complete Response) +**Definition:** The response fully addresses the question with accurate and complete information. It includes all essential details required for a comprehensive understanding, without adding any extraneous information. + +**Examples:** + **Query:** What type of food does the new restaurant offer? 
+ **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy, climate change, and sustainability practices. + +## [Relevance: 5] (Comprehensive Response with Insights) +**Definition:** The response not only fully and accurately addresses the question but also includes additional relevant insights or elaboration. It may explain the significance, implications, or provide minor inferences that enhance understanding. + +**Examples:** + **Query:** What type of food does the new restaurant offer? + **Response:** The new restaurant offers Italian cuisine, featuring dishes like pasta, pizza, and risotto, aiming to provide customers with an authentic Italian dining experience. + + **Query:** What topics will the conference cover? + **Response:** The conference will cover renewable energy, climate change, and sustainability practices, bringing together global experts to discuss these critical issues. + +# Data +QUERY: {{input}} +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. 
+# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, + "groundedness": { + ModelID: "openai/gpt-4o", + SystemPrompt: `# Instruction +## Goal +### You are an expert in evaluating the quality of a RESPONSE from an intelligent system based on provided definition and data. Your goal will involve answering the questions below using the information provided. +- **Definition**: You are given a definition of the communication trait that is being evaluated to help guide your Score. +- **Data**: Your input data include CONTEXT, QUERY, and RESPONSE. +- **Tasks**: To complete your evaluation you will be asked to evaluate the Data in different ways.`, + Prompt: `# Definition +**Groundedness** refers to how well an answer is anchored in the provided context, evaluating its relevance, accuracy, and completeness based exclusively on that context. It assesses the extent to which the answer directly and fully addresses the question without introducing unrelated or incorrect information. The scale ranges from 1 to 5, with higher numbers indicating greater groundedness. + +# Ratings +## [Groundedness: 1] (Completely Unrelated Response) +**Definition:** An answer that does not relate to the question or the context in any way. It fails to address the topic, provides irrelevant information, or introduces completely unrelated subjects. + +**Examples:** + **Context:** The company's annual meeting will be held next Thursday. + **Query:** When is the company's annual meeting? + **Response:** I enjoy hiking in the mountains during summer. + + **Context:** The new policy aims to reduce carbon emissions by 20% over the next five years. + **Query:** What is the goal of the new policy? + **Response:** My favorite color is blue. 
+ +## [Groundedness: 2] (Related Topic but Does Not Respond to the Query) +**Definition:** An answer that relates to the general topic of the context but does not answer the specific question asked. It may mention concepts from the context but fails to provide a direct or relevant response. + +**Examples:** + **Context:** The museum will exhibit modern art pieces from various local artists. + **Query:** What kind of art will be exhibited at the museum? + **Response:** Museums are important cultural institutions. + + **Context:** The new software update improves battery life and performance. + **Query:** What does the new software update improve? + **Response:** Software updates can sometimes fix bugs. + +## [Groundedness: 3] (Attempts to Respond but Contains Incorrect Information) +**Definition:** An answer that attempts to respond to the question but includes incorrect information not supported by the context. It may misstate facts, misinterpret the context, or provide erroneous details. + +**Examples:** + **Context:** The festival starts on June 5th and features international musicians. + **Query:** When does the festival start? + **Response:** The festival starts on July 5th and features local artists. + + **Context:** The recipe requires two eggs and one cup of milk. + **Query:** How many eggs are needed for the recipe? + **Response:** You need three eggs for the recipe. + +## [Groundedness: 4] (Partially Correct Response) +**Definition:** An answer that provides a correct response to the question but is incomplete or lacks specific details mentioned in the context. It captures some of the necessary information but omits key elements needed for a full understanding. + +**Examples:** + **Context:** The bookstore offers a 15% discount to students and a 10% discount to senior citizens. + **Query:** What discount does the bookstore offer to students? + **Response:** Students get a discount at the bookstore. 
+ + **Context:** The company's headquarters are located in Berlin, Germany. + **Query:** Where are the company's headquarters? + **Response:** The company's headquarters are in Germany. + +## [Groundedness: 5] (Fully Correct and Complete Response) +**Definition:** An answer that thoroughly and accurately responds to the question, including all relevant details from the context. It directly addresses the question with precise information, demonstrating complete understanding without adding extraneous information. + +**Examples:** + **Context:** The author released her latest novel, 'The Silent Echo', on September 1st. + **Query:** When was 'The Silent Echo' released? + **Response:** 'The Silent Echo' was released on September 1st. + + **Context:** Participants must register by May 31st to be eligible for early bird pricing. + **Query:** By what date must participants register to receive early bird pricing? + **Response:** Participants must register by May 31st to receive early bird pricing. + +# Data +CONTEXT: {{expected}} +QUERY: {{input}} +RESPONSE: {{completion}} + +# Tasks +## Please provide your assessment Score for the previous RESPONSE in relation to the CONTEXT and QUERY based on the Definitions above. Your output should include the following information: +- **ThoughtChain**: To improve the reasoning process, think step by step and include a step-by-step explanation of your thought process as you analyze the data based on the definitions. Keep it brief and start your ThoughtChain with "Let's think step by step:". +- **Explanation**: a very short explanation of why you think the input Data should get that Score. +- **Score**: based on your previous analysis, provide your Score. The Score you give MUST be a integer score (i.e., "1", "2"...) based on the levels of the definitions. + +## Please provide only your Score as the last output on a new line. 
+# Output`, + Choices: []prompt.Choice{ + {Choice: "1", Score: 0.0}, + {Choice: "2", Score: 0.25}, + {Choice: "3", Score: 0.5}, + {Choice: "4", Score: 0.75}, + {Choice: "5", Score: 1.0}, + }, + }, +} diff --git a/cmd/eval/eval.go b/cmd/eval/eval.go new file mode 100644 index 00000000..1d5ebaec --- /dev/null +++ b/cmd/eval/eval.go @@ -0,0 +1,441 @@ +// Package eval provides a gh command to evaluate prompts against GitHub models. +package eval + +import ( + "context" + "errors" + "fmt" + "strings" + + "github.com/MakeNowJust/heredoc" + "github.com/github/gh-models/internal/azuremodels" + "github.com/github/gh-models/pkg/command" + "github.com/github/gh-models/pkg/prompt" + "github.com/github/gh-models/pkg/util" + "github.com/spf13/cobra" +) + +// EvaluationPromptFile represents the structure of a prompt.yml file for evaluation +// It extends the base prompt.File with evaluation-specific fields +type EvaluationPromptFile = prompt.File + +// TestResult represents the result of running a test case +type TestResult struct { + TestCase map[string]interface{} `json:"testCase"` + ModelResponse string `json:"modelResponse"` + EvaluationResults []EvaluationResult `json:"evaluationResults"` +} + +// EvaluationResult represents the result of a single evaluator +type EvaluationResult struct { + EvaluatorName string `json:"evaluatorName"` + Score float64 `json:"score"` + Passed bool `json:"passed"` + Details string `json:"details,omitempty"` +} + +// NewEvalCommand returns a new command to evaluate prompts against models +func NewEvalCommand(cfg *command.Config) *cobra.Command { + cmd := &cobra.Command{ + Use: "eval", + Short: "Evaluate prompts using test data and evaluators", + Long: heredoc.Docf(` + Runs evaluation tests against a model using a prompt.yml file. 
+ + The prompt.yml file should contain: + - Model configuration and parameters + - Test data with input variables + - Messages with templated content + - Evaluators to assess model responses + + Example prompt.yml structure: + name: My Evaluation + model: gpt-4o + testData: + - input: "Hello world" + expected: "Hello there" + messages: + - role: user + content: "Respond to: {{input}}" + evaluators: + - name: contains-hello + string: + contains: "hello" + `), + Example: "gh models eval my_prompt.prompt.yml", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + promptFilePath := args[0] + + // Load the evaluation prompt file + evalFile, err := loadEvaluationPromptFile(promptFilePath) + if err != nil { + return fmt.Errorf("failed to load prompt file: %w", err) + } + + // Run evaluation + handler := &evalCommandHandler{ + cfg: cfg, + client: cfg.Client, + evalFile: evalFile, + } + + return handler.runEvaluation(cmd.Context()) + }, + } + + return cmd +} + +type evalCommandHandler struct { + cfg *command.Config + client azuremodels.Client + evalFile *EvaluationPromptFile +} + +func loadEvaluationPromptFile(filePath string) (*EvaluationPromptFile, error) { + evalFile, err := prompt.LoadFromFile(filePath) + if err != nil { + return nil, fmt.Errorf("failed to load prompt file: %w", err) + } + + return evalFile, nil +} + +func (h *evalCommandHandler) runEvaluation(ctx context.Context) error { + h.cfg.WriteToOut(fmt.Sprintf("Running evaluation: %s\n", h.evalFile.Name)) + h.cfg.WriteToOut(fmt.Sprintf("Description: %s\n", h.evalFile.Description)) + h.cfg.WriteToOut(fmt.Sprintf("Model: %s\n", h.evalFile.Model)) + h.cfg.WriteToOut(fmt.Sprintf("Test cases: %d\n", len(h.evalFile.TestData))) + h.cfg.WriteToOut("\n") + + passedTests := 0 + totalTests := len(h.evalFile.TestData) + + for i, testCase := range h.evalFile.TestData { + h.cfg.WriteToOut(fmt.Sprintf("Running test case %d/%d...\n", i+1, totalTests)) + + result, err := h.runTestCase(ctx, 
testCase) + if err != nil { + return fmt.Errorf("test case %d failed: %w", i+1, err) + } + + // Check if all evaluators passed + testPassed := true + for _, evalResult := range result.EvaluationResults { + if !evalResult.Passed { + testPassed = false + break + } + } + + if testPassed { + passedTests++ + h.cfg.WriteToOut(" ✓ PASSED\n") + } else { + h.cfg.WriteToOut(" ✗ FAILED\n") + // Show the first 100 characters of the model response when test fails + preview := result.ModelResponse + if len(preview) > 100 { + preview = preview[:100] + "..." + } + h.cfg.WriteToOut(fmt.Sprintf(" Model Response: %s\n", preview)) + } + + // Show evaluation details + for _, evalResult := range result.EvaluationResults { + status := "✓" + if !evalResult.Passed { + status = "✗" + } + h.cfg.WriteToOut(fmt.Sprintf(" %s %s (score: %.2f)\n", + status, evalResult.EvaluatorName, evalResult.Score)) + if evalResult.Details != "" { + h.cfg.WriteToOut(fmt.Sprintf(" %s\n", evalResult.Details)) + } + } + h.cfg.WriteToOut("\n") + } + + // Summary + h.cfg.WriteToOut("Evaluation Summary:\n") + if totalTests == 0 { + h.cfg.WriteToOut("Passed: 0/0 (0.0%)\n") + } else { + h.cfg.WriteToOut(fmt.Sprintf("Passed: %d/%d (%.1f%%)\n", + passedTests, totalTests, float64(passedTests)/float64(totalTests)*100)) + } + + if passedTests == totalTests { + h.cfg.WriteToOut("🎉 All tests passed!\n") + } else { + h.cfg.WriteToOut("❌ Some tests failed.\n") + } + + return nil +} + +func (h *evalCommandHandler) runTestCase(ctx context.Context, testCase map[string]interface{}) (TestResult, error) { + // Template the messages with test case data + messages, err := h.templateMessages(testCase) + if err != nil { + return TestResult{}, fmt.Errorf("failed to template messages: %w", err) + } + + // Call the model + response, err := h.callModel(ctx, messages) + if err != nil { + return TestResult{}, fmt.Errorf("failed to call model: %w", err) + } + + // Run evaluators + evalResults, err := h.runEvaluators(ctx, testCase, response) + 
if err != nil { + return TestResult{}, fmt.Errorf("failed to run evaluators: %w", err) + } + + return TestResult{ + TestCase: testCase, + ModelResponse: response, + EvaluationResults: evalResults, + }, nil +} + +func (h *evalCommandHandler) templateMessages(testCase map[string]interface{}) ([]azuremodels.ChatMessage, error) { + var messages []azuremodels.ChatMessage + + for _, msg := range h.evalFile.Messages { + content, err := h.templateString(msg.Content, testCase) + if err != nil { + return nil, fmt.Errorf("failed to template message content: %w", err) + } + + var role azuremodels.ChatMessageRole + switch strings.ToLower(msg.Role) { + case "system": + role = azuremodels.ChatMessageRoleSystem + case "user": + role = azuremodels.ChatMessageRoleUser + case "assistant": + role = azuremodels.ChatMessageRoleAssistant + default: + return nil, fmt.Errorf("unknown message role: %s", msg.Role) + } + + messages = append(messages, azuremodels.ChatMessage{ + Role: role, + Content: util.Ptr(content), + }) + } + + return messages, nil +} + +func (h *evalCommandHandler) templateString(templateStr string, data map[string]interface{}) (string, error) { + return prompt.TemplateString(templateStr, data) +} + +func (h *evalCommandHandler) callModel(ctx context.Context, messages []azuremodels.ChatMessage) (string, error) { + req := azuremodels.ChatCompletionOptions{ + Messages: messages, + Model: h.evalFile.Model, + Stream: false, + } + + // Apply model parameters + if h.evalFile.ModelParameters.MaxTokens != nil { + req.MaxTokens = h.evalFile.ModelParameters.MaxTokens + } + if h.evalFile.ModelParameters.Temperature != nil { + req.Temperature = h.evalFile.ModelParameters.Temperature + } + if h.evalFile.ModelParameters.TopP != nil { + req.TopP = h.evalFile.ModelParameters.TopP + } + + resp, err := h.client.GetChatCompletionStream(ctx, req) + if err != nil { + return "", err + } + + // For non-streaming requests, we should get a single response + var content strings.Builder + for { + 
completion, err := resp.Reader.Read() + if err != nil { + if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "EOF") { + break + } + return "", err + } + + for _, choice := range completion.Choices { + if choice.Delta != nil && choice.Delta.Content != nil { + content.WriteString(*choice.Delta.Content) + } + if choice.Message != nil && choice.Message.Content != nil { + content.WriteString(*choice.Message.Content) + } + } + } + + return strings.TrimSpace(content.String()), nil +} + +func (h *evalCommandHandler) runEvaluators(ctx context.Context, testCase map[string]interface{}, response string) ([]EvaluationResult, error) { + var results []EvaluationResult + + for _, evaluator := range h.evalFile.Evaluators { + result, err := h.runSingleEvaluator(ctx, evaluator, testCase, response) + if err != nil { + return nil, fmt.Errorf("evaluator %s failed: %w", evaluator.Name, err) + } + results = append(results, result) + } + + return results, nil +} + +func (h *evalCommandHandler) runSingleEvaluator(ctx context.Context, evaluator prompt.Evaluator, testCase map[string]interface{}, response string) (EvaluationResult, error) { + switch { + case evaluator.String != nil: + return h.runStringEvaluator(evaluator.Name, *evaluator.String, response) + case evaluator.LLM != nil: + return h.runLLMEvaluator(ctx, evaluator.Name, *evaluator.LLM, testCase, response) + case evaluator.Uses != "": + return h.runPluginEvaluator(ctx, evaluator.Name, evaluator.Uses, testCase, response) + default: + return EvaluationResult{}, fmt.Errorf("no evaluation method specified for evaluator %s", evaluator.Name) + } +} + +func (h *evalCommandHandler) runStringEvaluator(name string, eval prompt.StringEvaluator, response string) (EvaluationResult, error) { + var passed bool + var details string + + switch { + case eval.Equals != "": + passed = response == eval.Equals + details = fmt.Sprintf("Expected exact match: '%s'", eval.Equals) + case eval.Contains != "": + passed = 
strings.Contains(strings.ToLower(response), strings.ToLower(eval.Contains)) + details = fmt.Sprintf("Expected to contain: '%s'", eval.Contains) + case eval.StartsWith != "": + passed = strings.HasPrefix(strings.ToLower(response), strings.ToLower(eval.StartsWith)) + details = fmt.Sprintf("Expected to start with: '%s'", eval.StartsWith) + case eval.EndsWith != "": + passed = strings.HasSuffix(strings.ToLower(response), strings.ToLower(eval.EndsWith)) + details = fmt.Sprintf("Expected to end with: '%s'", eval.EndsWith) + default: + return EvaluationResult{}, errors.New("no string evaluation criteria specified") + } + + score := 0.0 + if passed { + score = 1.0 + } + + return EvaluationResult{ + EvaluatorName: name, + Score: score, + Passed: passed, + Details: details, + }, nil +} + +func (h *evalCommandHandler) runLLMEvaluator(ctx context.Context, name string, eval prompt.LLMEvaluator, testCase map[string]interface{}, response string) (EvaluationResult, error) { + // Template the evaluation prompt + evalData := make(map[string]interface{}) + for k, v := range testCase { + evalData[k] = v + } + evalData["completion"] = response + + promptContent, err := h.templateString(eval.Prompt, evalData) + if err != nil { + return EvaluationResult{}, fmt.Errorf("failed to template evaluation prompt: %w", err) + } + + // Prepare messages for evaluation + var messages []azuremodels.ChatMessage + if eval.SystemPrompt != "" { + messages = append(messages, azuremodels.ChatMessage{ + Role: azuremodels.ChatMessageRoleSystem, + Content: util.Ptr(eval.SystemPrompt), + }) + } + messages = append(messages, azuremodels.ChatMessage{ + Role: azuremodels.ChatMessageRoleUser, + Content: util.Ptr(promptContent), + }) + + // Call the evaluation model + req := azuremodels.ChatCompletionOptions{ + Messages: messages, + Model: eval.ModelID, + Stream: false, + } + + resp, err := h.client.GetChatCompletionStream(ctx, req) + if err != nil { + return EvaluationResult{}, fmt.Errorf("failed to call 
evaluation model: %w", err) + } + + var evalResponse strings.Builder + for { + completion, err := resp.Reader.Read() + if err != nil { + if errors.Is(err, context.Canceled) || strings.Contains(err.Error(), "EOF") { + break + } + return EvaluationResult{}, err + } + + for _, choice := range completion.Choices { + if choice.Delta != nil && choice.Delta.Content != nil { + evalResponse.WriteString(*choice.Delta.Content) + } + if choice.Message != nil && choice.Message.Content != nil { + evalResponse.WriteString(*choice.Message.Content) + } + } + } + + // Match response to choices + evalResponseText := strings.TrimSpace(strings.ToLower(evalResponse.String())) + for _, choice := range eval.Choices { + if strings.Contains(evalResponseText, strings.ToLower(choice.Choice)) { + return EvaluationResult{ + EvaluatorName: name, + Score: choice.Score, + Passed: choice.Score > 0, + Details: fmt.Sprintf("LLM evaluation matched choice: '%s'", choice.Choice), + }, nil + } + } + + // No match found + return EvaluationResult{ + EvaluatorName: name, + Score: 0.0, + Passed: false, + Details: fmt.Sprintf("LLM evaluation response '%s' did not match any defined choices", evalResponseText), + }, nil +} + +func (h *evalCommandHandler) runPluginEvaluator(ctx context.Context, name, plugin string, testCase map[string]interface{}, response string) (EvaluationResult, error) { + // Handle built-in evaluators like github/similarity, github/coherence, etc. 
+ if strings.HasPrefix(plugin, "github/") { + evaluatorName := strings.TrimPrefix(plugin, "github/") + if builtinEvaluator, exists := BuiltInEvaluators[evaluatorName]; exists { + return h.runLLMEvaluator(ctx, name, builtinEvaluator, testCase, response) + } + } + + return EvaluationResult{ + EvaluatorName: name, + Score: 0.0, + Passed: false, + Details: fmt.Sprintf("Plugin evaluator '%s' not found", plugin), + }, nil +} diff --git a/cmd/eval/eval_test.go b/cmd/eval/eval_test.go new file mode 100644 index 00000000..caca2d04 --- /dev/null +++ b/cmd/eval/eval_test.go @@ -0,0 +1,301 @@ +package eval + +import ( + "bytes" + "context" + "os" + "path/filepath" + "testing" + + "github.com/github/gh-models/internal/azuremodels" + "github.com/github/gh-models/internal/sse" + "github.com/github/gh-models/pkg/command" + "github.com/github/gh-models/pkg/prompt" + "github.com/stretchr/testify/require" +) + +func TestEval(t *testing.T) { + t.Run("loads and parses evaluation prompt file", func(t *testing.T) { + const yamlBody = ` +name: Test Evaluation +description: A test evaluation +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 100 +testData: + - input: "hello" + expected: "hello world" + - input: "goodbye" + expected: "goodbye world" +messages: + - role: system + content: You are a helpful assistant. 
+ - role: user + content: "Please respond to: {{input}}" +evaluators: + - name: contains-world + string: + contains: "world" + - name: similarity-check + uses: github/similarity +` + + tmpDir := t.TempDir() + promptFile := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFile, []byte(yamlBody), 0644) + require.NoError(t, err) + + evalFile, err := prompt.LoadFromFile(promptFile) + require.NoError(t, err) + require.Equal(t, "Test Evaluation", evalFile.Name) + require.Equal(t, "A test evaluation", evalFile.Description) + require.Equal(t, "openai/gpt-4o", evalFile.Model) + require.Equal(t, 0.5, *evalFile.ModelParameters.Temperature) + require.Equal(t, 100, *evalFile.ModelParameters.MaxTokens) + require.Len(t, evalFile.TestData, 2) + require.Len(t, evalFile.Messages, 2) + require.Len(t, evalFile.Evaluators, 2) + }) + + t.Run("templates messages correctly", func(t *testing.T) { + evalFile := &prompt.File{ + Messages: []prompt.Message{ + {Role: "system", Content: "You are helpful."}, + {Role: "user", Content: "Process {{input}} and return {{expected}}"}, + }, + } + + handler := &evalCommandHandler{evalFile: evalFile} + testCase := map[string]interface{}{ + "input": "hello", + "expected": "world", + } + + messages, err := handler.templateMessages(testCase) + require.NoError(t, err) + require.Len(t, messages, 2) + require.Equal(t, "You are helpful.", *messages[0].Content) + require.Equal(t, "Process hello and return world", *messages[1].Content) + }) + + t.Run("string evaluator works correctly", func(t *testing.T) { + handler := &evalCommandHandler{} + + tests := []struct { + name string + evaluator prompt.StringEvaluator + response string + expected bool + }{ + { + name: "contains match", + evaluator: prompt.StringEvaluator{Contains: "world"}, + response: "hello world", + expected: true, + }, + { + name: "contains no match", + evaluator: prompt.StringEvaluator{Contains: "world"}, + response: "hello there", + expected: false, + }, + { + name: "equals 
match", + evaluator: prompt.StringEvaluator{Equals: "exact"}, + response: "exact", + expected: true, + }, + { + name: "equals no match", + evaluator: prompt.StringEvaluator{Equals: "exact"}, + response: "not exact", + expected: false, + }, + { + name: "starts with match", + evaluator: prompt.StringEvaluator{StartsWith: "hello"}, + response: "hello world", + expected: true, + }, + { + name: "ends with match", + evaluator: prompt.StringEvaluator{EndsWith: "world"}, + response: "hello world", + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := handler.runStringEvaluator("test", tt.evaluator, tt.response) + require.NoError(t, err) + require.Equal(t, tt.expected, result.Passed) + if tt.expected { + require.Equal(t, 1.0, result.Score) + } else { + require.Equal(t, 0.0, result.Score) + } + }) + } + }) + + t.Run("plugin evaluator works with github/similarity", func(t *testing.T) { + out := new(bytes.Buffer) + client := azuremodels.NewMockClient() + cfg := command.NewConfig(out, out, client, true, 100) + + // Mock a response that returns "4" for the LLM evaluator + client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) { + reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{ + { + Choices: []azuremodels.ChatChoice{ + { + Message: &azuremodels.ChatChoiceMessage{ + Content: func() *string { s := "4"; return &s }(), + }, + }, + }, + }, + }) + return &azuremodels.ChatCompletionResponse{Reader: reader}, nil + } + + handler := &evalCommandHandler{ + cfg: cfg, + client: client, + } + testCase := map[string]interface{}{ + "input": "test question", + "expected": "test answer", + } + + result, err := handler.runPluginEvaluator(context.Background(), "similarity", "github/similarity", testCase, "test response") + require.NoError(t, err) + require.Equal(t, "similarity", result.EvaluatorName) + require.Equal(t, 0.75, 
result.Score) // Score for choice "4" + require.True(t, result.Passed) + }) + + t.Run("command creation works", func(t *testing.T) { + out := new(bytes.Buffer) + client := azuremodels.NewMockClient() + cfg := command.NewConfig(out, out, client, true, 100) + + cmd := NewEvalCommand(cfg) + require.Equal(t, "eval", cmd.Use) + require.Contains(t, cmd.Short, "Evaluate prompts") + }) + + t.Run("integration test with mock client", func(t *testing.T) { + const yamlBody = ` +name: Mock Test +description: Test with mock client +model: openai/test-model +testData: + - input: "test input" + expected: "test response" +messages: + - role: user + content: "{{input}}" +evaluators: + - name: contains-test + string: + contains: "test" +` + + tmpDir := t.TempDir() + promptFile := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFile, []byte(yamlBody), 0644) + require.NoError(t, err) + + client := azuremodels.NewMockClient() + + // Mock a simple response + client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) { + // Create a mock reader that returns "test response" + reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{ + { + Choices: []azuremodels.ChatChoice{ + { + Message: &azuremodels.ChatChoiceMessage{ + Content: func() *string { s := "test response"; return &s }(), + }, + }, + }, + }, + }) + return &azuremodels.ChatCompletionResponse{Reader: reader}, nil + } + + out := new(bytes.Buffer) + cfg := command.NewConfig(out, out, client, true, 100) + + cmd := NewEvalCommand(cfg) + cmd.SetArgs([]string{promptFile}) + + err = cmd.Execute() + require.NoError(t, err) + + output := out.String() + require.Contains(t, output, "Mock Test") + require.Contains(t, output, "Running test case") + require.Contains(t, output, "PASSED") + }) + + t.Run("logs model response when test fails", func(t *testing.T) { + const yamlBody = ` +name: Failing Test +description: Test that fails to 
check model response logging +model: openai/test-model +testData: + - input: "test input" + expected: "expected but not returned" +messages: + - role: user + content: "{{input}}" +evaluators: + - name: contains-nonexistent + string: + contains: "nonexistent text" +` + + tmpDir := t.TempDir() + promptFile := filepath.Join(tmpDir, "test.prompt.yml") + err := os.WriteFile(promptFile, []byte(yamlBody), 0644) + require.NoError(t, err) + + client := azuremodels.NewMockClient() + + // Mock a response that will fail the evaluator + client.MockGetChatCompletionStream = func(ctx context.Context, req azuremodels.ChatCompletionOptions) (*azuremodels.ChatCompletionResponse, error) { + reader := sse.NewMockEventReader([]azuremodels.ChatCompletion{ + { + Choices: []azuremodels.ChatChoice{ + { + Message: &azuremodels.ChatChoiceMessage{ + Content: func() *string { s := "actual model response"; return &s }(), + }, + }, + }, + }, + }) + return &azuremodels.ChatCompletionResponse{Reader: reader}, nil + } + + out := new(bytes.Buffer) + cfg := command.NewConfig(out, out, client, true, 100) + + cmd := NewEvalCommand(cfg) + cmd.SetArgs([]string{promptFile}) + + err = cmd.Execute() + require.NoError(t, err) + + output := out.String() + require.Contains(t, output, "Failing Test") + require.Contains(t, output, "Running test case") + require.Contains(t, output, "FAILED") + require.Contains(t, output, "Model Response: actual model response") + }) +} diff --git a/cmd/root.go b/cmd/root.go index 9e9b94ff..b27dd305 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -8,6 +8,7 @@ import ( "github.com/MakeNowJust/heredoc" "github.com/cli/go-gh/v2/pkg/auth" "github.com/cli/go-gh/v2/pkg/term" + "github.com/github/gh-models/cmd/eval" "github.com/github/gh-models/cmd/list" "github.com/github/gh-models/cmd/run" "github.com/github/gh-models/cmd/view" @@ -54,6 +55,7 @@ func NewRootCommand() *cobra.Command { cfg := command.NewConfigWithTerminal(terminal, client) + cmd.AddCommand(eval.NewEvalCommand(cfg)) 
cmd.AddCommand(list.NewListCommand(cfg)) cmd.AddCommand(run.NewRunCommand(cfg)) cmd.AddCommand(view.NewViewCommand(cfg)) diff --git a/cmd/root_test.go b/cmd/root_test.go index d05b1cdd..817701af 100644 --- a/cmd/root_test.go +++ b/cmd/root_test.go @@ -19,6 +19,7 @@ func TestRoot(t *testing.T) { require.NoError(t, err) output := buf.String() require.Regexp(t, regexp.MustCompile(`Usage:\n\s+gh models \[command\]`), output) + require.Regexp(t, regexp.MustCompile(`eval\s+Evaluate prompts using test data and evaluators`), output) require.Regexp(t, regexp.MustCompile(`list\s+List available models`), output) require.Regexp(t, regexp.MustCompile(`run\s+Run inference with the specified model`), output) require.Regexp(t, regexp.MustCompile(`view\s+View details about a model`), output) diff --git a/cmd/run/run.go b/cmd/run/run.go index 7a3a885c..fca6faad 100644 --- a/cmd/run/run.go +++ b/cmd/run/run.go @@ -18,10 +18,10 @@ import ( "github.com/github/gh-models/internal/azuremodels" "github.com/github/gh-models/internal/sse" "github.com/github/gh-models/pkg/command" + "github.com/github/gh-models/pkg/prompt" "github.com/github/gh-models/pkg/util" "github.com/spf13/cobra" "github.com/spf13/pflag" - "gopkg.in/yaml.v3" ) // ModelParameters represents the parameters that can be set for a model run. @@ -189,22 +189,6 @@ func isPipe(r io.Reader) bool { return false } -// promptFile mirrors the format of .prompt.yml -type promptFile struct { - Name string `yaml:"name"` - Description string `yaml:"description"` - Model string `yaml:"model"` - ModelParameters struct { - MaxTokens *int `yaml:"maxTokens"` - Temperature *float64 `yaml:"temperature"` - TopP *float64 `yaml:"topP"` - } `yaml:"modelParameters"` - Messages []struct { - Role string `yaml:"role"` - Content string `yaml:"content"` - } `yaml:"messages"` -} - // NewRunCommand returns a new gh command for running a model. 
func NewRunCommand(cfg *command.Config) *cobra.Command { cmd := &cobra.Command{ @@ -226,17 +210,13 @@ func NewRunCommand(cfg *command.Config) *cobra.Command { Args: cobra.ArbitraryArgs, RunE: func(cmd *cobra.Command, args []string) error { filePath, _ := cmd.Flags().GetString("file") - var pf *promptFile + var pf *prompt.File if filePath != "" { - b, err := os.ReadFile(filePath) + var err error + pf, err = prompt.LoadFromFile(filePath) if err != nil { return err } - p := promptFile{} - if err := yaml.Unmarshal(b, &p); err != nil { - return err - } - pf = &p // Inject model name as the first positional arg if user didn't supply one if pf.Model != "" && len(args) == 0 { args = append([]string{pf.Model}, args...) @@ -297,13 +277,21 @@ func NewRunCommand(cfg *command.Config) *cobra.Command { } else { interactiveMode = false + // Template the messages with the input + templateData := map[string]interface{}{ + "input": initialPrompt, + } + for _, m := range pf.Messages { - content := m.Content + content, err := prompt.TemplateString(m.Content, templateData) + if err != nil { + return err + } + switch strings.ToLower(m.Role) { case "system": conversation.systemPrompt = content case "user": - content = strings.ReplaceAll(content, "{{input}}", initialPrompt) conversation.AddMessage(azuremodels.ChatMessageRoleUser, content) case "assistant": conversation.AddMessage(azuremodels.ChatMessageRoleAssistant, content) diff --git a/fixtures/failing_test_prompt.yml b/fixtures/failing_test_prompt.yml new file mode 100644 index 00000000..652f599c --- /dev/null +++ b/fixtures/failing_test_prompt.yml @@ -0,0 +1,23 @@ +name: Failing Evaluation Test +description: Test that will fail to demonstrate model response logging +model: openai/gpt-4o +modelParameters: + temperature: 0.7 + maxTokens: 150 +testData: + - input: "What is the capital of France?" + expected: "Paris" + - input: "What is 2 + 2?" + expected: "4" +messages: + - role: system + content: You are a helpful assistant. 
+ - role: user + content: "{{input}}" +evaluators: + - name: contains-impossible + string: + contains: "this-text-will-never-appear-in-any-response" + - name: starts-with-wrong + string: + startsWith: "ZZZZZ" diff --git a/fixtures/sample_prompt.yml b/fixtures/sample_prompt.yml new file mode 100644 index 00000000..342b4c81 --- /dev/null +++ b/fixtures/sample_prompt.yml @@ -0,0 +1,22 @@ +name: Sample Evaluation +description: A sample evaluation for testing the eval command +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 50 +testData: + - input: 'hello world' + expected: 'greeting response' + - input: 'goodbye world' + expected: 'farewell response' +messages: + - role: system + content: You are a helpful assistant that responds to greetings and farewells. + - role: user + content: 'Please respond to this message appropriately: {{input}}' +evaluators: + - name: string evaluator + string: + contains: world + - name: similarity check + uses: github/similarity diff --git a/fixtures/test_builtins.yml b/fixtures/test_builtins.yml new file mode 100644 index 00000000..1e8717b2 --- /dev/null +++ b/fixtures/test_builtins.yml @@ -0,0 +1,25 @@ +name: Test Built-in Evaluators +description: Testing the new LLM-based built-in evaluators +model: openai/gpt-4o +modelParameters: + temperature: 0.5 + maxTokens: 100 +testData: + - input: 'What is photosynthesis?' + expected: 'Photosynthesis is the process by which plants convert sunlight into energy using chlorophyll, converting carbon dioxide and water into glucose and oxygen.' +messages: + - role: system + content: You are a helpful assistant that provides accurate scientific information. 
+ - role: user + content: '{{input}}' +evaluators: + - name: similarity test + uses: github/similarity + - name: coherence test + uses: github/coherence + - name: fluency test + uses: github/fluency + - name: relevance test + uses: github/relevance + - name: groundedness test + uses: github/groundedness diff --git a/fixtures/test_single_evaluator.yml b/fixtures/test_single_evaluator.yml new file mode 100644 index 00000000..34f2d414 --- /dev/null +++ b/fixtures/test_single_evaluator.yml @@ -0,0 +1,12 @@ +name: "Test Single Evaluator" +description: "Testing a single built-in evaluator" +model: "openai/gpt-4o" +testData: + - input: "What is machine learning?" + expected: "Machine learning is a subset of artificial intelligence that enables computers to learn and make decisions from data without being explicitly programmed." +messages: + - role: user + content: "{{input}}" +evaluators: + - name: "fluency-test" + uses: "github/fluency" diff --git a/pkg/prompt/prompt.go b/pkg/prompt/prompt.go new file mode 100644 index 00000000..f13d1968 --- /dev/null +++ b/pkg/prompt/prompt.go @@ -0,0 +1,113 @@ +// Package prompt provides shared types and utilities for working with .prompt.yml files +package prompt + +import ( + "fmt" + "os" + "strings" + + "gopkg.in/yaml.v3" +) + +// File represents the structure of a .prompt.yml file +type File struct { + Name string `yaml:"name"` + Description string `yaml:"description"` + Model string `yaml:"model"` + ModelParameters ModelParameters `yaml:"modelParameters"` + Messages []Message `yaml:"messages"` + // TestData and Evaluators are only used by eval command + TestData []map[string]interface{} `yaml:"testData,omitempty"` + Evaluators []Evaluator `yaml:"evaluators,omitempty"` +} + +// ModelParameters represents model configuration parameters +type ModelParameters struct { + MaxTokens *int `yaml:"maxTokens"` + Temperature *float64 `yaml:"temperature"` + TopP *float64 `yaml:"topP"` +} + +// Message represents a conversation message +type 
// TemplateString substitutes {{variable}} placeholders in templateStr with the
// matching values from data.
//
// data may be a map[string]interface{} or a map[string]string; any other type
// (including nil) leaves the template untouched. String values are inserted
// verbatim and all other values are formatted with fmt's %v verb. Placeholders
// that have no matching key are left in place.
//
// Substitution is done in a single pass over the template, so text inserted
// for one placeholder is never re-scanned for further placeholders. This makes
// the result independent of map iteration order and keeps data that happens
// to contain "{{...}}" from being expanded a second time.
//
// The returned error is always nil; it is part of the signature so callers
// can treat templating as a fallible step.
func TemplateString(templateStr string, data interface{}) (string, error) {
	// Flat list of (placeholder, replacement) pairs for strings.NewReplacer.
	var pairs []string

	switch d := data.(type) {
	case map[string]interface{}:
		pairs = make([]string, 0, 2*len(d))
		for key, value := range d {
			text, isString := value.(string)
			if !isString {
				// Convert non-string values to string
				text = fmt.Sprintf("%v", value)
			}
			pairs = append(pairs, "{{"+key+"}}", text)
		}
	case map[string]string:
		pairs = make([]string, 0, 2*len(d))
		for key, value := range d {
			pairs = append(pairs, "{{"+key+"}}", value)
		}
	default:
		// If it's not a map, we can't template it
		return templateStr, nil
	}

	if len(pairs) == 0 {
		return templateStr, nil
	}

	return strings.NewReplacer(pairs...).Replace(templateStr), nil
}
require.Equal(t, "Alice", promptFile.TestData[0]["name"]) + require.Equal(t, "Bob", promptFile.TestData[1]["name"]) + require.Len(t, promptFile.Evaluators, 1) + require.Equal(t, "contains-greeting", promptFile.Evaluators[0].Name) + require.Equal(t, "hello", promptFile.Evaluators[0].String.Contains) + }) + + t.Run("templates messages correctly", func(t *testing.T) { + testData := map[string]interface{}{ + "name": "World", + "age": 25, + } + + result, err := TemplateString("Hello {{name}}, you are {{age}} years old", testData) + require.NoError(t, err) + require.Equal(t, "Hello World, you are 25 years old", result) + }) + + t.Run("handles missing template variables", func(t *testing.T) { + testData := map[string]interface{}{ + "name": "World", + } + + result, err := TemplateString("Hello {{name}}, you are {{missing}} years old", testData) + require.NoError(t, err) + require.Equal(t, "Hello World, you are {{missing}} years old", result) + }) + + t.Run("handles file not found", func(t *testing.T) { + _, err := LoadFromFile("/nonexistent/file.yml") + require.Error(t, err) + }) + + t.Run("handles invalid YAML", func(t *testing.T) { + tmpDir := t.TempDir() + promptFilePath := filepath.Join(tmpDir, "invalid.prompt.yml") + err := os.WriteFile(promptFilePath, []byte("invalid: yaml: content: ["), 0644) + require.NoError(t, err) + + _, err = LoadFromFile(promptFilePath) + require.Error(t, err) + }) +} diff --git a/s.prompt.yml b/s.prompt.yml deleted file mode 100644 index b8b577f2..00000000 --- a/s.prompt.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: Text Summarizer -description: Summarizes input text concisely -model: openai/gpt-4o-mini -modelParameters: - temperature: 0.5 -messages: - - role: system - content: You are a text summarizer. Your only job is to summarize text given to you. - - role: user - content: | - Summarize the given text, beginning with "Summary -": - - {{input}} - \ No newline at end of file