From b2f782eb678195ca21da448ad8d5509476e32902 Mon Sep 17 00:00:00 2001 From: jaoppb Date: Tue, 28 Apr 2026 19:23:23 -0300 Subject: [PATCH 1/4] refactor: extract SubmissionGrader and move GraderService to dedicated package - Create autograder.services.grader package. - Relocate GraderService to services/grader/grader_service.py. - Extract stateful tree traversal logic into SubmissionGrader within services/grader/criteria_grader.py. - Update GraderService to act as a stateless coordinator delegating to SubmissionGrader. - Refactor unit tests to use SubmissionGrader and update internal imports. - Update project documentation to reflect structural changes. Co-authored-by: gemini-cli <218195315+gemini-cli@users.noreply.github.com> --- autograder/services/grader/__init__.py | 0 autograder/services/grader/criteria_grader.py | 181 +++++++++++ autograder/services/grader/grader_service.py | 52 +++ autograder/services/grader_service.py | 300 ------------------ autograder/steps/grade_step.py | 2 +- docs/features/command_resolver.md | 4 +- docs/features/grading_engine.md | 2 +- docs/guides/development.md | 2 +- docs/pipeline/04.5-ai-batch.md | 2 +- docs/pipeline/05-grade.md | 2 +- docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md | 6 +- tests/unit/pipeline/test_ai_batch_step.py | 2 +- .../test_command_resolution_at_build_time.py | 59 ++-- 13 files changed, 276 insertions(+), 338 deletions(-) create mode 100644 autograder/services/grader/__init__.py create mode 100644 autograder/services/grader/criteria_grader.py create mode 100644 autograder/services/grader/grader_service.py delete mode 100644 autograder/services/grader_service.py diff --git a/autograder/services/grader/__init__.py b/autograder/services/grader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/autograder/services/grader/criteria_grader.py b/autograder/services/grader/criteria_grader.py new file mode 100644 index 00000000..81cea902 --- /dev/null +++ b/autograder/services/grader/criteria_grader.py @@ -0,0 +1,181 @@ +import logging +from typing import Dict, Optional, Sequence, List, overload + +from autograder.models.abstract.criteria_tree_processer import CriteriaTreeProcesser +from autograder.models.criteria_tree import ( + CategoryNode, + SubjectNode, + TestNode, +) +from autograder.models.dataclass.submission import SubmissionFile +from autograder.models.dataclass.test_result import TestResult +from autograder.models.result_tree import ( + CategoryResultNode, + SubjectResultNode, + TestResultNode, +) +from autograder.services.command_resolver import CommandResolver + + +class SubmissionGrader(CriteriaTreeProcesser): + """ + Stateful grader responsible for traversing a criteria tree for a single submission. + Implements the CriteriaTreeProcesser interface. + """ + + def __init__( + self, + submission_files: Dict[str, SubmissionFile], + command_resolver: CommandResolver, + sandbox=None, + submission_language=None, + locale: str = "en", + pre_computed_results: Optional[Dict[str, TestResult]] = None, + structural_analysis=None, + ): + self.logger = logging.getLogger("SubmissionGrader") + self.submission_files = submission_files + self.command_resolver = command_resolver + self.sandbox = sandbox + self.submission_language = submission_language + self.locale = locale + self.pre_computed_results = pre_computed_results + self.structural_analysis = structural_analysis + + def __balance_nodes( + self, + nodes: Sequence[CategoryResultNode | SubjectResultNode | TestResultNode], + factor: float, + ) -> None: + """Balance the weights of sibling nodes to sum to 100, scaled by factor.""" + if len(nodes) == 0: + return + + total_weight = sum(node.weight for node in nodes) * factor + + if total_weight == 0: + equal_weight = 100.0 / len(nodes) + for node in nodes: + node.weight = equal_weight + elif total_weight != 100: + scale_factor = 100.0 / total_weight + for node in nodes: + node.weight *= scale_factor + + @overload + def __process_holder(self, holder: CategoryNode) -> CategoryResultNode: ... + + @overload + def __process_holder(self, holder: SubjectNode) -> SubjectResultNode: ... + + def __process_holder( + self, + holder: CategoryNode | SubjectNode, + ) -> CategoryResultNode | SubjectResultNode: + """Process a category or subject node and create corresponding result node.""" + + # Determine subjects and tests weight factors + if holder.subjects and holder.tests: + if not holder.subjects_weight: + raise ValueError(f"missing 'subjects_weight' for {holder.name}") + subjects_factor = holder.subjects_weight / 100.0 + tests_factor = 1 - subjects_factor + else: + subjects_factor = 1.0 + tests_factor = 1.0 + + # Process subjects + subject_results = [] + if holder.subjects: + subject_results = [ + self.process_subject(inner_subject) + for inner_subject in holder.subjects + ] + self.__balance_nodes(subject_results, subjects_factor) + + # Process tests + test_results = [] + if holder.tests: + test_results = [ + self.process_test(test) + for test in holder.tests + ] + self.__balance_nodes(test_results, tests_factor) + + # Create appropriate result node type + if isinstance(holder, CategoryNode): + return CategoryResultNode( + name=holder.name, + weight=holder.weight, + subjects_weight=holder.subjects_weight, + subjects=subject_results, + tests=test_results, + ) + return SubjectResultNode( + name=holder.name, + weight=holder.weight, + subjects_weight=holder.subjects_weight, + subjects=subject_results, + tests=test_results, + ) + + def process_subject(self, subject: SubjectNode) -> SubjectResultNode: + """Process a subject node from criteria tree and create result node.""" + return self.__process_holder(subject) + + def process_test(self, test: TestNode) -> TestResultNode: + """Execute a test and create a test result node.""" + file_target = self.get_file_target(test) + + # Shallow-copy parameters so we don't mutate the original TestNode. + test_params = dict(test.parameters or {}) + + # Resolve program_command eagerly when the language is known. + if self.submission_language and 'program_command' in test_params: + raw_command = test_params['program_command'] + resolved = self.command_resolver.resolve_command( + raw_command, self.submission_language + ) + test_params['program_command'] = resolved + + # Inject the actual submission language so tests like forbidden_import + # always operate on the real language rather than a config-time guess. + if self.submission_language and 'submission_language' in test_params: + test_params['submission_language'] = self.submission_language + + test_result = test.test_function.execute( + files=file_target, + sandbox=self.sandbox, + locale=self.locale, + pre_computed_results=self.pre_computed_results, + structural_analysis=self.structural_analysis, + submission_language=self.submission_language, + **test_params, + ) + return TestResultNode( + name=test.name, + test_node=test, + score=test_result.score, + report=test_result.report, + parameters=test_result.parameters, + weight=test.weight, + ) + + def get_file_target(self, test_node: TestNode) -> Optional[List[SubmissionFile]]: + """Filter out the submission files strictly relevant to the current test node.""" + if not self.submission_files: + return None + + if not test_node.file_target or test_node.file_target == ["all"]: + return list(self.submission_files.values()) + + target_files = [] + for file_name in self.submission_files: + if file_name in test_node.file_target: + target_files.append(self.submission_files[file_name]) + + return target_files + + def process_category(self, category: CategoryNode) -> CategoryResultNode: + """Process a category node from criteria tree and create result node.""" + return self.__process_holder(category) diff --git a/autograder/services/grader/grader_service.py b/autograder/services/grader/grader_service.py new file mode 100644 index 00000000..3577ba90 --- /dev/null +++ b/autograder/services/grader/grader_service.py @@ -0,0 +1,52 @@ +import logging +from typing import Dict, Optional + +from autograder.models.criteria_tree import CriteriaTree +from autograder.models.dataclass.submission import SubmissionFile +from autograder.models.dataclass.test_result import TestResult +from autograder.models.result_tree import ( + ResultTree, + RootResultNode, +) +from autograder.services.command_resolver import CommandResolver +from .criteria_grader import SubmissionGrader + + +class GraderService: + """Service responsible for orchestrating the grading process using a configured criteria tree.""" + + def __init__(self): + self.logger = logging.getLogger("GraderService") + self._command_resolver = CommandResolver() + + def grade_from_tree( + self, + criteria_tree: CriteriaTree, + submission_files: Dict[str, SubmissionFile], + sandbox=None, + submission_language=None, + locale: str = "en", + pre_computed_results: Optional[Dict[str, TestResult]] = None, + structural_analysis=None, + ) -> ResultTree: + """Traverse the generic built criteria tree to resolve inputs, grades and report to ResultTree.""" + grader = SubmissionGrader( + submission_files=submission_files, + command_resolver=self._command_resolver, + sandbox=sandbox, + submission_language=submission_language, + locale=locale, + pre_computed_results=pre_computed_results, + structural_analysis=structural_analysis, + ) + + base_result = grader.process_category(criteria_tree.base) + root = RootResultNode(name="root", base=base_result) + + if criteria_tree.bonus: + root.bonus = grader.process_category(criteria_tree.bonus) + + if criteria_tree.penalty: + root.penalty = grader.process_category(criteria_tree.penalty) + + return ResultTree(root) diff --git a/autograder/services/grader_service.py b/autograder/services/grader_service.py deleted file mode 100644 index 78de48a9..00000000 --- a/autograder/services/grader_service.py +++ /dev/null @@ -1,300 +0,0 @@ -import logging - -from typing import Dict, Optional, Sequence, List, overload -from autograder.services.command_resolver import CommandResolver -from autograder.models.criteria_tree import ( - CriteriaTree, - CategoryNode, - SubjectNode, - TestNode, -) -from autograder.models.dataclass.submission import SubmissionFile -from autograder.models.dataclass.test_result import TestResult -from autograder.models.result_tree import ( - ResultTree, - RootResultNode, - CategoryResultNode, - SubjectResultNode, - TestResultNode, -) - - -class GraderService: - """Service responsible for orchestrating the grading process using a configured criteria tree.""" - - def __init__(self): - self.logger = logging.getLogger("GraderService") - self._command_resolver = CommandResolver() - - def grade_from_tree( - self, - criteria_tree: CriteriaTree, - submission_files: Dict[str, SubmissionFile], - sandbox=None, - submission_language=None, - locale: str = "en", - pre_computed_results: Optional[Dict[str, TestResult]] = None, - structural_analysis=None, - ) -> ResultTree: - """Traverse the generic built criteria tree to resolve inputs, grades and report to ResultTree.""" - base_result = self.process_category( - criteria_tree.base, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) - root = RootResultNode(name="root", base=base_result) - - if criteria_tree.bonus: - bonus_result = self.process_category( - criteria_tree.bonus, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) - root.bonus = bonus_result - - if criteria_tree.penalty: - penalty_result = self.process_category( - criteria_tree.penalty, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) - root.penalty = penalty_result - - return ResultTree(root) - - def __balance_nodes( - self, - nodes: Sequence[CategoryResultNode | SubjectResultNode | TestResultNode], - factor: float, - ) -> None: - """Balance the weights of sibling nodes to sum to 100, scaled by factor.""" - if len(nodes) == 0: - return - - total_weight = sum(node.weight for node in nodes) * factor - - if total_weight == 0: - equal_weight = 100.0 / len(nodes) - for node in nodes: - node.weight = equal_weight - elif total_weight != 100: - scale_factor = 100.0 / total_weight - for node in nodes: - node.weight *= scale_factor - - @overload - def __process_holder(self, holder: CategoryNode) -> CategoryResultNode: ... - - @overload - def __process_holder(self, holder: SubjectNode) -> SubjectResultNode: ... - - def __process_holder( - self, - holder: CategoryNode | SubjectNode, - submission_files: Dict[str, SubmissionFile], - sandbox=None, - submission_language=None, - locale: str = "en", - pre_computed_results: Optional[Dict[str, TestResult]] = None, - structural_analysis=None, - ) -> CategoryResultNode | SubjectResultNode: - """Process a category or subject node and create corresponding result node.""" - - # Determine subjects and tests weight factors - if holder.subjects and holder.tests: - if not holder.subjects_weight: - raise ValueError(f"missing 'subjects_weight' for {holder.name}") - subjects_factor = holder.subjects_weight / 100.0 - tests_factor = 1 - subjects_factor - else: - subjects_factor = 1.0 - tests_factor = 1.0 - - # Process subjects - subject_results = [] - if holder.subjects: - subject_results = [ - self.process_subject( - inner_subject, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) - for inner_subject in holder.subjects - ] - self.__balance_nodes(subject_results, subjects_factor) - - # Process tests - test_results = [] - if holder.tests: - test_results = [ - self.process_test( - test, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) - for test in holder.tests - ] - self.__balance_nodes(test_results, tests_factor) - - # Create appropriate result node type - if isinstance(holder, CategoryNode): - return CategoryResultNode( - name=holder.name, - weight=holder.weight, - subjects_weight=holder.subjects_weight, - subjects=subject_results, - tests=test_results, - ) - return SubjectResultNode( - name=holder.name, - weight=holder.weight, - subjects_weight=holder.subjects_weight, - subjects=subject_results, - tests=test_results, - ) - - def process_subject( - self, - subject: SubjectNode, - submission_files: Dict[str, SubmissionFile], - sandbox=None, - submission_language=None, - locale: str = "en", - pre_computed_results: Optional[Dict[str, TestResult]] = None, - structural_analysis=None, - ) -> SubjectResultNode: - """Process a subject node from criteria tree and create result node.""" - return self.__process_holder( - subject, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) - - def process_test( - self, - test: TestNode, - submission_files: Optional[Dict[str, SubmissionFile]] = None, - sandbox=None, - submission_language=None, - locale: str = "en", - pre_computed_results: Optional[Dict[str, TestResult]] = None, - structural_analysis=None, - ) -> TestResultNode: - """Execute a test and create a test result node. - - Resolves `program_command` in-place before calling execute() so that - test functions always receive a finalized string command. Every key in - ``test_params`` is a declared parameter of the test function. - - ``pre_computed_results`` is intentionally forwarded to - ``test_function.execute()`` as an extra kwarg so that - :class:`~autograder.models.abstract.ai_test_function.AiTestFunction` - implementations can return the pre-computed result without a further API call. - Regular test functions that do not declare this parameter must accept - ``**kwargs`` or ignore it. - """ - file_target = self.get_file_target(test, submission_files=submission_files) - - # Shallow-copy parameters so we don't mutate the original TestNode. - test_params = dict(test.parameters or {}) - - # Resolve program_command eagerly when the language is known. - if submission_language and 'program_command' in test_params: - raw_command = test_params['program_command'] - resolved = self._command_resolver.resolve_command( - raw_command, submission_language - ) - test_params['program_command'] = resolved - - # Inject the actual submission language so tests like forbidden_import - # always operate on the real language rather than a config-time guess. - if submission_language and 'submission_language' in test_params: - test_params['submission_language'] = submission_language - - test_result = test.test_function.execute( - files=file_target, - sandbox=sandbox, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - submission_language=submission_language, - **test_params, - ) - return TestResultNode( - name=test.name, - test_node=test, - score=test_result.score, - report=test_result.report, - parameters=test_result.parameters, - weight=test.weight, - ) - - - def get_file_target( - self, - test_node: TestNode, - submission_files: Optional[Dict[str, SubmissionFile]], - ) -> Optional[List[SubmissionFile]]: - """Filter out the submission files strictly relevant to the current test node. - - When no file_target is specified, returns all submission files so that - tests like forbidden_import can scan the actual submitted file regardless - of language. - """ - if not submission_files: - return None - - if not test_node.file_target or test_node.file_target == ["all"]: - return list(submission_files.values()) - - target_files = [] - for file_name in submission_files: - if file_name in test_node.file_target: - target_files.append(submission_files[file_name]) - - return target_files - - def process_category( - self, - category: CategoryNode, - submission_files: Dict[str, SubmissionFile], - sandbox=None, - submission_language=None, - locale: str = "en", - pre_computed_results: Optional[Dict[str, TestResult]] = None, - structural_analysis=None, - ) -> CategoryResultNode: - """Process a category node from criteria tree and create result node.""" - return self.__process_holder( - category, - submission_files=submission_files, - sandbox=sandbox, - submission_language=submission_language, - locale=locale, - pre_computed_results=pre_computed_results, - structural_analysis=structural_analysis, - ) diff --git a/autograder/steps/grade_step.py b/autograder/steps/grade_step.py index 683b14a7..1fce7bae 100644 --- a/autograder/steps/grade_step.py +++ b/autograder/steps/grade_step.py @@ -4,7 +4,7 @@ from autograder.models.pipeline_execution import PipelineExecution from autograder.models.dataclass.step_result import StepResult, StepStatus, StepName from autograder.models.abstract.step import Step -from autograder.services.grader_service import GraderService +from autograder.services.grader.grader_service import GraderService logger = logging.getLogger(__name__) diff --git a/docs/features/command_resolver.md b/docs/features/command_resolver.md index b2228f9c..286398a2 100644 --- a/docs/features/command_resolver.md +++ b/docs/features/command_resolver.md @@ -86,11 +86,11 @@ When a student submits code, it enters an execution pipeline managed by the Auto 1. **Language Association:** The API or Web layer identifies the submitted language and sets it in the overarching `PipelineExecution`. 2. **Configuration Load:** The pipeline loads the standard criteria (e.g., weight, number of tests, configurations like `setup_config` to compile `.java` or `.cpp`). -3. **Execution Routing in `GraderService`:** When evaluating an individual test node, `GraderService` discovers that `program_command` isn't an exact literal. It hands over the raw value (Dict or `"CMD"`) alongside the current `SubmissionLanguage` to the `CommandResolver`. +3. **Execution Routing in `SubmissionGrader`:** When evaluating an individual test node, `SubmissionGrader` discovers that `program_command` isn't an exact literal. It hands over the raw value (Dict or `"CMD"`) alongside the current `SubmissionLanguage` to the `CommandResolver`. 4. **Processing by `CommandResolver`:** - If the value is a dictionary, it looks up the specific key (e.g., `java`) and returns its explicit command string (`java Calculator`). - If the value is `"CMD"`, it assesses the submitted files or fallback defaults to return the right execution instruction. -5. **Sandbox Hand-off:** After finding the appropriate string, the `GraderService` forwards it into the configured Docker container (the sandbox). Since the sandbox matches the submission language context, it spins up and correctly evaluates `java Calculator`, oblivious to the fact that the assignment originally supported Python or Node submissions. +5. **Sandbox Hand-off:** After finding the appropriate string, the `SubmissionGrader` forwards it into the configured Docker container (the sandbox). Since the sandbox matches the submission language context, it spins up and correctly evaluates `java Calculator`, oblivious to the fact that the assignment originally supported Python or Node submissions. --- diff --git a/docs/features/grading_engine.md b/docs/features/grading_engine.md index 403235ab..8bcf0e31 100644 --- a/docs/features/grading_engine.md +++ b/docs/features/grading_engine.md @@ -210,7 +210,7 @@ The `ResultTree` also provides utility methods: | File | Contents | |------|----------| -| `autograder/services/grader_service.py` | `GraderService` — tree traversal, test execution, weight balancing | +| `autograder/services/grader/grader_service.py` | `GraderService` — coordinator for the grading process |\n| `autograder/services/grader/criteria_grader.py` | `SubmissionGrader` — stateful tree traversal, test execution, weight balancing | | `autograder/models/result_tree.py` | `ResultTree`, `RootResultNode`, `CategoryResultNode`, `SubjectResultNode`, `TestResultNode` | | `autograder/models/criteria_tree.py` | `CriteriaTree`, `CategoryNode`, `SubjectNode`, `TestNode` | | `autograder/models/dataclass/grade_step_result.py` | `GradeStepResult` — wrapper for final score + result tree | diff --git a/docs/guides/development.md b/docs/guides/development.md index 3697f4ee..d2067af6 100644 --- a/docs/guides/development.md +++ b/docs/guides/development.md @@ -60,7 +60,7 @@ autograder/ │ │ ├── result_tree.py │ │ └── pipeline_execution.py │ ├── services/ # Business logic -│ │ ├── grader_service.py # Main grading engine +│ │ ├── grader/ # Grading engine\n│ │ │ ├── grader_service.py # Coordinator\n│ │ │ └── criteria_grader.py # Stateful processor │ │ ├── focus_service.py # Focus-based feedback │ │ ├── criteria_tree_service.py # Tree construction │ │ └── template_library_service.py diff --git a/docs/pipeline/04.5-ai-batch.md b/docs/pipeline/04.5-ai-batch.md index 7731576d..525f7cfb 100644 --- a/docs/pipeline/04.5-ai-batch.md +++ b/docs/pipeline/04.5-ai-batch.md @@ -36,7 +36,7 @@ If no `AiTestFunction` instances are found in the criteria tree, the step exits ## How It Integrates with Grade -The `GradeStep` reads the `AI_BATCH` step result from the pipeline and passes it as `pre_computed_results` to `GraderService.grade_from_tree()`. The grader threads this dict through the tree traversal into every `process_test()` call, which forwards it as a kwarg to `test_function.execute()`. +The `GradeStep` reads the `AI_BATCH` step result from the pipeline and passes it as `pre_computed_results` to `GraderService.grade_from_tree()`. The `SubmissionGrader` threads this dict through the tree traversal into every `process_test()` call, which forwards it as a kwarg to `test_function.execute()`. `AiTestFunction.execute()` sees the `pre_computed_results` kwarg, looks up its own `name` in the dict, and returns the pre-computed `TestResult` directly — no further API call, no mutation. diff --git a/docs/pipeline/05-grade.md b/docs/pipeline/05-grade.md index 72712990..2669e5d3 100644 --- a/docs/pipeline/05-grade.md +++ b/docs/pipeline/05-grade.md @@ -75,7 +75,7 @@ Once the results are in, the pipeline proceeds to **[Step 6: Focus](06-focus.md) `autograder/steps/grade_step.py` → `GradeStep` -`autograder/services/grader_service.py` → `GraderService` +`autograder/services/grader/grader_service.py` → `GraderService` `autograder/models/result_tree.py` → `ResultTree`, `RootResultNode`, `CategoryResultNode`, `SubjectResultNode`, `TestResultNode` diff --git a/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md b/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md index 95f42e5b..80703fc4 100644 --- a/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md +++ b/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md @@ -79,7 +79,7 @@ These items address structural problems where components are tightly coupled, re #### Item 6: Decouple `GraderService` from Mutable State -- **File:** `autograder/services/grader_service.py` +- **File:** `autograder/services/grader/grader_service.py` - **Problem:** `GraderService` is instantiated once per `GradeStep` but accumulates mutable state through setter methods: `set_sandbox()`, `set_submission_language()`, and internal `__submission_files`. This makes the service stateful and order-dependent — callers must remember to call setters before `grade_from_tree()`. It also means the service cannot be safely reused across concurrent submissions. - **Impact:** The setter pattern (`set_sandbox`, `set_submission_language`) is a code smell that indicates these values should be parameters, not instance state. It also creates a hidden coupling: `GradeStep.execute()` must know the exact sequence of setter calls before invoking grading. - **Action:** @@ -89,7 +89,7 @@ These items address structural problems where components are tightly coupled, re #### Item 7: Extract Language Resolution from Test Execution -- **Files:** `autograder/services/grader_service.py`, `autograder/template_library/input_output.py`, `autograder/services/command_resolver.py` +- **Files:** `autograder/services/grader/grader_service.py`, `autograder/template_library/input_output.py`, `autograder/services/command_resolver.py` - **Problem:** The `GraderService.process_test()` method injects `__submission_language__` as a hidden parameter into every test's kwargs dict. The `ExpectOutputTest` and `DontFailTest` in `input_output.py` then extract this hidden parameter to resolve the `program_command` via `CommandResolver`. This creates an implicit contract: tests must know to look for a magic key in their kwargs, and the grader must know to inject it. - **Impact:** The `__submission_language__` convention is undocumented, fragile, and invisible to anyone reading the `TestFunction.execute()` signature. It also means command resolution logic is scattered: `CommandResolver` lives in `services/`, but it's invoked inside individual test functions rather than at the pipeline level. - **Action:** @@ -259,7 +259,7 @@ These items prepare the codebase for future growth by establishing patterns and #### Item 21: Decouple the `AiExecutor` Batch Pattern from Test Execution -- **Files:** `autograder/utils/executors/ai_executor.py`, `autograder/services/grader_service.py` +- **Files:** `autograder/utils/executors/ai_executor.py`, `autograder/services/grader/grader_service.py` - **Problem:** The `AiExecutor` implements a batch-send pattern where individual tests call `executor.add_test()` during tree traversal, and then `GraderService.grade_from_tree()` calls `executor.stop()` after all tests are processed to send a single batch request to OpenAI. This means: 1. `GraderService` must know about `AiExecutor` and check for it after grading (`if hasattr(test_func, "executor") and test_func.executor`). 2. Test functions that use AI don't actually execute during `execute()` — they register themselves and return an empty `TestResult` that gets populated later via `mapback()`. diff --git a/tests/unit/pipeline/test_ai_batch_step.py b/tests/unit/pipeline/test_ai_batch_step.py index 4356136b..57a5921d 100644 --- a/tests/unit/pipeline/test_ai_batch_step.py +++ b/tests/unit/pipeline/test_ai_batch_step.py @@ -27,7 +27,7 @@ from autograder.models.dataclass.test_result import TestResult from autograder.models.pipeline_execution import PipelineExecution from autograder.models.result_tree import CategoryResultNode, ResultTree, RootResultNode -from autograder.services.grader_service import GraderService +from autograder.services.grader.grader_service import GraderService from autograder.steps.ai_batch_step import AiBatchStep from autograder.steps.grade_step import GradeStep from autograder.utils.executors.ai_executor import AiExecutor, TestInput, TestOutput diff --git a/tests/unit/test_command_resolution_at_build_time.py b/tests/unit/test_command_resolution_at_build_time.py index b2fd503c..71b0230f 100644 --- a/tests/unit/test_command_resolution_at_build_time.py +++ b/tests/unit/test_command_resolution_at_build_time.py @@ -7,7 +7,8 @@ from typing import List -from autograder.services.grader_service import GraderService +from autograder.services.grader.criteria_grader import SubmissionGrader +from autograder.services.command_resolver import CommandResolver from autograder.models.criteria_tree import TestNode from autograder.models.abstract.test_function import TestFunction from autograder.models.dataclass.param_description import ParamDescription @@ -44,8 +45,12 @@ def execute(self, files, sandbox, *args, **kwargs): return TestResult(test_name=self.name, score=100.0, report="ok") -def _make_grader(language=None) -> GraderService: - return GraderService(), language +def _make_grader(language=None) -> SubmissionGrader: + return SubmissionGrader( + submission_files={}, + command_resolver=CommandResolver(), + submission_language=language + ) def _make_test_node(params: dict, fn: TestFunction = None) -> TestNode: @@ -63,7 +68,7 @@ class TestDictCommandResolutionInProcessTest: def test_dict_command_resolved_for_python(self): """Test proper dict resolution for Python language.""" - svc, language = _make_grader(Language.PYTHON) + svc = _make_grader(Language.PYTHON) fn = RecordingTestFunction() node = _make_test_node({ "program_command": { @@ -72,13 +77,13 @@ def test_dict_command_resolved_for_python(self): } }, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] == "python3 calc.py" def test_dict_command_resolved_for_java(self): """Test proper dict resolution for Java language.""" - svc, language = _make_grader(Language.JAVA) + svc = _make_grader(Language.JAVA) fn = RecordingTestFunction() node = _make_test_node({ "program_command": { @@ -87,13 +92,13 @@ def test_dict_command_resolved_for_java(self): } }, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] == "java Calc" def test_dict_command_missing_language_gives_none(self): """When language is not in the dict, resolved value is None.""" - svc, language = _make_grader(Language.NODE) + svc = _make_grader(Language.NODE) fn = RecordingTestFunction() node = _make_test_node({ "program_command": { @@ -102,7 +107,7 @@ def test_dict_command_missing_language_gives_none(self): } }, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] is None @@ -116,31 +121,31 @@ class TestCmdPlaceholderResolution: def test_cmd_resolved_for_python(self): """Test CMD placeholder resolves appropriately for Python.""" - svc, language = _make_grader(Language.PYTHON) + svc = _make_grader(Language.PYTHON) fn = RecordingTestFunction() node = _make_test_node({"program_command": "CMD"}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] == "python3 main.py" def test_cmd_resolved_for_java(self): """Test CMD placeholder resolves appropriately for Java.""" - svc, language = _make_grader(Language.JAVA) + svc = _make_grader(Language.JAVA) fn = RecordingTestFunction() node = _make_test_node({"program_command": "CMD"}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] == "java Main" def test_cmd_resolved_for_node(self): """Test CMD placeholder resolves appropriately for Node.js.""" - svc, language = _make_grader(Language.NODE) + svc = _make_grader(Language.NODE) fn = RecordingTestFunction() node = _make_test_node({"program_command": "CMD"}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] == "node index.js" @@ -154,11 +159,11 @@ class TestInvalidFormatHandled: def test_string_command_is_resolved(self): """Test non-CMD single strings are resolved directly.""" - svc, language = _make_grader(Language.PYTHON) + svc = _make_grader(Language.PYTHON) fn = RecordingTestFunction() node = _make_test_node({"program_command": "python3 my_script.py"}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) # Single strings are now supported and resolved directly assert fn.recorded_kwargs["program_command"] == "python3 my_script.py" @@ -173,23 +178,23 @@ class TestNoLanguageNoResolution: def test_dict_unchanged_when_no_language(self): """Without language, dict program_command is left as-is (resolver not called).""" - svc, language = _make_grader() # no language + svc = _make_grader() # no language fn = RecordingTestFunction() cmd_dict = {"python": "python3 calc.py"} node = _make_test_node({"program_command": cmd_dict}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) # Dict should reach execute() untouched — resolution was skipped assert fn.recorded_kwargs["program_command"] == cmd_dict def test_cmd_unchanged_when_no_language(self): """Without language, CMD placeholder is unchanged (resolver not called).""" - svc, language = _make_grader() + svc = _make_grader() fn = RecordingTestFunction() node = _make_test_node({"program_command": "CMD"}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert fn.recorded_kwargs["program_command"] == "CMD" @@ -203,33 +208,33 @@ class TestNoHiddenKwarg: def test_no_hidden_kwarg_with_language_set(self): """Even when language is set, __submission_language__ must not appear in execute().""" - svc, language = _make_grader(Language.PYTHON) + svc = _make_grader(Language.PYTHON) fn = RecordingTestFunction() node = _make_test_node({"program_command": "CMD"}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert "__submission_language__" not in fn.recorded_kwargs def test_no_hidden_kwarg_without_language(self): """Hidden kwarg shouldn't appear even if there is no language to configure.""" - svc, language = _make_grader() + svc = _make_grader() fn = RecordingTestFunction() node = _make_test_node({}, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert "__submission_language__" not in fn.recorded_kwargs def test_no_hidden_kwarg_with_dict_command(self): """Hidden kwarg shouldn't appear even if there is a dict correctly configured.""" - svc, language = _make_grader(Language.JAVA) + svc = _make_grader(Language.JAVA) fn = RecordingTestFunction() node = _make_test_node({ "program_command": {"python": "python3 a.py", "java": "java A"} }, fn) - svc.process_test(node, submission_language=language) + svc.process_test(node) assert "__submission_language__" not in fn.recorded_kwargs From fa6d2309cb82430b5152196a700c1eae21b70b17 Mon Sep 17 00:00:00 2001 From: jaoppb Date: Sat, 2 May 2026 09:47:45 -0300 Subject: [PATCH 2/4] fix: correct weight balancing logic in SubmissionGrader - Refactor __balance_nodes to explicitly scale weights to 100 * factor. - Improve clarity and robustness of sibling weight normalization. - Update documentation to reflect the corrected balancing algorithm. - Add unit tests for SubmissionGrader weight balancing. Co-authored-by: gemini-cli <218195315+gemini-cli@users.noreply.github.com> --- autograder/services/grader/criteria_grader.py | 13 +-- docs/features/grading_engine.md | 18 ++-- .../services/grader/test_submission_grader.py | 99 +++++++++++++++++++ 3 files changed, 116 insertions(+), 14 deletions(-) create mode 100644 tests/unit/services/grader/test_submission_grader.py diff --git a/autograder/services/grader/criteria_grader.py b/autograder/services/grader/criteria_grader.py index 81cea902..e1618888 100644 --- a/autograder/services/grader/criteria_grader.py +++ b/autograder/services/grader/criteria_grader.py @@ -47,18 +47,19 @@ def __balance_nodes( nodes: Sequence[CategoryResultNode | SubjectResultNode | TestResultNode], factor: float, ) -> None: - """Balance the weights of sibling nodes to sum to 100, scaled by factor.""" + """Balance the weights of sibling nodes to sum to a target total (100 * factor).""" if len(nodes) == 0: return - total_weight = sum(node.weight for node in nodes) * factor + target_total = 100.0 * factor + current_sum = sum(node.weight for node in nodes) - if total_weight == 0: - equal_weight = 100.0 / len(nodes) + if current_sum == 0: + equal_weight = target_total / len(nodes) for node in nodes: node.weight = equal_weight - elif total_weight != 100: - scale_factor = 100.0 / total_weight + else: + scale_factor = target_total / current_sum for node in nodes: node.weight *= scale_factor diff --git a/docs/features/grading_engine.md b/docs/features/grading_engine.md index 8bcf0e31..2ecb0b25 100644 --- a/docs/features/grading_engine.md +++ b/docs/features/grading_engine.md @@ -70,23 +70,25 @@ Weights determine how much each node contributes to its parent's score. The engi ### Sibling Balancing -The `__balance_nodes()` method normalizes weights: +The `__balance_nodes()` method normalizes weights to a target total based on a factor: ```python def __balance_nodes(self, nodes, factor): - total_weight = sum(node.weight for node in nodes) * factor - if total_weight == 0: - equal_weight = 100.0 / len(nodes) + target_total = 100.0 * factor + current_sum = sum(node.weight for node in nodes) + + if current_sum == 0: + equal_weight = target_total / len(nodes) for node in nodes: node.weight = equal_weight - elif total_weight != 100: - scale_factor = 100.0 / total_weight + else: + scale_factor = target_total / current_sum for node in nodes: node.weight *= scale_factor ``` -- If all weights are zero, they're distributed equally. -- If they don't sum to 100 (after applying the factor), they're scaled proportionally. +- If all weights are zero, they're distributed equally to sum to `100 * factor`. +- If they don't sum to the target total, they're scaled proportionally. ### Subject/Test Split diff --git a/tests/unit/services/grader/test_submission_grader.py b/tests/unit/services/grader/test_submission_grader.py new file mode 100644 index 00000000..020c182c --- /dev/null +++ b/tests/unit/services/grader/test_submission_grader.py @@ -0,0 +1,99 @@ + +import pytest +from unittest.mock import MagicMock +from autograder.services.grader.criteria_grader import SubmissionGrader +from autograder.models.criteria_tree import CategoryNode, SubjectNode, TestNode +from autograder.models.abstract.test_function import TestFunction +from autograder.models.dataclass.test_result import TestResult + +class MockTestFunction(TestFunction): + @property + def name(self) -> str: + return "mock_test" + @property + def description(self) -> str: + return "mock description" + @property + def parameter_description(self) -> list: + return [] + def execute(self, files=None, sandbox=None, **kwargs): + return TestResult(test_name="mock_test", score=kwargs.get('score', 100.0), report="OK") + +@pytest.fixture +def grader(): + command_resolver = MagicMock() + return SubmissionGrader( + submission_files={}, + command_resolver=command_resolver + ) + +def test_balance_nodes_only_subjects(grader): + # If only subjects, they should sum to 100 + s1 = SubjectNode(name="S1", weight=30) + s2 = SubjectNode(name="S2", weight=30) + cat = CategoryNode(name="base", weight=100, subjects=[s1, s2]) + + # We need to mock process_subject because it calls process_category/process_subject recursively + # Actually SubmissionGrader.process_category calls __process_holder which calls process_subject + + # Let's mock process_test and process_subject to avoid deep recursion for this test + grader.process_subject = MagicMock(side_effect=lambda s: MagicMock(weight=s.weight, score=100.0)) + + result = grader.process_category(cat) + + # Weights should be balanced to 50/50 because original sum was 60 + assert result.subjects[0].weight == 50.0 + assert result.subjects[1].weight == 50.0 + +def test_balance_nodes_with_subjects_weight(grader): + # Scenario: subjects_weight = 80 + # 1 subject, 1 test + tf = MockTestFunction() + s1 = SubjectNode(name="S1", weight=100) + t1 = TestNode(name="T1", test_function=tf, weight=100) + cat = CategoryNode(name="base", weight=100, subjects=[s1], tests=[t1], subjects_weight=80) + + grader.process_subject = MagicMock(side_effect=lambda s: MagicMock(weight=s.weight, score=100.0)) + # process_test is not mocked, it will execute the test + + result = grader.process_category(cat) + + # Subject should have weight 80, Test should have weight 20 + assert result.subjects[0].weight == pytest.approx(80.0) + assert result.tests[0].weight == pytest.approx(20.0) + + # Final score should be 100 if both are 100 + assert result.calculate_score() == pytest.approx(100.0) + +def test_balance_nodes_with_subjects_weight_and_scores(grader): + # Scenario: subjects_weight = 80 + # Subject score = 100, Test score = 0 + tf = MockTestFunction() + s1 = SubjectNode(name="S1", weight=100) + t1 = TestNode(name="T1", test_function=tf, weight=100, parameters={'score': 0.0}) + cat = CategoryNode(name="base", weight=100, subjects=[s1], tests=[t1], subjects_weight=80) + + # Mock process_subject to return a result node with score 100 + from autograder.models.result_tree import SubjectResultNode + grader.process_subject = MagicMock(return_value=SubjectResultNode(name="S1", weight=100, subjects_weight=None, score=100.0)) + + result = grader.process_category(cat) + + assert result.subjects[0].weight == pytest.approx(80.0) + assert result.tests[0].weight == pytest.approx(20.0) + + # Score = 100 * 0.8 + 0 * 0.2 = 80 + assert result.calculate_score() == pytest.approx(80.0) + +def test_balance_nodes_zero_weights(grader): + # If all weights are zero, they should be equal and sum to 100 * factor + s1 = SubjectNode(name="S1", weight=0) + s2 = SubjectNode(name="S2", weight=0) + cat = CategoryNode(name="base", weight=100, subjects=[s1, s2]) + + grader.process_subject = MagicMock(side_effect=lambda s: MagicMock(weight=s.weight, score=100.0)) + + result = grader.process_category(cat) + + assert result.subjects[0].weight == 50.0 + assert result.subjects[1].weight == 50.0 From f9e0af79ee59c7b1492d2cbdcc2b79fe33e27c57 Mon Sep 17 00:00:00 2001 From: Arthur Carvalho Date: Sat, 2 May 2026 16:01:38 -0300 Subject: [PATCH 3/4] Remove documentation roadmap link from index --- docs/index.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 64322c28..b5884204 100644 --- a/docs/index.md +++ b/docs/index.md @@ -47,4 +47,3 @@ This documentation is organized to be both practical for first-time users and de | Pipeline steps | [pipeline/README.md](pipeline/README.md) | | Features | [features/grading_engine.md](features/grading_engine.md) | | Development setup | [guides/development.md](guides/development.md) | -| Documentation roadmap | [roadmaps/MKDOCS_DOCUMENTATION_ROADMAP.md](roadmaps/MKDOCS_DOCUMENTATION_ROADMAP.md) | From e02463e74761f4dd02f9c3f66a7c466ba37f95f3 Mon Sep 17 00:00:00 2001 From: jaoppb Date: Mon, 4 May 2026 00:34:49 -0300 Subject: [PATCH 4/4] docs: removed non-existent docs from mkdocs.yml --- mkdocs.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 685785b8..17ff94fc 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,6 @@ site_name: Autograder Documentation -site_description: Educational-standards-driven autograding platform documentation +site_description: + Educational-standards-driven autograding platform documentation site_url: https://webtech-network.github.io/autograder/ repo_url: https://github.com/webtech-network/autograder repo_name: webtech-network/autograder @@ -86,6 +87,5 @@ nav: - Criteria Configuration Examples: guides/criteria_configuration_examples.md - GitHub Module (legacy): guides/github_module.md - Roadmaps: - - MkDocs Documentation Roadmap: roadmaps/MKDOCS_DOCUMENTATION_ROADMAP.md - Feature Roadmap: roadmaps/FEATURE_ROADMAP.md - Technical Debt Roadmap: roadmaps/TECHNICAL_DEBT_ROADMAP.md