From b2f782eb678195ca21da448ad8d5509476e32902 Mon Sep 17 00:00:00 2001
From: jaoppb <joaopedroperes06@gmail.com>
Date: Tue, 28 Apr 2026 19:23:23 -0300
Subject: [PATCH 1/4] refactor: extract SubmissionGrader and move GraderService
 to dedicated package

- Create autograder.services.grader package.
- Relocate GraderService to services/grader/grader_service.py.
- Extract stateful tree traversal logic into SubmissionGrader within services/grader/criteria_grader.py.
- Update GraderService to act as a stateless coordinator delegating to SubmissionGrader.
- Refactor unit tests to use SubmissionGrader and update internal imports.
- Update project documentation to reflect structural changes.

Co-authored-by: gemini-cli <218195315+gemini-cli@users.noreply.github.com>
---
 autograder/services/grader/__init__.py        |   0
 autograder/services/grader/criteria_grader.py | 181 +++++++++++
 autograder/services/grader/grader_service.py  |  52 +++
 autograder/services/grader_service.py         | 300 ------------------
 autograder/steps/grade_step.py                |   2 +-
 docs/features/command_resolver.md             |   4 +-
 docs/features/grading_engine.md               |   2 +-
 docs/guides/development.md                    |   2 +-
 docs/pipeline/04.5-ai-batch.md                |   2 +-
 docs/pipeline/05-grade.md                     |   2 +-
 docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md       |   6 +-
 tests/unit/pipeline/test_ai_batch_step.py     |   2 +-
 .../test_command_resolution_at_build_time.py  |  59 ++--
 13 files changed, 276 insertions(+), 338 deletions(-)
 create mode 100644 autograder/services/grader/__init__.py
 create mode 100644 autograder/services/grader/criteria_grader.py
 create mode 100644 autograder/services/grader/grader_service.py
 delete mode 100644 autograder/services/grader_service.py

diff --git a/autograder/services/grader/__init__.py b/autograder/services/grader/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/autograder/services/grader/criteria_grader.py b/autograder/services/grader/criteria_grader.py
new file mode 100644
index 00000000..81cea902
--- /dev/null
+++ b/autograder/services/grader/criteria_grader.py
@@ -0,0 +1,181 @@
+import logging
+from typing import Dict, Optional, Sequence, List, overload
+
+from autograder.models.abstract.criteria_tree_processer import CriteriaTreeProcesser
+from autograder.models.criteria_tree import (
+    CategoryNode,
+    SubjectNode,
+    TestNode,
+)
+from autograder.models.dataclass.submission import SubmissionFile
+from autograder.models.dataclass.test_result import TestResult
+from autograder.models.result_tree import (
+    CategoryResultNode,
+    SubjectResultNode,
+    TestResultNode,
+)
+from autograder.services.command_resolver import CommandResolver
+
+
+class SubmissionGrader(CriteriaTreeProcesser):
+    """
+    Stateful grader responsible for traversing a criteria tree for a single submission.
+    Implements the CriteriaTreeProcesser interface.
+    """
+
+    def __init__(
+        self,
+        submission_files: Dict[str, SubmissionFile],
+        command_resolver: CommandResolver,
+        sandbox=None,
+        submission_language=None,
+        locale: str = "en",
+        pre_computed_results: Optional[Dict[str, TestResult]] = None,
+        structural_analysis=None,
+    ):
+        self.logger = logging.getLogger("SubmissionGrader")
+        self.submission_files = submission_files
+        self.command_resolver = command_resolver
+        self.sandbox = sandbox
+        self.submission_language = submission_language
+        self.locale = locale
+        self.pre_computed_results = pre_computed_results
+        self.structural_analysis = structural_analysis
+
+    def __balance_nodes(
+        self,
+        nodes: Sequence[CategoryResultNode | SubjectResultNode | TestResultNode],
+        factor: float,
+    ) -> None:
+        """Balance the weights of sibling nodes to sum to 100, scaled by factor."""
+        if len(nodes) == 0:
+            return
+
+        total_weight = sum(node.weight for node in nodes) * factor
+
+        if total_weight == 0:
+            equal_weight = 100.0 / len(nodes)
+            for node in nodes:
+                node.weight = equal_weight
+        elif total_weight != 100:
+            scale_factor = 100.0 / total_weight
+            for node in nodes:
+                node.weight *= scale_factor
+
+    @overload
+    def __process_holder(self, holder: CategoryNode) -> CategoryResultNode: ...
+
+    @overload
+    def __process_holder(self, holder: SubjectNode) -> SubjectResultNode: ...
+
+    def __process_holder(
+        self,
+        holder: CategoryNode | SubjectNode,
+    ) -> CategoryResultNode | SubjectResultNode:
+        """Process a category or subject node and create corresponding result node."""
+
+        # Determine subjects and tests weight factors
+        if holder.subjects and holder.tests:
+            if not holder.subjects_weight:
+                raise ValueError(f"missing 'subjects_weight' for {holder.name}")
+            subjects_factor = holder.subjects_weight / 100.0
+            tests_factor = 1 - subjects_factor
+        else:
+            subjects_factor = 1.0
+            tests_factor = 1.0
+
+        # Process subjects
+        subject_results = []
+        if holder.subjects:
+            subject_results = [
+                self.process_subject(inner_subject)
+                for inner_subject in holder.subjects
+            ]
+            self.__balance_nodes(subject_results, subjects_factor)
+
+        # Process tests
+        test_results = []
+        if holder.tests:
+            test_results = [
+                self.process_test(test)
+                for test in holder.tests
+            ]
+            self.__balance_nodes(test_results, tests_factor)
+
+        # Create appropriate result node type
+        if isinstance(holder, CategoryNode):
+            return CategoryResultNode(
+                name=holder.name,
+                weight=holder.weight,
+                subjects_weight=holder.subjects_weight,
+                subjects=subject_results,
+                tests=test_results,
+            )
+        return SubjectResultNode(
+            name=holder.name,
+            weight=holder.weight,
+            subjects_weight=holder.subjects_weight,
+            subjects=subject_results,
+            tests=test_results,
+        )
+
+    def process_subject(self, subject: SubjectNode) -> SubjectResultNode:
+        """Process a subject node from criteria tree and create result node."""
+        return self.__process_holder(subject)
+
+    def process_test(self, test: TestNode) -> TestResultNode:
+        """Execute a test and create a test result node."""
+        file_target = self.get_file_target(test)
+
+        # Shallow-copy parameters so we don't mutate the original TestNode.
+        test_params = dict(test.parameters or {})
+
+        # Resolve program_command eagerly when the language is known.
+        if self.submission_language and 'program_command' in test_params:
+            raw_command = test_params['program_command']
+            resolved = self.command_resolver.resolve_command(
+                raw_command, self.submission_language
+            )
+            test_params['program_command'] = resolved
+
+        # Inject the actual submission language so tests like forbidden_import
+        # always operate on the real language rather than a config-time guess.
+        if self.submission_language and 'submission_language' in test_params:
+            test_params['submission_language'] = self.submission_language
+
+        test_result = test.test_function.execute(
+            files=file_target,
+            sandbox=self.sandbox,
+            locale=self.locale,
+            pre_computed_results=self.pre_computed_results,
+            structural_analysis=self.structural_analysis,
+            submission_language=self.submission_language,
+            **test_params,
+        )
+        return TestResultNode(
+            name=test.name,
+            test_node=test,
+            score=test_result.score,
+            report=test_result.report,
+            parameters=test_result.parameters,
+            weight=test.weight,
+        )
+
+    def get_file_target(self, test_node: TestNode) -> Optional[List[SubmissionFile]]:
+        """Filter out the submission files strictly relevant to the current test node."""
+        if not self.submission_files:
+            return None
+
+        if not test_node.file_target or test_node.file_target == ["all"]:
+            return list(self.submission_files.values())
+
+        target_files = []
+        for file_name in self.submission_files:
+            if file_name in test_node.file_target:
+                target_files.append(self.submission_files[file_name])
+
+        return target_files
+
+    def process_category(self, category: CategoryNode) -> CategoryResultNode:
+        """Process a category node from criteria tree and create result node."""
+        return self.__process_holder(category)
diff --git a/autograder/services/grader/grader_service.py b/autograder/services/grader/grader_service.py
new file mode 100644
index 00000000..3577ba90
--- /dev/null
+++ b/autograder/services/grader/grader_service.py
@@ -0,0 +1,52 @@
+import logging
+from typing import Dict, Optional
+
+from autograder.models.criteria_tree import CriteriaTree
+from autograder.models.dataclass.submission import SubmissionFile
+from autograder.models.dataclass.test_result import TestResult
+from autograder.models.result_tree import (
+    ResultTree,
+    RootResultNode,
+)
+from autograder.services.command_resolver import CommandResolver
+from .criteria_grader import SubmissionGrader
+
+
+class GraderService:
+    """Service responsible for orchestrating the grading process using a configured criteria tree."""
+
+    def __init__(self):
+        self.logger = logging.getLogger("GraderService")
+        self._command_resolver = CommandResolver()
+
+    def grade_from_tree(
+        self,
+        criteria_tree: CriteriaTree,
+        submission_files: Dict[str, SubmissionFile],
+        sandbox=None,
+        submission_language=None,
+        locale: str = "en",
+        pre_computed_results: Optional[Dict[str, TestResult]] = None,
+        structural_analysis=None,
+    ) -> ResultTree:
+        """Traverse the generic built criteria tree to resolve inputs, grades and report to ResultTree."""
+        grader = SubmissionGrader(
+            submission_files=submission_files,
+            command_resolver=self._command_resolver,
+            sandbox=sandbox,
+            submission_language=submission_language,
+            locale=locale,
+            pre_computed_results=pre_computed_results,
+            structural_analysis=structural_analysis,
+        )
+
+        base_result = grader.process_category(criteria_tree.base)
+        root = RootResultNode(name="root", base=base_result)
+
+        if criteria_tree.bonus:
+            root.bonus = grader.process_category(criteria_tree.bonus)
+
+        if criteria_tree.penalty:
+            root.penalty = grader.process_category(criteria_tree.penalty)
+
+        return ResultTree(root)
diff --git a/autograder/services/grader_service.py b/autograder/services/grader_service.py
deleted file mode 100644
index 78de48a9..00000000
--- a/autograder/services/grader_service.py
+++ /dev/null
@@ -1,300 +0,0 @@
-import logging
-
-from typing import Dict, Optional, Sequence, List, overload
-from autograder.services.command_resolver import CommandResolver
-from autograder.models.criteria_tree import (
-    CriteriaTree,
-    CategoryNode,
-    SubjectNode,
-    TestNode,
-)
-from autograder.models.dataclass.submission import SubmissionFile
-from autograder.models.dataclass.test_result import TestResult
-from autograder.models.result_tree import (
-    ResultTree,
-    RootResultNode,
-    CategoryResultNode,
-    SubjectResultNode,
-    TestResultNode,
-)
-
-
-class GraderService:
-    """Service responsible for orchestrating the grading process using a configured criteria tree."""
-
-    def __init__(self):
-        self.logger = logging.getLogger("GraderService")
-        self._command_resolver = CommandResolver()
-
-    def grade_from_tree(
-        self,
-        criteria_tree: CriteriaTree,
-        submission_files: Dict[str, SubmissionFile],
-        sandbox=None,
-        submission_language=None,
-        locale: str = "en",
-        pre_computed_results: Optional[Dict[str, TestResult]] = None,
-        structural_analysis=None,
-    ) -> ResultTree:
-        """Traverse the generic built criteria tree to resolve inputs, grades and report to ResultTree."""
-        base_result = self.process_category(
-            criteria_tree.base,
-            submission_files=submission_files,
-            sandbox=sandbox,
-            submission_language=submission_language,
-            locale=locale,
-            pre_computed_results=pre_computed_results,
-            structural_analysis=structural_analysis,
-        )
-        root = RootResultNode(name="root", base=base_result)
-
-        if criteria_tree.bonus:
-            bonus_result = self.process_category(
-                criteria_tree.bonus,
-                submission_files=submission_files,
-                sandbox=sandbox,
-                submission_language=submission_language,
-                locale=locale,
-                pre_computed_results=pre_computed_results,
-                structural_analysis=structural_analysis,
-            )
-            root.bonus = bonus_result
-
-        if criteria_tree.penalty:
-            penalty_result = self.process_category(
-                criteria_tree.penalty,
-                submission_files=submission_files,
-                sandbox=sandbox,
-                submission_language=submission_language,
-                locale=locale,
-                pre_computed_results=pre_computed_results,
-                structural_analysis=structural_analysis,
-            )
-            root.penalty = penalty_result
-
-        return ResultTree(root)
-
-    def __balance_nodes(
-        self,
-        nodes: Sequence[CategoryResultNode | SubjectResultNode | TestResultNode],
-        factor: float,
-    ) -> None:
-        """Balance the weights of sibling nodes to sum to 100, scaled by factor."""
-        if len(nodes) == 0:
-            return
-
-        total_weight = sum(node.weight for node in nodes) * factor
-
-        if total_weight == 0:
-            equal_weight = 100.0 / len(nodes)
-            for node in nodes:
-                node.weight = equal_weight
-        elif total_weight != 100:
-            scale_factor = 100.0 / total_weight
-            for node in nodes:
-                node.weight *= scale_factor
-
-    @overload
-    def __process_holder(self, holder: CategoryNode) -> CategoryResultNode: ...
-
-    @overload
-    def __process_holder(self, holder: SubjectNode) -> SubjectResultNode: ...
-
-    def __process_holder(
-        self,
-        holder: CategoryNode | SubjectNode,
-        submission_files: Dict[str, SubmissionFile],
-        sandbox=None,
-        submission_language=None,
-        locale: str = "en",
-        pre_computed_results: Optional[Dict[str, TestResult]] = None,
-        structural_analysis=None,
-    ) -> CategoryResultNode | SubjectResultNode:
-        """Process a category or subject node and create corresponding result node."""
-
-        # Determine subjects and tests weight factors
-        if holder.subjects and holder.tests:
-            if not holder.subjects_weight:
-                raise ValueError(f"missing 'subjects_weight' for {holder.name}")
-            subjects_factor = holder.subjects_weight / 100.0
-            tests_factor = 1 - subjects_factor
-        else:
-            subjects_factor = 1.0
-            tests_factor = 1.0
-
-        # Process subjects
-        subject_results = []
-        if holder.subjects:
-            subject_results = [
-                self.process_subject(
-                    inner_subject,
-                    submission_files=submission_files,
-                    sandbox=sandbox,
-                    submission_language=submission_language,
-                    locale=locale,
-                    pre_computed_results=pre_computed_results,
-                    structural_analysis=structural_analysis,
-                )
-                for inner_subject in holder.subjects
-            ]
-            self.__balance_nodes(subject_results, subjects_factor)
-
-        # Process tests
-        test_results = []
-        if holder.tests:
-            test_results = [
-                self.process_test(
-                    test,
-                    submission_files=submission_files,
-                    sandbox=sandbox,
-                    submission_language=submission_language,
-                    locale=locale,
-                    pre_computed_results=pre_computed_results,
-                    structural_analysis=structural_analysis,
-                )
-                for test in holder.tests
-            ]
-            self.__balance_nodes(test_results, tests_factor)
-
-        # Create appropriate result node type
-        if isinstance(holder, CategoryNode):
-            return CategoryResultNode(
-                name=holder.name,
-                weight=holder.weight,
-                subjects_weight=holder.subjects_weight,
-                subjects=subject_results,
-                tests=test_results,
-            )
-        return SubjectResultNode(
-            name=holder.name,
-            weight=holder.weight,
-            subjects_weight=holder.subjects_weight,
-            subjects=subject_results,
-            tests=test_results,
-        )
-
-    def process_subject(
-        self,
-        subject: SubjectNode,
-        submission_files: Dict[str, SubmissionFile],
-        sandbox=None,
-        submission_language=None,
-        locale: str = "en",
-        pre_computed_results: Optional[Dict[str, TestResult]] = None,
-        structural_analysis=None,
-    ) -> SubjectResultNode:
-        """Process a subject node from criteria tree and create result node."""
-        return self.__process_holder(
-            subject,
-            submission_files=submission_files,
-            sandbox=sandbox,
-            submission_language=submission_language,
-            locale=locale,
-            pre_computed_results=pre_computed_results,
-            structural_analysis=structural_analysis,
-        )
-
-    def process_test(
-        self,
-        test: TestNode,
-        submission_files: Optional[Dict[str, SubmissionFile]] = None,
-        sandbox=None,
-        submission_language=None,
-        locale: str = "en",
-        pre_computed_results: Optional[Dict[str, TestResult]] = None,
-        structural_analysis=None,
-    ) -> TestResultNode:
-        """Execute a test and create a test result node.
-
-        Resolves `program_command` in-place before calling execute() so that
-        test functions always receive a finalized string command.  Every key in
-        ``test_params`` is a declared parameter of the test function.
-
-        ``pre_computed_results`` is intentionally forwarded to
-        ``test_function.execute()`` as an extra kwarg so that
-        :class:`~autograder.models.abstract.ai_test_function.AiTestFunction`
-        implementations can return the pre-computed result without a further API call.
-        Regular test functions that do not declare this parameter must accept
-        ``**kwargs`` or ignore it.
-        """
-        file_target = self.get_file_target(test, submission_files=submission_files)
-
-        # Shallow-copy parameters so we don't mutate the original TestNode.
-        test_params = dict(test.parameters or {})
-
-        # Resolve program_command eagerly when the language is known.
-        if submission_language and 'program_command' in test_params:
-            raw_command = test_params['program_command']
-            resolved = self._command_resolver.resolve_command(
-                raw_command, submission_language
-            )
-            test_params['program_command'] = resolved
-
-        # Inject the actual submission language so tests like forbidden_import
-        # always operate on the real language rather than a config-time guess.
-        if submission_language and 'submission_language' in test_params:
-            test_params['submission_language'] = submission_language
-
-        test_result = test.test_function.execute(
-            files=file_target,
-            sandbox=sandbox,
-            locale=locale,
-            pre_computed_results=pre_computed_results,
-            structural_analysis=structural_analysis,
-            submission_language=submission_language,
-            **test_params,
-        )
-        return TestResultNode(
-            name=test.name,
-            test_node=test,
-            score=test_result.score,
-            report=test_result.report,
-            parameters=test_result.parameters,
-            weight=test.weight,
-        )
-
-
-    def get_file_target(
-        self,
-        test_node: TestNode,
-        submission_files: Optional[Dict[str, SubmissionFile]],
-    ) -> Optional[List[SubmissionFile]]:
-        """Filter out the submission files strictly relevant to the current test node.
-
-        When no file_target is specified, returns all submission files so that
-        tests like forbidden_import can scan the actual submitted file regardless
-        of language.
-        """
-        if not submission_files:
-            return None
-
-        if not test_node.file_target or test_node.file_target == ["all"]:
-            return list(submission_files.values())
-
-        target_files = []
-        for file_name in submission_files:
-            if file_name in test_node.file_target:
-                target_files.append(submission_files[file_name])
-
-        return target_files
-
-    def process_category(
-        self,
-        category: CategoryNode,
-        submission_files: Dict[str, SubmissionFile],
-        sandbox=None,
-        submission_language=None,
-        locale: str = "en",
-        pre_computed_results: Optional[Dict[str, TestResult]] = None,
-        structural_analysis=None,
-    ) -> CategoryResultNode:
-        """Process a category node from criteria tree and create result node."""
-        return self.__process_holder(
-            category,
-            submission_files=submission_files,
-            sandbox=sandbox,
-            submission_language=submission_language,
-            locale=locale,
-            pre_computed_results=pre_computed_results,
-            structural_analysis=structural_analysis,
-        )
diff --git a/autograder/steps/grade_step.py b/autograder/steps/grade_step.py
index 683b14a7..1fce7bae 100644
--- a/autograder/steps/grade_step.py
+++ b/autograder/steps/grade_step.py
@@ -4,7 +4,7 @@
 from autograder.models.pipeline_execution import PipelineExecution
 from autograder.models.dataclass.step_result import StepResult, StepStatus, StepName
 from autograder.models.abstract.step import Step
-from autograder.services.grader_service import GraderService
+from autograder.services.grader.grader_service import GraderService
 
 logger = logging.getLogger(__name__)
 
diff --git a/docs/features/command_resolver.md b/docs/features/command_resolver.md
index b2228f9c..286398a2 100644
--- a/docs/features/command_resolver.md
+++ b/docs/features/command_resolver.md
@@ -86,11 +86,11 @@ When a student submits code, it enters an execution pipeline managed by the Auto
 
 1. **Language Association:** The API or Web layer identifies the submitted language and sets it in the overarching `PipelineExecution`.
 2. **Configuration Load:** The pipeline loads the standard criteria (e.g., weight, number of tests, configurations like `setup_config` to compile `.java` or `.cpp`).
-3. **Execution Routing in `GraderService`:** When evaluating an individual test node, `GraderService` discovers that `program_command` isn't an exact literal. It hands over the raw value (Dict or `"CMD"`) alongside the current `SubmissionLanguage` to the `CommandResolver`.
+3. **Execution Routing in `SubmissionGrader`:** When evaluating an individual test node, `SubmissionGrader` discovers that `program_command` isn't an exact literal. It hands over the raw value (Dict or `"CMD"`) alongside the current `SubmissionLanguage` to the `CommandResolver`.
 4. **Processing by `CommandResolver`:**
    - If the value is a dictionary, it looks up the specific key (e.g., `java`) and returns its explicit command string (`java Calculator`).
    - If the value is `"CMD"`, it assesses the submitted files or fallback defaults to return the right execution instruction.
-5. **Sandbox Hand-off:** After finding the appropriate string, the `GraderService` forwards it into the configured Docker container (the sandbox). Since the sandbox matches the submission language context, it spins up and correctly evaluates `java Calculator`, oblivious to the fact that the assignment originally supported Python or Node submissions.
+5. **Sandbox Hand-off:** After finding the appropriate string, the `SubmissionGrader` forwards it into the configured Docker container (the sandbox). Since the sandbox matches the submission language context, it spins up and correctly evaluates `java Calculator`, oblivious to the fact that the assignment originally supported Python or Node submissions.
 
 ---
 
diff --git a/docs/features/grading_engine.md b/docs/features/grading_engine.md
index 403235ab..8bcf0e31 100644
--- a/docs/features/grading_engine.md
+++ b/docs/features/grading_engine.md
@@ -210,7 +210,7 @@ The `ResultTree` also provides utility methods:
 
 | File | Contents |
 |------|----------|
-| `autograder/services/grader_service.py` | `GraderService` — tree traversal, test execution, weight balancing |
+| `autograder/services/grader/grader_service.py` | `GraderService` — coordinator for the grading process |\n| `autograder/services/grader/criteria_grader.py` | `SubmissionGrader` — stateful tree traversal, test execution, weight balancing |
 | `autograder/models/result_tree.py` | `ResultTree`, `RootResultNode`, `CategoryResultNode`, `SubjectResultNode`, `TestResultNode` |
 | `autograder/models/criteria_tree.py` | `CriteriaTree`, `CategoryNode`, `SubjectNode`, `TestNode` |
 | `autograder/models/dataclass/grade_step_result.py` | `GradeStepResult` — wrapper for final score + result tree |
diff --git a/docs/guides/development.md b/docs/guides/development.md
index 3697f4ee..d2067af6 100644
--- a/docs/guides/development.md
+++ b/docs/guides/development.md
@@ -60,7 +60,7 @@ autograder/
 │   │   ├── result_tree.py
 │   │   └── pipeline_execution.py
 │   ├── services/           # Business logic
-│   │   ├── grader_service.py       # Main grading engine
+│   │   ├── grader/                 # Grading engine\n│   │   │   ├── grader_service.py   # Coordinator\n│   │   │   └── criteria_grader.py  # Stateful processor
 │   │   ├── focus_service.py        # Focus-based feedback
 │   │   ├── criteria_tree_service.py # Tree construction
 │   │   └── template_library_service.py
diff --git a/docs/pipeline/04.5-ai-batch.md b/docs/pipeline/04.5-ai-batch.md
index 7731576d..525f7cfb 100644
--- a/docs/pipeline/04.5-ai-batch.md
+++ b/docs/pipeline/04.5-ai-batch.md
@@ -36,7 +36,7 @@ If no `AiTestFunction` instances are found in the criteria tree, the step exits
 
 ## How It Integrates with Grade
 
-The `GradeStep` reads the `AI_BATCH` step result from the pipeline and passes it as `pre_computed_results` to `GraderService.grade_from_tree()`. The grader threads this dict through the tree traversal into every `process_test()` call, which forwards it as a kwarg to `test_function.execute()`.
+The `GradeStep` reads the `AI_BATCH` step result from the pipeline and passes it as `pre_computed_results` to `GraderService.grade_from_tree()`. The `SubmissionGrader` threads this dict through the tree traversal into every `process_test()` call, which forwards it as a kwarg to `test_function.execute()`.
 
 `AiTestFunction.execute()` sees the `pre_computed_results` kwarg, looks up its own `name` in the dict, and returns the pre-computed `TestResult` directly — no further API call, no mutation.
 
diff --git a/docs/pipeline/05-grade.md b/docs/pipeline/05-grade.md
index 72712990..2669e5d3 100644
--- a/docs/pipeline/05-grade.md
+++ b/docs/pipeline/05-grade.md
@@ -75,7 +75,7 @@ Once the results are in, the pipeline proceeds to **[Step 6: Focus](06-focus.md)
 
 `autograder/steps/grade_step.py` → `GradeStep`
 
-`autograder/services/grader_service.py` → `GraderService`
+`autograder/services/grader/grader_service.py` → `GraderService`
 
 `autograder/models/result_tree.py` → `ResultTree`, `RootResultNode`, `CategoryResultNode`, `SubjectResultNode`, `TestResultNode`
 
diff --git a/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md b/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md
index 95f42e5b..80703fc4 100644
--- a/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md
+++ b/docs/roadmaps/TECHNICAL_DEBT_ROADMAP.md
@@ -79,7 +79,7 @@ These items address structural problems where components are tightly coupled, re
 
 #### Item 6: Decouple `GraderService` from Mutable State
 
-- **File:** `autograder/services/grader_service.py`
+- **File:** `autograder/services/grader/grader_service.py`
 - **Problem:** `GraderService` is instantiated once per `GradeStep` but accumulates mutable state through setter methods: `set_sandbox()`, `set_submission_language()`, and internal `__submission_files`. This makes the service stateful and order-dependent — callers must remember to call setters before `grade_from_tree()`. It also means the service cannot be safely reused across concurrent submissions.
 - **Impact:** The setter pattern (`set_sandbox`, `set_submission_language`) is a code smell that indicates these values should be parameters, not instance state. It also creates a hidden coupling: `GradeStep.execute()` must know the exact sequence of setter calls before invoking grading.
 - **Action:**
@@ -89,7 +89,7 @@ These items address structural problems where components are tightly coupled, re
 
 #### Item 7: Extract Language Resolution from Test Execution
 
-- **Files:** `autograder/services/grader_service.py`, `autograder/template_library/input_output.py`, `autograder/services/command_resolver.py`
+- **Files:** `autograder/services/grader/grader_service.py`, `autograder/template_library/input_output.py`, `autograder/services/command_resolver.py`
 - **Problem:** The `GraderService.process_test()` method injects `__submission_language__` as a hidden parameter into every test's kwargs dict. The `ExpectOutputTest` and `DontFailTest` in `input_output.py` then extract this hidden parameter to resolve the `program_command` via `CommandResolver`. This creates an implicit contract: tests must know to look for a magic key in their kwargs, and the grader must know to inject it.
 - **Impact:** The `__submission_language__` convention is undocumented, fragile, and invisible to anyone reading the `TestFunction.execute()` signature. It also means command resolution logic is scattered: `CommandResolver` lives in `services/`, but it's invoked inside individual test functions rather than at the pipeline level.
 - **Action:**
@@ -259,7 +259,7 @@ These items prepare the codebase for future growth by establishing patterns and
 
 #### Item 21: Decouple the `AiExecutor` Batch Pattern from Test Execution
 
-- **Files:** `autograder/utils/executors/ai_executor.py`, `autograder/services/grader_service.py`
+- **Files:** `autograder/utils/executors/ai_executor.py`, `autograder/services/grader/grader_service.py`
 - **Problem:** The `AiExecutor` implements a batch-send pattern where individual tests call `executor.add_test()` during tree traversal, and then `GraderService.grade_from_tree()` calls `executor.stop()` after all tests are processed to send a single batch request to OpenAI. This means:
   1. `GraderService` must know about `AiExecutor` and check for it after grading (`if hasattr(test_func, "executor") and test_func.executor`).
   2. Test functions that use AI don't actually execute during `execute()` — they register themselves and return an empty `TestResult` that gets populated later via `mapback()`.
diff --git a/tests/unit/pipeline/test_ai_batch_step.py b/tests/unit/pipeline/test_ai_batch_step.py
index 4356136b..57a5921d 100644
--- a/tests/unit/pipeline/test_ai_batch_step.py
+++ b/tests/unit/pipeline/test_ai_batch_step.py
@@ -27,7 +27,7 @@
 from autograder.models.dataclass.test_result import TestResult
 from autograder.models.pipeline_execution import PipelineExecution
 from autograder.models.result_tree import CategoryResultNode, ResultTree, RootResultNode
-from autograder.services.grader_service import GraderService
+from autograder.services.grader.grader_service import GraderService
 from autograder.steps.ai_batch_step import AiBatchStep
 from autograder.steps.grade_step import GradeStep
 from autograder.utils.executors.ai_executor import AiExecutor, TestInput, TestOutput
diff --git a/tests/unit/test_command_resolution_at_build_time.py b/tests/unit/test_command_resolution_at_build_time.py
index b2fd503c..71b0230f 100644
--- a/tests/unit/test_command_resolution_at_build_time.py
+++ b/tests/unit/test_command_resolution_at_build_time.py
@@ -7,7 +7,8 @@
 
 from typing import List
 
-from autograder.services.grader_service import GraderService
+from autograder.services.grader.criteria_grader import SubmissionGrader
+from autograder.services.command_resolver import CommandResolver
 from autograder.models.criteria_tree import TestNode
 from autograder.models.abstract.test_function import TestFunction
 from autograder.models.dataclass.param_description import ParamDescription
@@ -44,8 +45,12 @@ def execute(self, files, sandbox, *args, **kwargs):
         return TestResult(test_name=self.name, score=100.0, report="ok")
 
 
-def _make_grader(language=None) -> GraderService:
-    return GraderService(), language
+def _make_grader(language=None) -> SubmissionGrader:
+    return SubmissionGrader(
+        submission_files={},
+        command_resolver=CommandResolver(),
+        submission_language=language
+    )
 
 
 def _make_test_node(params: dict, fn: TestFunction = None) -> TestNode:
@@ -63,7 +68,7 @@ class TestDictCommandResolutionInProcessTest:
 
     def test_dict_command_resolved_for_python(self):
         """Test proper dict resolution for Python language."""
-        svc, language = _make_grader(Language.PYTHON)
+        svc = _make_grader(Language.PYTHON)
         fn = RecordingTestFunction()
         node = _make_test_node({
             "program_command": {
@@ -72,13 +77,13 @@ def test_dict_command_resolved_for_python(self):
             }
         }, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] == "python3 calc.py"
 
     def test_dict_command_resolved_for_java(self):
         """Test proper dict resolution for Java language."""
-        svc, language = _make_grader(Language.JAVA)
+        svc = _make_grader(Language.JAVA)
         fn = RecordingTestFunction()
         node = _make_test_node({
             "program_command": {
@@ -87,13 +92,13 @@ def test_dict_command_resolved_for_java(self):
             }
         }, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] == "java Calc"
 
     def test_dict_command_missing_language_gives_none(self):
         """When language is not in the dict, resolved value is None."""
-        svc, language = _make_grader(Language.NODE)
+        svc = _make_grader(Language.NODE)
         fn = RecordingTestFunction()
         node = _make_test_node({
             "program_command": {
@@ -102,7 +107,7 @@ def test_dict_command_missing_language_gives_none(self):
             }
         }, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] is None
 
@@ -116,31 +121,31 @@ class TestCmdPlaceholderResolution:
 
     def test_cmd_resolved_for_python(self):
         """Test CMD placeholder resolves appropriately for Python."""
-        svc, language = _make_grader(Language.PYTHON)
+        svc = _make_grader(Language.PYTHON)
         fn = RecordingTestFunction()
         node = _make_test_node({"program_command": "CMD"}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] == "python3 main.py"
 
     def test_cmd_resolved_for_java(self):
         """Test CMD placeholder resolves appropriately for Java."""
-        svc, language = _make_grader(Language.JAVA)
+        svc = _make_grader(Language.JAVA)
         fn = RecordingTestFunction()
         node = _make_test_node({"program_command": "CMD"}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] == "java Main"
 
     def test_cmd_resolved_for_node(self):
         """Test CMD placeholder resolves appropriately for Node.js."""
-        svc, language = _make_grader(Language.NODE)
+        svc = _make_grader(Language.NODE)
         fn = RecordingTestFunction()
         node = _make_test_node({"program_command": "CMD"}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] == "node index.js"
 
@@ -154,11 +159,11 @@ class TestInvalidFormatHandled:
 
     def test_string_command_is_resolved(self):
         """Test non-CMD single strings are resolved directly."""
-        svc, language = _make_grader(Language.PYTHON)
+        svc = _make_grader(Language.PYTHON)
         fn = RecordingTestFunction()
         node = _make_test_node({"program_command": "python3 my_script.py"}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         # Single strings are now supported and resolved directly
         assert fn.recorded_kwargs["program_command"] == "python3 my_script.py"
@@ -173,23 +178,23 @@ class TestNoLanguageNoResolution:
 
     def test_dict_unchanged_when_no_language(self):
         """Without language, dict program_command is left as-is (resolver not called)."""
-        svc, language = _make_grader()          # no language
+        svc = _make_grader()          # no language
         fn = RecordingTestFunction()
         cmd_dict = {"python": "python3 calc.py"}
         node = _make_test_node({"program_command": cmd_dict}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         # Dict should reach execute() untouched — resolution was skipped
         assert fn.recorded_kwargs["program_command"] == cmd_dict
 
     def test_cmd_unchanged_when_no_language(self):
         """Without language, CMD placeholder is unchanged (resolver not called)."""
-        svc, language = _make_grader()
+        svc = _make_grader()
         fn = RecordingTestFunction()
         node = _make_test_node({"program_command": "CMD"}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert fn.recorded_kwargs["program_command"] == "CMD"
 
@@ -203,33 +208,33 @@ class TestNoHiddenKwarg:
 
     def test_no_hidden_kwarg_with_language_set(self):
         """Even when language is set, __submission_language__ must not appear in execute()."""
-        svc, language = _make_grader(Language.PYTHON)
+        svc = _make_grader(Language.PYTHON)
         fn = RecordingTestFunction()
         node = _make_test_node({"program_command": "CMD"}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert "__submission_language__" not in fn.recorded_kwargs
 
     def test_no_hidden_kwarg_without_language(self):
         """Hidden kwarg shouldn't appear even if there is no language to configure."""
-        svc, language = _make_grader()
+        svc = _make_grader()
         fn = RecordingTestFunction()
         node = _make_test_node({}, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert "__submission_language__" not in fn.recorded_kwargs
 
     def test_no_hidden_kwarg_with_dict_command(self):
         """Hidden kwarg shouldn't appear even if there is a dict correctly configured."""
-        svc, language = _make_grader(Language.JAVA)
+        svc = _make_grader(Language.JAVA)
         fn = RecordingTestFunction()
         node = _make_test_node({
             "program_command": {"python": "python3 a.py", "java": "java A"}
         }, fn)
 
-        svc.process_test(node, submission_language=language)
+        svc.process_test(node)
 
         assert "__submission_language__" not in fn.recorded_kwargs
 

From fa6d2309cb82430b5152196a700c1eae21b70b17 Mon Sep 17 00:00:00 2001
From: jaoppb <joaopedroperes06@gmail.com>
Date: Sat, 2 May 2026 09:47:45 -0300
Subject: [PATCH 2/4] fix: correct weight balancing logic in SubmissionGrader

- Refactor __balance_nodes to explicitly scale weights to 100 * factor.
- Improve clarity and robustness of sibling weight normalization.
- Update documentation to reflect the corrected balancing algorithm.
- Add unit tests for SubmissionGrader weight balancing.

Co-authored-by: gemini-cli <218195315+gemini-cli@users.noreply.github.com>
---
 autograder/services/grader/criteria_grader.py | 13 +--
 docs/features/grading_engine.md               | 18 ++--
 .../services/grader/test_submission_grader.py | 99 +++++++++++++++++++
 3 files changed, 116 insertions(+), 14 deletions(-)
 create mode 100644 tests/unit/services/grader/test_submission_grader.py

diff --git a/autograder/services/grader/criteria_grader.py b/autograder/services/grader/criteria_grader.py
index 81cea902..e1618888 100644
--- a/autograder/services/grader/criteria_grader.py
+++ b/autograder/services/grader/criteria_grader.py
@@ -47,18 +47,19 @@ def __balance_nodes(
         nodes: Sequence[CategoryResultNode | SubjectResultNode | TestResultNode],
         factor: float,
     ) -> None:
-        """Balance the weights of sibling nodes to sum to 100, scaled by factor."""
+        """Balance the weights of sibling nodes to sum to a target total (100 * factor)."""
         if len(nodes) == 0:
             return
 
-        total_weight = sum(node.weight for node in nodes) * factor
+        target_total = 100.0 * factor
+        current_sum = sum(node.weight for node in nodes)
 
-        if total_weight == 0:
-            equal_weight = 100.0 / len(nodes)
+        if current_sum == 0:
+            equal_weight = target_total / len(nodes)
             for node in nodes:
                 node.weight = equal_weight
-        elif total_weight != 100:
-            scale_factor = 100.0 / total_weight
+        else:
+            scale_factor = target_total / current_sum
             for node in nodes:
                 node.weight *= scale_factor
 
diff --git a/docs/features/grading_engine.md b/docs/features/grading_engine.md
index 8bcf0e31..2ecb0b25 100644
--- a/docs/features/grading_engine.md
+++ b/docs/features/grading_engine.md
@@ -70,23 +70,25 @@ Weights determine how much each node contributes to its parent's score. The engi
 
 ### Sibling Balancing
 
-The `__balance_nodes()` method normalizes weights:
+The `__balance_nodes()` method normalizes weights to a target total based on a factor:
 
 ```python
 def __balance_nodes(self, nodes, factor):
-    total_weight = sum(node.weight for node in nodes) * factor
-    if total_weight == 0:
-        equal_weight = 100.0 / len(nodes)
+    target_total = 100.0 * factor
+    current_sum = sum(node.weight for node in nodes)
+    
+    if current_sum == 0:
+        equal_weight = target_total / len(nodes)
         for node in nodes:
             node.weight = equal_weight
-    elif total_weight != 100:
-        scale_factor = 100.0 / total_weight
+    else:
+        scale_factor = target_total / current_sum
         for node in nodes:
             node.weight *= scale_factor
 ```
 
-- If all weights are zero, they're distributed equally.
-- If they don't sum to 100 (after applying the factor), they're scaled proportionally.
+- If all weights are zero, they're distributed equally to sum to `100 * factor`.
+- If they don't sum to the target total, they're scaled proportionally.
 
 ### Subject/Test Split
 
diff --git a/tests/unit/services/grader/test_submission_grader.py b/tests/unit/services/grader/test_submission_grader.py
new file mode 100644
index 00000000..020c182c
--- /dev/null
+++ b/tests/unit/services/grader/test_submission_grader.py
@@ -0,0 +1,99 @@
+
+import pytest
+from unittest.mock import MagicMock
+from autograder.services.grader.criteria_grader import SubmissionGrader
+from autograder.models.criteria_tree import CategoryNode, SubjectNode, TestNode
+from autograder.models.abstract.test_function import TestFunction
+from autograder.models.dataclass.test_result import TestResult
+
+class MockTestFunction(TestFunction):
+    @property
+    def name(self) -> str:
+        return "mock_test"
+    @property
+    def description(self) -> str:
+        return "mock description"
+    @property
+    def parameter_description(self) -> list:
+        return []
+    def execute(self, files=None, sandbox=None, **kwargs):
+        return TestResult(test_name="mock_test", score=kwargs.get('score', 100.0), report="OK")
+
+@pytest.fixture
+def grader():
+    command_resolver = MagicMock()
+    return SubmissionGrader(
+        submission_files={},
+        command_resolver=command_resolver
+    )
+
+def test_balance_nodes_only_subjects(grader):
+    # If only subjects, they should sum to 100
+    s1 = SubjectNode(name="S1", weight=30)
+    s2 = SubjectNode(name="S2", weight=30)
+    cat = CategoryNode(name="base", weight=100, subjects=[s1, s2])
+    
+    # We need to mock process_subject because it calls process_category/process_subject recursively
+    # Actually SubmissionGrader.process_category calls __process_holder which calls process_subject
+    
+    # Let's mock process_test and process_subject to avoid deep recursion for this test
+    grader.process_subject = MagicMock(side_effect=lambda s: MagicMock(weight=s.weight, score=100.0))
+    
+    result = grader.process_category(cat)
+    
+    # Weights should be balanced to 50/50 because original sum was 60
+    assert result.subjects[0].weight == 50.0
+    assert result.subjects[1].weight == 50.0
+
+def test_balance_nodes_with_subjects_weight(grader):
+    # Scenario: subjects_weight = 80
+    # 1 subject, 1 test
+    tf = MockTestFunction()
+    s1 = SubjectNode(name="S1", weight=100)
+    t1 = TestNode(name="T1", test_function=tf, weight=100)
+    cat = CategoryNode(name="base", weight=100, subjects=[s1], tests=[t1], subjects_weight=80)
+    
+    grader.process_subject = MagicMock(side_effect=lambda s: MagicMock(weight=s.weight, score=100.0))
+    # process_test is not mocked, it will execute the test
+    
+    result = grader.process_category(cat)
+    
+    # Subject should have weight 80, Test should have weight 20
+    assert result.subjects[0].weight == pytest.approx(80.0)
+    assert result.tests[0].weight == pytest.approx(20.0)
+    
+    # Final score should be 100 if both are 100
+    assert result.calculate_score() == pytest.approx(100.0)
+
+def test_balance_nodes_with_subjects_weight_and_scores(grader):
+    # Scenario: subjects_weight = 80
+    # Subject score = 100, Test score = 0
+    tf = MockTestFunction()
+    s1 = SubjectNode(name="S1", weight=100)
+    t1 = TestNode(name="T1", test_function=tf, weight=100, parameters={'score': 0.0})
+    cat = CategoryNode(name="base", weight=100, subjects=[s1], tests=[t1], subjects_weight=80)
+    
+    # Mock process_subject to return a result node with score 100
+    from autograder.models.result_tree import SubjectResultNode
+    grader.process_subject = MagicMock(return_value=SubjectResultNode(name="S1", weight=100, subjects_weight=None, score=100.0))
+    
+    result = grader.process_category(cat)
+    
+    assert result.subjects[0].weight == pytest.approx(80.0)
+    assert result.tests[0].weight == pytest.approx(20.0)
+    
+    # Score = 100 * 0.8 + 0 * 0.2 = 80
+    assert result.calculate_score() == pytest.approx(80.0)
+
+def test_balance_nodes_zero_weights(grader):
+    # If all weights are zero, they should be equal and sum to 100 * factor
+    s1 = SubjectNode(name="S1", weight=0)
+    s2 = SubjectNode(name="S2", weight=0)
+    cat = CategoryNode(name="base", weight=100, subjects=[s1, s2])
+    
+    grader.process_subject = MagicMock(side_effect=lambda s: MagicMock(weight=s.weight, score=100.0))
+    
+    result = grader.process_category(cat)
+    
+    assert result.subjects[0].weight == 50.0
+    assert result.subjects[1].weight == 50.0

From f9e0af79ee59c7b1492d2cbdcc2b79fe33e27c57 Mon Sep 17 00:00:00 2001
From: Arthur Carvalho <arthurcarvalhorodrigues2409@gmail.com>
Date: Sat, 2 May 2026 16:01:38 -0300
Subject: [PATCH 3/4] Remove documentation roadmap link from index

---
 docs/index.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/index.md b/docs/index.md
index 64322c28..b5884204 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -47,4 +47,3 @@ This documentation is organized to be both practical for first-time users and de
 | Pipeline steps | [pipeline/README.md](pipeline/README.md) |
 | Features | [features/grading_engine.md](features/grading_engine.md) |
 | Development setup | [guides/development.md](guides/development.md) |
-| Documentation roadmap | [roadmaps/MKDOCS_DOCUMENTATION_ROADMAP.md](roadmaps/MKDOCS_DOCUMENTATION_ROADMAP.md) |

From e02463e74761f4dd02f9c3f66a7c466ba37f95f3 Mon Sep 17 00:00:00 2001
From: jaoppb <joaopedroperes06@gmail.com>
Date: Mon, 4 May 2026 00:34:49 -0300
Subject: [PATCH 4/4] docs: removed non-existent docs from mkdocs.yml

---
 mkdocs.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mkdocs.yml b/mkdocs.yml
index 685785b8..17ff94fc 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,5 +1,6 @@
 site_name: Autograder Documentation
-site_description: Educational-standards-driven autograding platform documentation
+site_description:
+  Educational-standards-driven autograding platform documentation
 site_url: https://webtech-network.github.io/autograder/
 repo_url: https://github.com/webtech-network/autograder
 repo_name: webtech-network/autograder
@@ -86,6 +87,5 @@ nav:
       - Criteria Configuration Examples: guides/criteria_configuration_examples.md
       - GitHub Module (legacy): guides/github_module.md
   - Roadmaps:
-      - MkDocs Documentation Roadmap: roadmaps/MKDOCS_DOCUMENTATION_ROADMAP.md
       - Feature Roadmap: roadmaps/FEATURE_ROADMAP.md
       - Technical Debt Roadmap: roadmaps/TECHNICAL_DEBT_ROADMAP.md