From 42bdf8723e6f4e65338ff24131eb19bae4b14491 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 13:11:05 -0700
Subject: [PATCH 1/9] Remove simulator shim files; import directly from
 model_provisioner

Delete 17 shim files from simulator/ that re-exported from
streamwise.model_provisioner. Update simulator/__init__.py to add
streamwise/ to sys.path so model_provisioner is importable.

Update imports in simulator/provisioning.py, multirequests.py, and
plot_utils.py to use model_provisioner.* prefixed imports.

Update all 20 test files in tests/simulator/ to:
- Pass both 'simulator' and 'streamwise' to temp_sys_path
- Use model_provisioner.* prefixed imports for moved modules
- Fix patch.dict target in test_models.py
- Fix inline import in test_hexgen.py

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 simulator/__init__.py                         |   15 +
 simulator/actions.py                          |  737 ------------
 simulator/auto_model_allocator.py             |  109 --
 simulator/constants.py                        |  142 ---
 simulator/data_loading.py                     |  298 -----
 simulator/evaluator.py                        |  414 -------
 simulator/greedy.py                           |  573 ---------
 simulator/helix.py                            |  403 -------
 simulator/hexgen.py                           |  629 ----------
 simulator/milp.py                             | 1070 -----------------
 simulator/model_allocator.py                  |  282 -----
 simulator/models.py                           |  811 -------------
 simulator/multirequests.py                    |   26 +-
 simulator/naive_baseline.py                   |  484 --------
 simulator/plot_utils.py                       |   10 +-
 simulator/policies.py                         |  252 ----
 simulator/provisioning.py                     |   42 +-
 simulator/sim_types.py                        |  796 ------------
 simulator/sim_types_json.py                   |   58 -
 simulator/utils.py                            |  297 -----
 simulator/workflows.py                        |  253 ----
 tests/simulator/test_auto_model_allocator.py  |   36 +-
 tests/simulator/test_data_loading.py          |   10 +-
 tests/simulator/test_evaluator.py             |   34 +-
 tests/simulator/test_greedy.py                |   22 +-
 tests/simulator/test_helix.py                 |   20 +-
 tests/simulator/test_hexgen.py                |   16 +-
 tests/simulator/test_milp.py                  |   32 +-
 tests/simulator/test_models.py                |   58 +-
 tests/simulator/test_multirequests_derive.py  |    8 +-
 tests/simulator/test_simulator.py             |   26 +-
 tests/simulator/test_simulator_actions.py     |   12 +-
 tests/simulator/test_simulator_baseline.py    |   34 +-
 tests/simulator/test_simulator_energy.py      |   26 +-
 .../simulator/test_simulator_multirequests.py |   12 +-
 tests/simulator/test_simulator_plotutils.py   |   10 +-
 tests/simulator/test_simulator_policies.py    |    8 +-
 .../simulator/test_simulator_provisioning.py  |   18 +-
 tests/simulator/test_simulator_types.py       |   22 +-
 tests/simulator/test_simulator_utils.py       |   26 +-
 tests/simulator/test_workflows.py             |   14 +-
 41 files changed, 282 insertions(+), 7863 deletions(-)
 delete mode 100644 simulator/actions.py
 delete mode 100644 simulator/auto_model_allocator.py
 delete mode 100644 simulator/constants.py
 delete mode 100644 simulator/data_loading.py
 delete mode 100644 simulator/evaluator.py
 delete mode 100644 simulator/greedy.py
 delete mode 100644 simulator/helix.py
 delete mode 100644 simulator/hexgen.py
 delete mode 100644 simulator/milp.py
 delete mode 100644 simulator/model_allocator.py
 delete mode 100644 simulator/models.py
 delete mode 100644 simulator/naive_baseline.py
 delete mode 100644 simulator/policies.py
 delete mode 100644 simulator/sim_types.py
 delete mode 100644 simulator/sim_types_json.py
 delete mode 100644 simulator/utils.py
 delete mode 100644 simulator/workflows.py

diff --git a/simulator/__init__.py b/simulator/__init__.py
index e69de29b..24058e01 100644
--- a/simulator/__init__.py
+++ b/simulator/__init__.py
@@ -0,0 +1,15 @@
+"""
+Simulator package.
+
+The core allocation logic lives in ``streamwise.model_provisioner``.
+This package adds provisioning sweeps, multi-request analysis, and plotting
+on top of that shared foundation.
+"""
+import os
+import sys
+
+# Make model_provisioner importable for simulator modules and child processes.
+_STREAMWISE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise")
+_STREAMWISE_DIR = os.path.normpath(_STREAMWISE_DIR)
+if _STREAMWISE_DIR not in sys.path:
+    sys.path.insert(0, _STREAMWISE_DIR)
diff --git a/simulator/actions.py b/simulator/actions.py
deleted file mode 100644
index debea677..00000000
--- a/simulator/actions.py
+++ /dev/null
@@ -1,737 +0,0 @@
-"""
-Actions for scaling models for the greedy allocator.
-"""
-
-from __future__ import annotations
-
-import random
-
-from collections import Counter
-
-from copy import deepcopy
-
-from typing import Optional
-
-from constants import DEVICE_OPTIONS
-from constants import SINGLE_INSTANCE_MODELS
-from constants import SINGLE_DEVICE_MODELS
-
-from sim_types import Action
-from sim_types import ActionName
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import GPUType
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import Objective
-from sim_types import Policy
-
-from policies import STREAMWISE_POLICY
-
-from models import get_model_allocation
-
-from evaluator import evaluate_model_allocation
-from evaluator import calc_used_gpus
-
-
-def _is_single_instance(
-    model_name: Model,
-    workflow: Optional[WorkflowConfig] = None,
-) -> bool:
-    """Check if a model is single-instance, considering workflow parallelism settings."""
-    if model_name not in SINGLE_INSTANCE_MODELS:
-        return False
-    if workflow is not None and workflow.is_parallelizable(model_name):
-        return False
-    return True
-
-
-def find_next_devices(
-    device_options: list[int],
-    num_devices: int,
-    num_replicas: int,
-    remaining_devices: int,
-    max_num_devices: Optional[int] = None,
-) -> Optional[int]:
-    """
-    Find the next device combination.
-    For example, with device options [2, 4, 8, 16, 40], current devices 8, 1 replica, we get 16.
-    """
-    if num_replicas == 0:
-        # means we haven't allocated any replicas yet so start from smallest device option
-        return device_options[0] if device_options[0] <= remaining_devices else None
-
-    for device_option in device_options:
-        # if device_option > num_devices and device_option <= remaining_devices + num_devices:
-        if (
-            device_option > num_devices
-            and (device_option - num_devices) * num_replicas <= remaining_devices
-            and (max_num_devices is None or device_option <= max_num_devices)
-        ):
-            return device_option
-    return None
-
-
-def choose_action(
-    actions: list[Action],
-    objective: Objective,
-    switch_objective: bool = False,
-) -> Optional[Action]:
-    """Schedule requests."""
-    if not actions:
-        return None
-
-    if objective == Objective.TIME_COST:
-        # return min(actions, key=lambda a: a.time)
-        return min(
-            actions,
-            key=lambda a: (
-                a.time_cost(),
-                a.time,
-            ),
-        )
-    if objective == Objective.TIME_COST:
-        return min(
-            actions,
-            key=lambda a: (
-                a.time_cost(),
-                a.time,
-            ),
-        )
-    if objective == Objective.TTFF_COST:
-        return min(
-            actions,
-            key=lambda a: (
-                a.ttff_cost(),
-                a.ttff,
-            ),
-        )
-    if objective == Objective.FIFO:
-        # return min(actions, key=lambda a: a.arrival_time_s)
-        return min(actions, key=lambda a: a.get_order())
-    if objective == Objective.TIME:
-        return min(actions, key=lambda a: a.time)
-    if objective == Objective.TTFF:
-        return min(actions, key=lambda a: a.ttff)
-    if objective == Objective.COST:
-        return min(actions, key=lambda a: a.cost)
-    if objective == Objective.ENERGY:
-        return min(actions, key=lambda a: a.energy)
-    if objective == Objective.TIME_ENERGY:
-        return min(actions, key=lambda a: a.time_energy())
-    if objective == Objective.ENERGY_COST:
-        return min(actions, key=lambda a: a.energy_cost())
-    if objective == Objective.RANDOM:
-        # randomly pick an improvement to simulate naive allocation
-        return random.choice(actions)
-    if objective == Objective.TTFF_THEN_TIME:
-        if switch_objective:
-            return min(actions, key=lambda a: a.time)
-        else:
-            return min(actions, key=lambda a: a.ttff)
-    if objective == Objective.NONE:
-        return None
-    raise ValueError(f"Cannot recognize objective {objective}")
-
-
-def apply_action(
-    action: Action,
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-    """Apply the chosen action to the models and update remaining devices."""
-
-    for gpu_type in action.models.keys():
-        if gpu_type not in models:
-            raise ValueError(f"Cannot find gpu type {gpu_type} in {models.keys()}")
-        for model in action.models[gpu_type].keys():
-            if model not in models[gpu_type]:
-                raise ValueError(f"Cannot find model {model} in {models[gpu_type].keys()}")
-            allocs_to_remove = []
-            for alloc_id in range(len(action.models[gpu_type][model])):
-                # check if devices and replicas are non-negative
-                num_devices = action.models[gpu_type][model][alloc_id].devices
-                if num_devices < 0:
-                    raise ValueError(f"Action devices {num_devices} must be >= 0")
-                if action.models[gpu_type][model][alloc_id].replicas <= 0:
-                    # remove that instance if replicas is 0 or negative
-                    allocs_to_remove.append(alloc_id)
-            for alloc_id in reversed(allocs_to_remove):
-                del action.models[gpu_type][model][alloc_id]
-
-    return action.models
-
-
-def gen_actions(
-    workflow: WorkflowConfig,
-    num_gpus: dict[GPUType, int],
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {},
-    policy: Policy = STREAMWISE_POLICY,
-    allow_removal: bool = False,
-    allow_merging: bool = False,
-    look_ahead_replicas: int = 3,
-) -> list[Action]:
-    actions: list[Action] = []
-
-    # Extract GPU types from models
-    gpu_types = list(models.keys())
-    assert len(gpu_types) == len(num_gpus), \
-        f"Number of GPU types in models {len(gpu_types)} must match num_gpus {len(num_gpus)}"
-
-    remaining_gpus = {}
-    for gpu_type in num_gpus.keys():
-        remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
-
-    # Option 1: Provision more by increasing <devices, replicas> for each model allocation
-    for model in Model:
-        if model not in workflow.models:
-            continue
-        for gpu_type in gpu_types:
-            for alloc_id in range(len(models[gpu_type][model])):
-                actions.extend(_gen_add_device_replica_actions(
-                    models=models,
-                    num_gpus=num_gpus,
-                    remaining_gpus=remaining_gpus[gpu_type],
-                    gpu_type=gpu_type,
-                    model_name=model,
-                    allocation_id=alloc_id,
-                    workflow=workflow,
-                    policy=policy,
-                    latency_data=latency_data,
-                    power_data=power_data,
-                    look_ahead_replicas=look_ahead_replicas,
-                ))
-
-    # Option 2: Add a model instance of <devices, replicas>
-    for model in Model:
-        if model not in workflow.models:
-            continue
-        for gpu_type in gpu_types:
-            actions.extend(_gen_add_instance(
-                models=models,
-                num_gpus=num_gpus,
-                remaining_gpus=remaining_gpus[gpu_type],
-                gpu_type=gpu_type,
-                model_name=model,
-                workflow=workflow,
-                policy=policy,
-                latency_data=latency_data,
-                power_data=power_data,
-                look_ahead_replicas=look_ahead_replicas,
-            ))
-
-    if allow_removal:
-        # Option 3: Remove replicas for each model allocation
-        for model in Model:
-            if model not in workflow.models:
-                continue
-            for gpu_type in gpu_types:
-                model_instances = models[gpu_type][model]
-                for alloc_id in range(len(model_instances)):
-                    action = _gen_remove_replica_action(
-                        models=models,
-                        num_gpus=num_gpus,
-                        gpu_type=gpu_type,
-                        model_name=model,
-                        allocation_id=alloc_id,
-                        workflow=workflow,
-                        policy=policy,
-                        latency_data=latency_data,
-                        power_data=power_data,
-                    )
-                    if action:
-                        actions.append(action)
-
-    if allow_merging:
-        # Option 4: Merge across model allocations
-        for model in Model:
-            if model not in workflow.models:
-                continue
-            for gpu_type in gpu_types:
-                actions.extend(_gen_merge_replicas_actions(
-                    models=models,
-                    num_gpus=num_gpus,
-                    gpu_type=gpu_type,
-                    model_name=model,
-                    workflow=workflow,
-                    policy=policy,
-                    latency_data=latency_data,
-                    power_data=power_data,
-                ))
-
-    return actions
-
-
-def _get_min_device_combinations(
-    num_gpus: int,
-    model: Model,
-) -> list[tuple[int, int]]:
-    """
-    Get the minimum device combinations for a given number of GPUs and model.
-    [(device_count, num_replicas), ...]
-    For example, for 64, it would return [(40, 1), (16, 1)].
-    """
-    remaining = num_gpus
-    result: list[int] = []
-    for size in sorted(DEVICE_OPTIONS[model], reverse=True):
-        while remaining >= size:
-            result.append(size)
-            remaining -= size
-    if remaining > 0:
-        raise ValueError(f"Cannot exactly decompose {num_gpus} with DEVICE_OPTIONS")
-    counts = Counter(result)
-    return sorted(counts.items(), reverse=True)  # Sort by device count descending
-
-
-def _get_large_instance_many_small_combinations(
-    num_gpus: int,
-    model: Model,
-) -> list[tuple[int, int]]:
-    """
-    Get the largest instance possible and then split the rest into 1 GPU instances.
-    For example, for 64, it would return [(40, 1), (1, 16)].
-    """
-    assert num_gpus > 0
-    assert model in DEVICE_OPTIONS
-    assert DEVICE_OPTIONS[model][0] == 1  # must have 1 GPU option to use this function
-
-    remaining_gpus = num_gpus
-    result: list[tuple[int, int]] = []
-    for size in sorted(DEVICE_OPTIONS[model], reverse=True):
-        if remaining_gpus >= size:
-            result = [(size, 1)]
-            remaining_gpus -= size
-            break
-    if remaining_gpus > 0:
-        result.append((1, remaining_gpus))
-    return result
-
-
-def _gen_add_device_replica_actions(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    num_gpus: dict[GPUType, int],
-    remaining_gpus: int,
-    gpu_type: GPUType,
-    model_name: Model,
-    allocation_id: int,
-    workflow: WorkflowConfig,
-    policy: Policy,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-    look_ahead_replicas: int = 3,
-) -> list[Action]:
-    """
-    Generate actions that explore all valid (replicas, devices) provisioning
-    options for a given model allocation, using the remaining GPUs.
-
-    From the current replicas * devices, find the next options by distributing the remaining devices.
-    For example, if currently 2 replicas at parallelism 4 with 4 remaining devices, options include:
-      - 3 replicas, 4 devices  (uses 12 total, 4 more than current 8)
-      - 1 replica, 10 devices  (uses 10 total, 2 more than current 8)
-      - etc.
-    """
-    actions: list[Action] = []
-
-    if model_name in SINGLE_DEVICE_MODELS and _is_single_instance(model_name, workflow):
-        return actions  # No scaling possible
-
-    alloc = models[gpu_type][model_name][allocation_id]
-    current_total = alloc.devices * max(alloc.replicas, 0)
-    current_replicas = alloc.replicas
-    total_available = current_total + remaining_gpus
-
-    max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name)
-    max_replicas = alloc.get_max_replicas(workflow)
-    is_single_instance = _is_single_instance(model_name, workflow)
-    is_single_device = model_name in SINGLE_DEVICE_MODELS
-
-    seen: set[tuple[int, int]] = set()
-    seen.add((max(alloc.replicas, 0), alloc.devices))  # skip current config
-
-    for new_devices in DEVICE_OPTIONS[model_name]:
-        if new_devices > max_num_devices:
-            continue  # Exceeds max parallelism from latency data
-        if is_single_device and new_devices > 1:
-            continue  # Model only supports single device
-        if (model_name, new_devices) not in latency_data[gpu_type]:
-            continue  # No latency data for this device count
-
-        # Determine the range of replicas possible with this device count
-        if is_single_instance:
-            replica_candidates = [1]
-        else:
-            max_r = min(max_replicas, total_available // new_devices) if new_devices > 0 else 0
-            # limit max replicas to original replicas + X to avoid too many combinations
-            max_r = min(max_r, current_replicas + look_ahead_replicas)
-            replica_candidates = list(range(1, max_r + 1))
-
-        for new_replicas in replica_candidates:
-            new_total = new_replicas * new_devices
-            if new_total <= current_total:
-                continue  # Must be an increase
-            if new_total > total_available:
-                continue  # Not enough GPUs
-            if (new_replicas, new_devices) in seen:
-                continue
-            seen.add((new_replicas, new_devices))
-
-            try:
-                new_models = deepcopy(models)
-                new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
-                    model=model_name,
-                    gpu_type=gpu_type,
-                    devices=new_devices,
-                    replicas=new_replicas,
-                )
-                action_result = evaluate_model_allocation(
-                    models=new_models,
-                    num_gpus=num_gpus,
-                    workflow=workflow,
-                    latency_data=latency_data,
-                    power_data=power_data,
-                    policy=policy,
-                    include_models=[model_name],
-                )
-                actions.append(Action(
-                    name=ActionName.ADD_DEVICE_REPLICA,
-                    model=model_name,
-                    gpu_type=gpu_type,
-                    models=new_models,
-                    action_result=action_result,
-                    arrival_time_s=alloc.time,
-                ))
-            except Exception:
-                pass  # Invalid configuration, skip
-
-    return actions
-
-
-def _gen_add_device_action(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    num_gpus: dict[GPUType, int],
-    remaining_gpus: int,
-    gpu_type: GPUType,
-    model_name: Model,
-    allocation_id: int,
-    workflow: WorkflowConfig,
-    policy: Policy,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-) -> Optional[Action]:
-    """
-    Action to add devices (increase parallelism) for a specific model allocation.
-    """
-    action: Optional[Action] = None
-
-    if model_name in SINGLE_DEVICE_MODELS:
-        return action  # These models only run on a single GPU, so we don't add more devices
-
-    alloc = models[gpu_type][model_name][allocation_id]
-
-    max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name)
-    next_num_devices = find_next_devices(
-        DEVICE_OPTIONS[model_name],
-        num_devices=alloc.devices,
-        num_replicas=alloc.replicas,
-        remaining_devices=remaining_gpus,
-        max_num_devices=max_num_devices)
-
-    if not next_num_devices:
-        return action  # No valid next device option, skip
-    if (model_name, next_num_devices) not in latency_data[gpu_type]:
-        return action  # No latency data for this device option, skip
-
-    new_models = deepcopy(models)
-    new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
-        model=model_name,
-        gpu_type=gpu_type,
-        devices=next_num_devices,
-        replicas=max(1, alloc.replicas),
-    )
-    try:
-        action_result = evaluate_model_allocation(
-            models=new_models,
-            num_gpus=num_gpus,
-            workflow=workflow,
-            latency_data=latency_data,
-            power_data=power_data,
-            policy=policy,
-            include_models=[model_name],
-        )
-        action = Action(
-            name=ActionName.ADD_DEVICE,
-            model=model_name,
-            gpu_type=gpu_type,
-            models=new_models,
-            action_result=action_result,
-            arrival_time_s=alloc.time,
-        )
-    except Exception:
-        pass  # Invalid action
-
-    return action
-
-
-def _gen_merge_replicas_actions(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    gpu_type: GPUType,
-    model_name: Model,
-    num_gpus: dict[GPUType, int],
-    workflow: WorkflowConfig,
-    policy: Policy,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-) -> list[Action]:
-    actions: list[Action] = []
-
-    if _is_single_instance(model_name, workflow):
-        return actions  # These models only support a single instance, so no need to merge
-
-    model_instances = models[gpu_type][model_name]
-    model_num_gpus = 0
-    for model_instance in model_instances:
-        model_num_gpus += model_instance.get_num_gpus()
-    if model_num_gpus <= 1:
-        return actions  # No replicas to merge for this model and GPU type
-
-    for device_combos in [
-        _get_min_device_combinations(model_num_gpus, model_name),
-        _get_large_instance_many_small_combinations(model_num_gpus, model_name)
-    ]:
-        new_models = deepcopy(models)
-        new_models[gpu_type][model_name] = []
-
-        for new_num_devices, new_num_replicas in device_combos:
-            new_models[gpu_type][model_name].append(get_model_allocation(
-                model=model_name,
-                gpu_type=gpu_type,
-                devices=new_num_devices,
-                replicas=new_num_replicas,
-            ))
-
-        try:
-            action_result = evaluate_model_allocation(
-                models=new_models,
-                num_gpus=num_gpus,
-                workflow=workflow,
-                latency_data=latency_data,
-                power_data=power_data,
-                policy=policy,
-                include_models=[model_name],
-            )
-
-            instance_id = 0
-            actions.append(Action(
-                name=ActionName.MERGE,
-                model=model_name,
-                gpu_type=gpu_type,
-                models=new_models,
-                action_result=action_result,
-                arrival_time_s=new_models[gpu_type][model_name][instance_id].time,
-            ))
-        except Exception:
-            pass  # Invalid action
-
-    return actions
-
-
-def _gen_add_instance(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    num_gpus: dict[GPUType, int],
-    remaining_gpus: int,
-    gpu_type: GPUType,
-    model_name: Model,
-    workflow: WorkflowConfig,
-    policy: Policy,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-    look_ahead_replicas: int = 3,
-) -> list[Action]:
-    actions: list[Action] = []
-
-    if _is_single_instance(model_name, workflow):
-        return actions  # These models only support a single instance, so we don't add more
-
-    for new_num_devices in DEVICE_OPTIONS[model_name]:
-        for new_num_replicas in list(range(1, look_ahead_replicas + 1)):
-            new_instance = get_model_allocation(
-                model=model_name,
-                gpu_type=gpu_type,
-                devices=new_num_devices,
-                replicas=new_num_replicas,
-            )
-            if new_instance.get_num_gpus() > remaining_gpus:
-                continue  # Not enough remaining GPUs for this new instance
-
-            new_models = deepcopy(models)
-            new_models[gpu_type][model_name].append(new_instance)
-
-            try:
-                action_result = evaluate_model_allocation(
-                    models=new_models,
-                    num_gpus=num_gpus,
-                    workflow=workflow,
-                    latency_data=latency_data,
-                    power_data=power_data,
-                    policy=policy,
-                    include_models=[model_name],
-                )
-                action = Action(
-                    name=ActionName.ADD_INSTANCE,
-                    model=model_name,
-                    gpu_type=gpu_type,
-                    models=new_models,
-                    action_result=action_result,
-                    arrival_time_s=new_instance.time,
-                )
-                actions.append(action)
-            except Exception:
-                pass  # Invalid action
-
-    return actions
-
-
-def _gen_remove_replica_action(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    num_gpus: dict[GPUType, int],
-    gpu_type: GPUType,
-    model_name: Model,
-    allocation_id: int,
-    workflow: WorkflowConfig,
-    policy: Policy,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-) -> Optional[Action]:
-    action: Optional[Action] = None
-
-    model = models[gpu_type][model_name][allocation_id]
-
-    if model.replicas == 0:
-        return action  # No replicas to remove for this model and GPU type
-
-    new_models = deepcopy(models)
-    new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
-        model=model_name,
-        gpu_type=gpu_type,
-        devices=model.devices,
-        replicas=model.replicas - 1,
-    )
-
-    if len(num_gpus) == 2:
-        # For dual GPU setting, initialize removed replica on the other GPU type to see if it improves performance
-        gpu_types = list(num_gpus.keys())
-        other_gpu_type = gpu_types[0] if gpu_type == gpu_types[1] else gpu_types[1]
-        if _is_single_instance(model_name, workflow):
-            if new_models[gpu_type][model_name][allocation_id].replicas == 0:
-                # If this is a single instance model and we're removing the only replica, add it to the other GPU type
-                new_models[other_gpu_type][model_name].append(get_model_allocation(
-                    model=model_name,
-                    gpu_type=other_gpu_type,
-                    devices=model.devices,
-                    replicas=1,
-                ))
-
-    try:
-        action_result = evaluate_model_allocation(
-            models=new_models,
-            num_gpus=num_gpus,
-            workflow=workflow,
-            latency_data=latency_data,
-            power_data=power_data,
-            policy=policy,
-            include_models=[model_name],
-        )
-        action = Action(
-            name=ActionName.REMOVE_REPLICA,
-            model=model_name,
-            gpu_type=gpu_type,
-            models=new_models,
-            action_result=action_result,
-            arrival_time_s=new_models[gpu_type][model_name][allocation_id].time,
-        )
-    except Exception:
-        pass  # Ignore not possible action
-    return action
-
-
-def _gen_add_replica_action(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    num_gpus: dict[GPUType, int],
-    remaining_gpus: int,
-    gpu_type: GPUType,
-    model_name: Model,
-    allocation_id: int,
-    workflow: WorkflowConfig,
-    policy: Policy,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData] = None,
-) -> Optional[Action]:
-    """
-    Action to add replicas for a specific model allocation.
-    """
-    action: Optional[Action] = None
-
-    if _is_single_instance(model_name, workflow):
-        return action  # These models don't support replication, so we skip
-
-    model = models[gpu_type][model_name][allocation_id]
-
-    if remaining_gpus < model.devices:
-        return action  # Not enough remaining GPUs to add another replica
-
-    max_replicas = model.get_max_replicas(workflow)
-    if model.replicas >= max_replicas:
-        return action  # Already at max replicas, skip
-
-    new_num_replicas = min(
-        model.replicas + 1,
-        max_replicas,  # - models[other_gpu_type][Model.HF].replicas
-        model.replicas + remaining_gpus // model.devices
-    )
-    if new_num_replicas == model.replicas:
-        return action  # No changes, skip
-
-    new_models = deepcopy(models)
-    new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
-        model=model_name,
-        gpu_type=gpu_type,
-        devices=model.devices,
-        replicas=new_num_replicas,
-    )
-
-    try:
-        action_result = evaluate_model_allocation(
-            models=new_models,
-            num_gpus=num_gpus,
-            workflow=workflow,
-            latency_data=latency_data,
-            power_data=power_data,
-            policy=policy,
-            include_models=[model_name],
-        )
-        action = Action(
-            name=ActionName.ADD_REPLICA,
-            model=model_name,
-            gpu_type=gpu_type,
-            models=new_models,
-            action_result=action_result,
-            arrival_time_s=model.time,
-        )
-    except Exception:
-        pass  # Invalid action
-
-    return action
-
-
-def max_time(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    model_name: Model,
-) -> float:
-    values = []
-    for models_gpu in models.values():
-        if model_name in models_gpu:
-            for alloc in models_gpu[model_name]:
-                values.append(alloc.time)
-    return max(values)
diff --git a/simulator/auto_model_allocator.py b/simulator/auto_model_allocator.py
deleted file mode 100644
index ea0fda61..00000000
--- a/simulator/auto_model_allocator.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""
-Factory helpers for selecting the right model allocator implementation.
-"""
-
-from __future__ import annotations
-
-import logging
-
-from dataclasses import replace
-from typing import Optional
-
-from sim_types import Policy
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
-from sim_types import Model
-from sim_types import PowerData
-from sim_types import QualityLevel
-from sim_types import Solver
-from sim_types import GPUType
-from sim_types import Result
-
-from policies import STREAMWISE_POLICY
-
-from model_allocator import ModelAllocator
-
-
-class AutoModelAllocator(ModelAllocator):
-    """Allocator wrapper that routes to a concrete allocator by solver."""
-
-    policy: Policy
-
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = STREAMWISE_POLICY,
-    ) -> None:
-        super().__init__(
-            workflow=workflow,
-            latency_data=latency_data,
-            power_data=power_data,
-            policy=policy,
-        )
-        self._allocator = self._build_allocator()
-
-    def _build_allocator(self) -> ModelAllocator:
-        """Create concrete allocator based on configured solver."""
-        if self.policy.solver == Solver.GREEDY:
-            from greedy import GreedyAllocator
-            return GreedyAllocator(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-            )
-        if self.policy.solver == Solver.NAIVE:
-            from naive_baseline import NaiveAllocator
-            return NaiveAllocator(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-            )
-        if self.policy.solver in {Solver.GUROBI, Solver.HIGHS}:
-            from milp import MILPAllocator
-            return MILPAllocator(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-            )
-        if self.policy.solver == Solver.HEXGEN:
-            from hexgen import HexGenAllocator
-            return HexGenAllocator(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-            )
-        if self.policy.solver == Solver.HELIX:
-            from helix import HelixAllocator
-            return HelixAllocator(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-            )
-        raise ValueError(f"Unsupported solver for allocator selection: {self.policy.solver}")
-
-    def allocate(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-    ) -> Result:
-        if self.policy.use_upscaler and self.workflow.target_resolution == QualityLevel.LOW:
-            logging.warning(
-                f"Policy {self.policy.name} uses upscaler, but workflow target resolution is LOW. "
-                f"Disabling upscaler for this allocation.")
-            self.policy = replace(self.policy, use_upscaler=False)
-            self._allocator.policy = self.policy
-            # Remove upscaler from model work
-            self.workflow.model_work.pop(Model.UPSCALER, None)
-            self._allocator.workflow = self.workflow
-
-        return self._allocator.allocate(
-            num_gpus=num_gpus,
-            verbose=verbose,
-        )
diff --git a/simulator/constants.py b/simulator/constants.py
deleted file mode 100644
index bb6f9034..00000000
--- a/simulator/constants.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from __future__ import annotations
-
-import math
-
-from sim_types import WorkflowConfig
-from sim_types import GPUType
-from sim_types import Model
-
-
-SECONDS_IN_MINUTE = 60.0
-SECONDS_IN_HOUR = 60.0 * 60.0
-
-# Video resolution constants (16:10)
-NUM_PIXELS_ORIGINAL = 1280 * 800
-NUM_PIXELS_ORIGINAL_FLUX = 1280 * 800
-NUM_PIXELS_ORIGINAL_HF = 512 * 320
-NUM_PIXELS_ORIGINAL_FT = 640 * 400
-NUM_PIXELS_ORIGINAL_UPSCALER = 1280 * 800
-
-NUM_PIXELS_MEDIUM = 640 * 400
-NUM_PIXELS_MEDIUM_FLUX = 640 * 400
-NUM_PIXELS_MEDIUM_HF = 256 * 160
-NUM_PIXELS_MEDIUM_FT = 320 * 200
-NUM_PIXELS_MEDIUM_UPSCALER = 640 * 400
-
-NUM_PIXELS_LOW = 320 * 200
-NUM_PIXELS_LOW_FLUX = 320 * 200
-NUM_PIXELS_LOW_HF = 128 * 80
-NUM_PIXELS_LOW_FT = 160 * 100
-NUM_PIXELS_LOW_UPSCALER = 320 * 200
-
-# StreamCast constants
-TOTAL_INPUT_TOKENS = 20 * 1024  # 20K tokens for instructions, PDFs, etc.
-TOTAL_VIDEO_SECONDS = 10 * 60  # 10 minutes video
-TOTAL_SUBSCENES = 172  # each subscene is 3.5 seconds -> limited by fantasytalking 81 frames at 23 FPS
-TOTAL_SCENES = 43  # each scene is 4 subscenes
-FPS: dict[Model, float] = {
-    Model.HF: 30,
-    Model.FT: 23,
-}
-NUM_STEPS: dict[Model, int] = {
-    Model.FLUX: 25,
-    Model.HF: 10,
-    Model.FT: 10,
-}
-FRAMES_OPTIONS: dict[Model, list[int]] = {
-    Model.HF: [36, 72, 108, 144, 324],
-    Model.FT: [9, 21, 41, 61, 77],
-}
-FRAMES_PER_STEP_IDX = 4
-
-DEFAULT_WORKFLOW_CONFIG = WorkflowConfig(
-    total_video_seconds=TOTAL_VIDEO_SECONDS,
-    total_scenes=TOTAL_SCENES,
-    total_frames={
-        Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF]),
-        Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT]),
-    },
-    total_subscenes=TOTAL_SUBSCENES,
-    per_subscene_frames={
-        Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF] / TOTAL_SUBSCENES),
-        Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT] / TOTAL_SUBSCENES),
-    },
-    # default per-frame number of denoising steps
-    num_steps=dict(NUM_STEPS),
-    # supported number of generation frames
-    hf_frames=FRAMES_OPTIONS[Model.HF],
-    ft_frames=FRAMES_OPTIONS[Model.FT],
-    frames_per_step_idx=FRAMES_PER_STEP_IDX,
-    total_input_tokens=TOTAL_INPUT_TOKENS,
-)
-
-# Available device counts for scaling
-# Tensor parallelism (TP) or sequence parallelism (SP)
-DEVICE_OPTIONS = {
-    Model.GEMMA: [1, 2, 4, 8],
-    Model.FLUX: [1, 2, 4, 8, 16],
-    Model.OTHERS: [1],  # Single GPU, no parallelism
-    Model.HF: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40],
-    Model.HF_VAE: [1],  # Single GPU, no parallelism
-    Model.FT: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40],
-    Model.FT_VAE: [1],  # Single GPU, no parallelism
-    Model.UPSCALER: [1, 2, 4, 8],  # Single GPU, no parallelism
-}
-
-# Models that only have one instance in the system, so not scaling them across GPU types
-SINGLE_INSTANCE_MODELS = [
-    Model.GEMMA,
-    Model.FLUX,
-    Model.OTHERS,
-]
-
-# Models that can only be run on a single GPU
-SINGLE_DEVICE_MODELS = [
-    Model.OTHERS,
-    Model.HF_VAE,
-    Model.FT_VAE,
-]
-
-
-NUM_GPUS_PER_SERVER = {
-    GPUType.A100: 8,
-    GPUType.H100: 8,
-    GPUType.H200: 8,
-    GPUType.GB200: 8,  # This is technically 4 GPUs per server, but nothing fits
-}
-
-
-POWER_GPU_IDLE = {
-    GPUType.A100: 65.0,  # Watts
-    GPUType.H100: 80.0,  # Watts TODO placeholder value
-    GPUType.H200: 80.0,  # Watts TODO placeholder value
-    GPUType.GB200: 170.0,  # Watts
-}
-
-
-POWER_GPU_TDP = {
-    GPUType.A100: 400.0,  # Watts
-    GPUType.H100: 700.0,  # Watts
-    GPUType.H200: 700.0,  # Watts
-    GPUType.GB200: 1200.0,  # Watts
-}
-
-
-# Cost per GPU
-GPU_SPOT_COST = {
-    # $ / hour (Spot prices)
-    GPUType.A100: 1.07,  # $8.56 for 8 GPUs
-    GPUType.H100: 4.03,  # $32.24 for 8 GPUs
-    GPUType.H200: 4.22,  # $33.76 for 8 GPUs
-    GPUType.GB200: 10.76  # $43.04 for 4 GPUs
-}
-
-GPU_RESERVED_COST = {
-    # $ / hour (Reserved prices)
-    GPUType.A100: 3.4,  # $27.2 for 8 GPUs
-    GPUType.H100: 5.39,  # $43.12 for 8 GPUs
-    GPUType.H200: 5.64,  # $45.12 for 8 GPUs
-    GPUType.GB200: 14.42  # $57.68 for 4 GPUs
-}
-
-GPU_COST = GPU_SPOT_COST
diff --git a/simulator/data_loading.py b/simulator/data_loading.py
deleted file mode 100644
index 6ee59ec5..00000000
--- a/simulator/data_loading.py
+++ /dev/null
@@ -1,298 +0,0 @@
-"""
-Module for loading latency and power consumption data from CSV files.
-"""
-
-import pandas as pd
-
-from pathlib import Path
-
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import GPUType
-from sim_types import LatencyGPUTypeData
-from sim_types import PowerGPUTypeData
-from sim_types import QualityLevel
-
-from constants import NUM_PIXELS_ORIGINAL_UPSCALER
-from constants import NUM_PIXELS_ORIGINAL_FT
-from constants import NUM_PIXELS_ORIGINAL_HF
-from constants import NUM_PIXELS_ORIGINAL_FLUX
-from constants import NUM_PIXELS_LOW_FT
-from constants import NUM_PIXELS_LOW_HF
-from constants import NUM_PIXELS_LOW_FLUX
-from constants import NUM_PIXELS_LOW_UPSCALER
-from constants import NUM_PIXELS_MEDIUM_FT
-from constants import NUM_PIXELS_MEDIUM_HF
-from constants import NUM_PIXELS_MEDIUM_UPSCALER
-from constants import NUM_PIXELS_MEDIUM_FLUX
-from constants import POWER_GPU_IDLE
-from constants import POWER_GPU_TDP
-
-
-def load_latency_data(
-    data_dir: str = "data/",
-) -> LatencyData:
-    """
-    Load latency and throughput mapping data from CSV files.
-
-    Args:
-        data_dir (str): The directory where the CSV files are stored.
-    Returns:
-        LatencyData: An object containing all loaded latency data.
-    """
-    data_path = Path(data_dir)
-
-    data = LatencyData(gpus={})
-    for gpu_type in GPUType:
-        data.gpus[gpu_type] = LatencyGPUTypeData(gpu_type=gpu_type)
-
-        # Flux time -> per image generation
-        csv_flux_path = data_path / f"latency_flux_mapping_{gpu_type.value.lower()}.csv"
-        df_flux = pd.read_csv(csv_flux_path, comment='#')
-        data[gpu_type].flux = dict(zip(
-            df_flux["world_size"],
-            df_flux["avg_steps_time"]))
-
-        # Hunyuan Framepack per step time -> [36, 72, 108, 144, 324] frames generation
-        csv_hf_path = data_path / f"latency_hf_mapping_{gpu_type.value.lower()}.csv"
-        df_hf = pd.read_csv(csv_hf_path, comment='#')
-        data[gpu_type].hf = dict(zip(
-            df_hf["world_size"],
-            df_hf["avg_steps_time"]))
-
-        # Hunyuan Framepack VAE time -> per inference iteration
-        # Derived: steps * avg_step_time * vae_pct(vae_time / total_time)
-        data[gpu_type].hf_vae = dict(zip(
-            df_hf["world_size"],
-            df_hf["vae_time"]))
-
-        # Fantasy Talking per step time -> [9, 21, 41, 61, 77] frames generation
-        csv_ft_path = data_path / f"latency_ft_mapping_{gpu_type.value.lower()}.csv"
-        df_ft = pd.read_csv(csv_ft_path, comment='#')
-        data[gpu_type].ft = dict(zip(
-            df_ft["world_size"],
-            df_ft["avg_steps_time"]))
-
-        # Fantasy Talking VAE time -> per inference iteration
-        # Derived: steps * avg_step_time * vae_pct(vae_time / total_time)
-        data[gpu_type].ft_vae = dict(zip(
-            df_ft["world_size"],
-            df_ft["vae_time"]))
-
-        # Upscaler time -> per image frame
-        csv_upscaler_path = data_path / f"latency_upscaler_{gpu_type.value.lower()}.csv"
-        df_upscaler = pd.read_csv(csv_upscaler_path, comment='#')
-        data[gpu_type].upscaler = dict(zip(
-            df_upscaler['world_size'],
-            df_upscaler['avg_steps_time']))
-
-        # Gemma time -> first scene and per scene
-        csv_gemma_path = data_path / f"latency_gemma_{gpu_type.value.lower()}.csv"
-        df_gemma = pd.read_csv(csv_gemma_path, comment='#')
-        data[gpu_type].gemma_first_scene = dict(zip(
-            df_gemma['tp'],
-            df_gemma['first_scene_time']))
-        data[gpu_type].gemma_per_scene = dict(zip(
-            df_gemma['tp'],
-            df_gemma['per_scene_time']))
-
-        # Others time -> kokoro and other overheads -> time per scene
-        csv_others_path = data_path / f"latency_others_{gpu_type.value.lower()}.csv"
-        df_others = pd.read_csv(csv_others_path, comment='#')
-        data[gpu_type].others = dict(zip(
-            df_others['world_size'],
-            df_others['time']))
-
-    return data
-
-
-def load_power_data(
-    data_dir: str = "data/"
-) -> PowerData:
-    """
-    Load power consumption data from CSV files.
-
-    Args:
-        data_dir (str): The directory where the CSV files are stored.
-    Returns:
-        PowerData: An object containing all loaded power consumption data.
-    """
-    data_path = Path(data_dir)
-
-    data = PowerData(gpus={})
-    for gpu_type in GPUType:
-        data.gpus[gpu_type] = PowerGPUTypeData(gpu_type=gpu_type)
-
-        # Flux power profile
-        power_flux_file_name = data_path / f'power_flux_mapping_{gpu_type.value.lower()}.csv'
-        power_flux_df = pd.read_csv(power_flux_file_name, comment='#')
-        data[gpu_type].flux = dict(zip(
-            power_flux_df['world_size'],
-            power_flux_df['power_watts']))
-
-        # Hunyuan Framepack 640x400 power profile
-        power_hf_file_name = data_path / f'power_hf_mapping_{gpu_type.value.lower()}.csv'
-        power_hf_df = pd.read_csv(power_hf_file_name, comment='#')
-        data[gpu_type].hf = dict(zip(
-            power_hf_df['world_size'],
-            power_hf_df['power_watts']))
-
-        # Hunyuan Framepack 1280x800 power profile
-        power_hf_file_name_high = data_path / f'power_hf_mapping_{gpu_type.value.lower()}_high.csv'
-        power_hf_high_df = pd.read_csv(power_hf_file_name_high, comment='#')
-        data[gpu_type].hf_high = dict(zip(
-            power_hf_high_df['world_size'],
-            power_hf_high_df['power_watts']))
-
-        # Hunyuan Framepack VAE power profile
-        power_hf_vae_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}.csv'
-        power_hf_vae_df = pd.read_csv(power_hf_vae_file_name, comment='#')
-        data[gpu_type].hf_vae = dict(zip(
-            power_hf_vae_df['world_size'],
-            power_hf_vae_df['power_watts']))
-
-        # Hunyuan Framepack VAE high power profile
-        power_hf_vae_high_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}_high.csv'
-        power_hf_vae_high_df = pd.read_csv(power_hf_vae_high_file_name, comment='#')
-        data[gpu_type].hf_vae_high = dict(zip(
-            power_hf_vae_high_df['world_size'],
-            power_hf_vae_high_df['power_watts']))
-
-        # Fantasy Talking 640x400 power profile
-        power_ft_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}.csv'
-        power_ft_df = pd.read_csv(power_ft_file_name, comment='#')
-        data[gpu_type].ft = dict(zip(
-            power_ft_df['world_size'],
-            power_ft_df['power_watts']))
-
-        # Fantasy Talking 1280x800 power profile
-        power_ft_high_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}_high.csv'
-        power_ft_high_df = pd.read_csv(power_ft_high_file_name, comment='#')
-        data[gpu_type].ft_high = dict(zip(
-            power_ft_high_df['world_size'],
-            power_ft_high_df['power_watts']))
-
-        # Fantasy Talking VAE mapping
-        power_ft_vae_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}.csv'
-        power_ft_vae_df = pd.read_csv(power_ft_vae_file_name, comment='#')
-        data[gpu_type].ft_vae = dict(zip(
-            power_ft_vae_df['world_size'],
-            power_ft_vae_df['power_watts']))
-
-        # Fantasy Talking VAE high mapping
-        power_ft_vae_high_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}_high.csv'
-        power_ft_vae_high_df = pd.read_csv(power_ft_vae_high_file_name, comment='#')
-        data[gpu_type].ft_vae_high = dict(zip(
-            power_ft_vae_high_df['world_size'],
-            power_ft_vae_high_df['power_watts']))
-
-        # Upscaler power profile
-        power_upscaler_file_name = data_path / f'power_upscaler_{gpu_type.value.lower()}.csv'
-        power_upscaler_df = pd.read_csv(power_upscaler_file_name, comment='#')
-        data[gpu_type].upscaler = dict(zip(
-            power_upscaler_df['world_size'],
-            power_upscaler_df['power_watts']))
-
-        # Gemma power profile
-        power_gemma_first_scene_file_name = data_path / f'power_gemma_first_scene_{gpu_type.value.lower()}.csv'
-        power_gemma_per_scene_file_name = data_path / f'power_gemma_per_scene_{gpu_type.value.lower()}.csv'
-        power_gemma_first_scene_df = pd.read_csv(power_gemma_first_scene_file_name, comment='#')
-        power_gemma_per_scene_df = pd.read_csv(power_gemma_per_scene_file_name, comment='#')
-        data[gpu_type].gemma_first_scene = dict(zip(
-            power_gemma_first_scene_df['world_size'],
-            power_gemma_first_scene_df['power_watts']
-        ))
-        data[gpu_type].gemma_per_scene = dict(zip(
-            power_gemma_per_scene_df['world_size'],
-            power_gemma_per_scene_df['power_watts']
-        ))
-
-    # Idle and TDP power profiles
-    for gpu_type in GPUType:
-        data[gpu_type].idle = POWER_GPU_IDLE[gpu_type]
-        data[gpu_type].tdp = POWER_GPU_TDP[gpu_type]
-
-    return data
-
-
-def load_adaptive_quality_data(
-    data_dir: str,
-    level: QualityLevel,
-) -> LatencyData:
-    """Load latency data for adaptive quality."""
-    assert isinstance(level, QualityLevel)
-
-    latency_data = load_latency_data(data_dir=data_dir)
-
-    if level == QualityLevel.ORIGINAL or level == QualityLevel.HIGH:
-        return latency_data
-
-    if level == QualityLevel.MEDIUM:
-        ratio_flux = NUM_PIXELS_MEDIUM_FLUX / NUM_PIXELS_ORIGINAL_FLUX
-        ratio_hf = NUM_PIXELS_MEDIUM_HF / NUM_PIXELS_ORIGINAL_HF
-        ratio_hf_vae = NUM_PIXELS_MEDIUM_HF / NUM_PIXELS_ORIGINAL_HF
-        ratio_ft = NUM_PIXELS_MEDIUM_FT / NUM_PIXELS_ORIGINAL_FT
-        ratio_ft_vae = NUM_PIXELS_MEDIUM_FT / NUM_PIXELS_ORIGINAL_FT
-        ratio_upscaler = NUM_PIXELS_MEDIUM_UPSCALER / NUM_PIXELS_ORIGINAL_UPSCALER
-        for gpu_type in GPUType:
-            latency_data[gpu_type].flux = {
-                k: v * ratio_flux
-                for k, v in latency_data[gpu_type].flux.items()
-            }
-            latency_data[gpu_type].hf = {
-                k: v * ratio_hf
-                for k, v in latency_data[gpu_type].hf.items()
-            }
-            latency_data[gpu_type].hf_vae = {
-                k: v * ratio_hf_vae
-                for k, v in latency_data[gpu_type].hf_vae.items()
-            }
-            latency_data[gpu_type].ft = {
-                k: v * ratio_ft
-                for k, v in latency_data[gpu_type].ft.items()
-            }
-            latency_data[gpu_type].ft_vae = {
-                k: v * ratio_ft_vae
-                for k, v in latency_data[gpu_type].ft_vae.items()
-            }
-            latency_data[gpu_type].upscaler = {
-                k: v * ratio_upscaler
-                for k, v in latency_data[gpu_type].upscaler.items()
-            }
-        return latency_data
-
-    if level == QualityLevel.LOW:
-        ratio_flux = NUM_PIXELS_LOW_FLUX / NUM_PIXELS_ORIGINAL_FLUX
-        ratio_hf = NUM_PIXELS_LOW_HF / NUM_PIXELS_ORIGINAL_HF
-        ratio_hf_vae = NUM_PIXELS_LOW_HF / NUM_PIXELS_ORIGINAL_HF
-        ratio_ft = NUM_PIXELS_LOW_FT / NUM_PIXELS_ORIGINAL_FT
-        ratio_ft_vae = NUM_PIXELS_LOW_FT / NUM_PIXELS_ORIGINAL_FT
-        ratio_upscaler = NUM_PIXELS_LOW_UPSCALER / NUM_PIXELS_ORIGINAL_UPSCALER
-        for gpu_type in GPUType:
-            latency_data[gpu_type].flux = {
-                k: v * ratio_flux
-                for k, v in latency_data[gpu_type].flux.items()
-            }
-            latency_data[gpu_type].hf = {
-                k: v * ratio_hf
-                for k, v in latency_data[gpu_type].hf.items()
-            }
-            latency_data[gpu_type].hf_vae = {
-                k: v * ratio_hf_vae
-                for k, v in latency_data[gpu_type].hf_vae.items()
-            }
-            latency_data[gpu_type].ft = {
-                k: v * ratio_ft
-                for k, v in latency_data[gpu_type].ft.items()
-            }
-            latency_data[gpu_type].ft_vae = {
-                k: v * ratio_ft_vae
-                for k, v in latency_data[gpu_type].ft_vae.items()
-            }
-            latency_data[gpu_type].upscaler = {
-                k: v * ratio_upscaler
-                for k, v in latency_data[gpu_type].upscaler.items()
-            }
-        return latency_data
-
-    return latency_data
diff --git a/simulator/evaluator.py b/simulator/evaluator.py
deleted file mode 100644
index a9730bb2..00000000
--- a/simulator/evaluator.py
+++ /dev/null
@@ -1,414 +0,0 @@
-"""
-Evaluate the performance of a given model allocation in terms of time, energy, and cost.
-It includes some assertions (e.g., only one instance of Gemma and Flux).
-"""
-from __future__ import annotations
-
-import math
-import logging
-
-from typing import Optional
-
-from constants import NUM_GPUS_PER_SERVER
-from constants import TOTAL_INPUT_TOKENS
-from constants import SECONDS_IN_HOUR
-
-from sim_types import Result
-from sim_types import GPUType
-from sim_types import WorkflowConfig
-from sim_types import PowerData
-from sim_types import LatencyData
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import Policy
-
-from sim_types_json import models_to_json
-from sim_types_json import workflow_to_json
-from sim_types_json import policy_to_json
-
-
-def _count_instances(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    model: Model,
-) -> int:
-    num_instances = 0
-    for model_gpus in models.values():
-        if model in model_gpus:
-            for model_allocation in model_gpus[model]:
-                if model_allocation.get_num_gpus() > 0:
-                    num_instances += 1
-    return num_instances
-
-
-def _assert_single_instance(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    model: Model,
-) -> None:
-    num_instances = _count_instances(models, model)
-    assert num_instances == 1, f"Expected exactly one instance of {model}, but found {num_instances}"
-
-
-def _assert_at_least_one_instance(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    model: Model,
-) -> None:
-    num_instances = _count_instances(models, model)
-    assert num_instances > 0, f"Expected at least one instance of {model}, but found {num_instances}"
-
-
-def _assert_no_instances(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    model: Model,
-) -> None:
-    num_instances = _count_instances(models, model)
-    assert num_instances == 0, f"Expected no instances of {model}, but found {num_instances}"
-
-
-def evaluate_times(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    latency_data: LatencyData,
-    workflow: WorkflowConfig,
-    policy: Policy,
-    include_models: Optional[list[Model]] = None,
-) -> None:
-    """
-    Compute the total time for the given model allocation and workflow, using the latency data.
-    It only evaluates the models specified in "include_models" if provided.
-    """
-    gpu_types = list(models.keys())
-
-    upscaler_gpus = sum(
-        model_alloc.get_num_gpus()
-        for gpu_type in gpu_types
-        for model_alloc in models.get(gpu_type, {}).get(Model.UPSCALER, [])
-    )
-    if not policy.use_upscaler:
-        assert upscaler_gpus == 0
-
-    for model_name in workflow.models:
-        if include_models is not None and model_name not in include_models:
-            continue
-
-        # Special conditions: models that require a policy flag
-        if model_name == Model.HF_VAE and not policy.is_disaggregated(Model.HF):
-            _assert_no_instances(models, Model.HF_VAE)
-            continue
-        if model_name == Model.FT_VAE and not policy.is_disaggregated(Model.FT):
-            _assert_no_instances(models, Model.FT_VAE)
-            continue
-        if model_name == Model.UPSCALER and not policy.use_upscaler:
-            _assert_no_instances(models, Model.UPSCALER)
-            continue
-
-        _assert_at_least_one_instance(models, model_name)
-
-        if not workflow.is_parallelizable(model_name):
-            # Single-instance: no work splitting
-            for gpu_type in gpu_types:
-                if model_name in models[gpu_type]:
-                    for model_alloc in models[gpu_type][model_name]:
-                        model_alloc.calculate_time(
-                            policy, workflow, latency_data)
-                        model_alloc.calculate_time_first(
-                            policy, workflow, latency_data)
-            continue
-
-        # Parallel: capacity-based work splitting (throughput-weighted)
-        capacities: dict[GPUType, list[float]] = {}
-        for gpu_type in gpu_types:
-            capacities[gpu_type] = []
-            if model_name not in models[gpu_type]:
-                continue
-            for model_alloc in models[gpu_type][model_name]:
-                if model_alloc.get_num_gpus() > 0:
-                    latency = latency_data[gpu_type][model_name, model_alloc.devices]
-                    # When not disaggregated, include VAE overhead in capacity
-                    if model_name == Model.FT and not policy.is_disaggregated(Model.FT):
-                        latency += latency_data[gpu_type][Model.FT_VAE, 1] / workflow.num_steps[Model.FT]
-                    if model_name == Model.HF and not policy.is_disaggregated(Model.HF):
-                        latency += latency_data[gpu_type][Model.HF_VAE, 1] / workflow.num_steps[Model.HF]
-                    if model_name in (Model.HF, Model.HF_VAE, Model.FT, Model.FT_VAE):
-                        latency *= workflow.get_resolution_scale(policy.use_upscaler)
-                    if model_name == Model.GEMMA:
-                        latency *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
-                    if latency == 0:
-                        capacities[gpu_type].append(0.0)
-                    else:
-                        capacities[gpu_type].append(model_alloc.replicas / latency)
-
-        total_capacity = sum(sum(c) for c in capacities.values())
-        for gpu_type in gpu_types:
-            if model_name not in models[gpu_type]:
-                continue
-            cap_idx = 0
-            for model_alloc in models[gpu_type][model_name]:
-                if model_alloc.get_num_gpus() > 0:
-                    work_pct = capacities[gpu_type][cap_idx] / total_capacity if total_capacity > 0 else 0.0
-                    model_alloc.calculate_time(
-                        policy, workflow, latency_data,
-                        work_pct=work_pct)
-                    model_alloc.calculate_time_first(
-                        policy, workflow, latency_data)
-                    cap_idx += 1
-
-
-def evaluate_energy(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    power_data: PowerData,
-    workflow: WorkflowConfig,
-    total_time_s: float = 0.0,
-) -> None:
-    """
-    Calculate total energy (power * time * replicas for each model).
-    Need to run after evaluate_times since energy calculation depends on time.
-    """
-    for gpu_type_allocs in models.values():
-        for model_allocation_list in gpu_type_allocs.values():
-            for model_allocation in model_allocation_list:
-                model_allocation.calculate_energy(
-                    workflow,
-                    power_data,
-                    total_time_s)
-
-
-def evaluate_cost(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    total_time_s: float,
-    policy: Policy,
-) -> None:
-    """
-    Calculate total cost based on GPU hours used.
-    Need to run after evaluate_times since cost calculation depends on time.
-    """
-    for gpu_type_allocs in models.values():
-        for model_allocation_list in gpu_type_allocs.values():
-            for model in model_allocation_list:
-                model.calculate_cost(policy, total_time_s)
-
-
-_EVALUATOR_CACHE: dict[str, Result] = {}
-
-
-def evaluate_model_allocation(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    num_gpus: dict[GPUType, int],
-    workflow: WorkflowConfig,
-    latency_data: LatencyData,
-    power_data: Optional[PowerData],
-    policy: Policy,
-    include_models: Optional[list[Model]] = None,
-    cache_results: bool = False,
-    round_up_cost_to_server: bool = False,
-) -> Result:
-    """
-    Evaluate the metrics for a given allocation of models to GPUs.
-    It only evaluates the models in "include_models" if specified.
-    """
-    cache_key = None
-    if cache_results:
-        cache_key = models_to_json(models) + \
-            workflow_to_json(workflow) + \
-            str(latency_data) + \
-            str(power_data) + \
-            policy_to_json(policy) + \
-            str(include_models)
-        if cache_key in _EVALUATOR_CACHE:
-            return _EVALUATOR_CACHE[cache_key]
-
-    # Check if setup is possible
-    gpus_used = {}
-    for gpu_type, model_gpu in models.items():
-        gpus_used[gpu_type] = calc_used_gpus({gpu_type: model_gpu})
-        assert num_gpus[gpu_type] % NUM_GPUS_PER_SERVER[gpu_type] == 0, \
-            f"{gpu_type.value}: {num_gpus[gpu_type]} % {NUM_GPUS_PER_SERVER[gpu_type]}"
-        assert gpus_used[gpu_type] <= num_gpus[gpu_type], \
-            f"{gpu_type.value}: {gpus_used[gpu_type]} > {num_gpus[gpu_type]}"
-
-    # Assert input models are built correctly
-    for gpu_type in models.keys():
-        for model_name in models[gpu_type].keys():
-            for instance_id in range(len(models[gpu_type][model_name])):
-                assert models[gpu_type][model_name][instance_id].model == model_name
-                assert models[gpu_type][model_name][instance_id].gpu_type == gpu_type
-
-    # Actual evaluation
-    evaluate_times(
-        models, latency_data, workflow, policy,
-        include_models=include_models,
-    )
-    time_s = calc_total_time(models)
-
-    first_chunk_time = calc_ttff(models)
-    ttff_s = max(
-        first_chunk_time,
-        time_s - workflow.total_video_seconds
-    )
-
-    num_frames = (workflow.total_frames[Model.FT] - workflow.per_subscene_frames[Model.FT])
-    tbf_s = (time_s - first_chunk_time) / num_frames
-    if tbf_s < 0:
-        logging.debug(
-            f"Negative TBF: "
-            F"{tbf_s:.2f} = ({time_s:.2f} - {first_chunk_time:.2f}) / {num_frames}")
-        tbf_s = 0.0
-
-    # Calculate total energy (power * time * replicas for each model)
-    energy = 0.0
-    if power_data is not None:
-        evaluate_energy(models, power_data, workflow, time_s)
-        energy = calc_energy(models=models)
-
-    evaluate_cost(models, time_s, policy)
-    cost = calc_cost(
-        models, time_s, policy,
-        round_up_to_server=round_up_cost_to_server)
-
-    ret = Result(
-        models=models,
-        gpus_used=gpus_used,
-        gpus_total=num_gpus,
-        total_time_s=time_s,
-        first_chunk_time=first_chunk_time,
-        ttff_s=ttff_s,
-        tbf_s=tbf_s,
-        total_energy=energy if power_data else 0.0,
-        cost=cost,
-    )
-
-    if cache_key is not None:
-        _EVALUATOR_CACHE[cache_key] = ret
-
-    return ret
-
-
-def calc_energy(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> float:
-    """
-    Calculate total energy (power * time * replicas for each model).
-    Energy in Watt x seconds (Joules).
-    This assumes that evaluate_energy() has been called already.
-    """
-    energy = 0.0  # Total energy in Watt-seconds (Joules = Watt x second)
-    for model_dict in models.values():
-        for model_allocations in model_dict.values():
-            for model_allocation in model_allocations:
-                energy += model_allocation.energy
-    return energy
-
-
-def calc_model_cost(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> float:
-    """
-    Calculate total cost based on GPU hours used.
-    This assumes that evaluate_cost() has been called already.
-    """
-    costs = {}
-    for gpu_type, model_dict in models.items():
-        costs[gpu_type] = 0.0
-        for model_allocations in model_dict.values():
-            for model_allocation in model_allocations:
-                costs[gpu_type] += model_allocation.cost
-    return sum(costs.values())
-
-
-def calc_cost(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    time_s: float,
-    policy: Policy,
-    round_up_to_server: bool = True,
-) -> float:
-    """
-    Calculate total cost based on GPU hours used.
-    """
-    used_gpus = calc_used_gpus_per_type(models)
-
-    # Round up to the nearest server (pack of GPUs) since we pay for whole servers
-    if round_up_to_server:
-        for gpu_type, used in used_gpus.items():
-            used_pack = math.ceil(used / NUM_GPUS_PER_SERVER[gpu_type]) * NUM_GPUS_PER_SERVER[gpu_type]
-            used_gpus[gpu_type] = used_pack
-
-    return calc_cost_total(used_gpus, time_s, policy)
-
-
-def calc_cost_total(
-    num_gpus: dict[GPUType, int],
-    time_s: float,
-    policy: Policy,
-) -> float:
-    """
-    Calculate total cost based on GPU hours used.
-    It includes the idle GPUs not assigned to a model.
-    """
-    cost = 0.0
-    for gpu_type, num in num_gpus.items():
-        cost += num * (time_s / SECONDS_IN_HOUR) * policy.gpu_cost[gpu_type]
-    return cost
-
-
-def calc_used_gpus_per_type(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> dict[GPUType, int]:
-    """
-    Calculate number of GPUs used per GPU type across all models.
-    """
-    gpus_used = {}
-    for gpu_type, model_gpu in models.items():
-        gpus_used[gpu_type] = 0
-        for model_allocations in model_gpu.values():
-            for model_allocation in model_allocations:
-                gpus_used[gpu_type] += model_allocation.get_num_gpus()
-    return gpus_used
-
-
-def calc_used_gpus(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> int:
-    """
-    Calculate total number of GPUs used across all models and GPU types.
-    """
-    gpus_used = calc_used_gpus_per_type(models)
-    return sum(gpus_used.values())
-
-
-def calc_total_time(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> float:
-    """
-    Calculate total time considering all stages and dependencies.
-    This assumes that evaluate_time() has been called already.
-    """
-    total_time_secs = 0.0
-    for model_name in Model:
-        model_alloc_times = [
-            model_alloc.time
-            for gpu_type in GPUType
-            if gpu_type in models and model_name in models[gpu_type]
-            for model_alloc in models[gpu_type][model_name]
-        ]
-        model_time = max(model_alloc_times) if model_alloc_times else 0.0
-        total_time_secs += model_time
-    return total_time_secs
-
-
-def calc_ttff(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> float:
-    """
-    Calculate time to first frame (chunk).
-    It takes the time to first frame (TTFF) for each model.
-    This assumes that evaluate_time() has been called already.
-    """
-    models_time_first: dict[Model, float] = {}
-    for model_name in Model:
-        times_first = []
-        for gpu_type in models.keys():
-            if model_name in models[gpu_type]:
-                for model_alloc in models[gpu_type][model_name]:
-                    if model_alloc.get_num_gpus() > 0:
-                        times_first.append(model_alloc.time_first)
-        if len(times_first) > 0:
-            models_time_first[model_name] = min(times_first)  # The fastest model determines TTFF
-    return sum(models_time_first.values())
diff --git a/simulator/greedy.py b/simulator/greedy.py
deleted file mode 100644
index 459742e5..00000000
--- a/simulator/greedy.py
+++ /dev/null
@@ -1,573 +0,0 @@
-"""
-Greedy algorithm for the StreamWise workflow allocation problem.
-"""
-
-from __future__ import annotations
-
-import logging
-
-from tabulate import tabulate
-
-from typing import Optional
-
-from operator import itemgetter
-
-from constants import NUM_GPUS_PER_SERVER
-from constants import SECONDS_IN_MINUTE
-from constants import SECONDS_IN_HOUR
-
-from sim_types import Result
-from sim_types import GPUType
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import Policy
-from sim_types import Solver
-
-from utils import simplify_model_allocations
-
-from evaluator import calc_used_gpus
-from evaluator import evaluate_model_allocation
-
-from model_allocator import ModelAllocator
-
-from policies import STREAMWISE_POLICY
-from policies import MAX_ITERATIONS
-from policies import USE_ALL_GPUS
-
-from actions import gen_actions
-from actions import choose_action
-from actions import apply_action
-
-
-class GreedyAllocator(ModelAllocator):
-    """
-    Greedy allocator that iteratively applies the best action.
-    """
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = STREAMWISE_POLICY,
-    ) -> None:
-        super().__init__(
-            workflow,
-            latency_data,
-            power_data,
-            policy,
-        )
-        assert self.policy.solver in {Solver.GREEDY, Solver.HEXGEN}
-
-    def allocate(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-        # Greedy policy parameters
-        allow_removal: bool = False,
-        allow_merging: bool = False,
-        look_ahead_replicas: int = 3,
-    ) -> Result:
-        total_gpus = sum(num_gpus.values())
-        assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})"
-
-        gpu_types = [
-            gpu_type
-            for gpu_type, count in num_gpus.items()
-            if count > 0
-        ]
-        assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})"
-        gpu_type1 = gpu_types[0]
-
-        if len(gpu_types) == 1 and num_gpus[gpu_type1] == 8:
-            # 8 x GPUs
-            return self._pick_from_single_server(
-                gpu_type=gpu_type1,
-                verbose=verbose,
-            )
-
-        if len(gpu_types) == 1:
-            # More than 8 x GPUs
-            return self._pick_from_single_device_mapping(
-                num_gpus.get(gpu_type1, 0),
-                gpu_type=gpu_type1,
-                verbose=verbose,
-                allow_removal=allow_removal,
-                allow_merging=allow_merging,
-                look_ahead_replicas=look_ahead_replicas,
-            )
-
-        # Mixed setup of GPU types (e.g., A100 and H100)
-        return self._pick_from_both_devices_mapping(
-            num_gpus,
-            verbose=verbose,
-            allow_removal=allow_removal,
-            allow_merging=allow_merging,
-            look_ahead_replicas=look_ahead_replicas,
-        )
-
-    def _pick_from_both_devices_mapping(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-        allow_removal: bool = False,
-        allow_merging: bool = False,
-        look_ahead_replicas: int = 3,
-    ) -> Result:
-        """
-        Calculate based on two GPU types.
-        """
-        gpu_types = list(num_gpus.keys())
-        assert len(gpu_types) == 2
-        assert len(num_gpus) == 2
-        gpu_type1 = gpu_types[0]
-        gpu_type2 = gpu_types[1]
-        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
-        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]
-
-        # Initialize allocations with minimal setup
-        models = self._init_both_devices_models(gpu_type1, gpu_type2)
-
-        remaining_gpus = {}
-        for gpu_type in num_gpus.keys():
-            remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
-
-        # Optimization loop
-        if verbose:
-            evaluate_model_allocation(
-                models=models,
-                num_gpus=num_gpus,
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-                round_up_cost_to_server=True,
-            )
-            self._print_iteration(0, models, num_gpus)
-
-        it = 1
-        prev_metric = None
-        switch_objective = False
-        while sum(remaining_gpus.values()) > 0:
-            # Calculate current iteration times
-            evaluate_model_allocation(
-                models=models,
-                num_gpus=num_gpus,
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-                round_up_cost_to_server=False,
-            )
-
-            # Calculate potential actions for each optimization option
-            actions = gen_actions(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                num_gpus=num_gpus,
-                models=models,
-                policy=self.policy,
-                allow_removal=allow_removal,
-                allow_merging=allow_merging,
-                look_ahead_replicas=look_ahead_replicas,
-            )
-
-            if not actions:
-                logging.debug(f"No more actions possible after {it} iterations for {self.policy}.")
-                break
-
-            best_action = choose_action(actions, self.policy.objective, switch_objective=switch_objective)
-
-            if not best_action:
-                logging.debug("No actions selected.")
-                break
-
-            new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective)
-
-            if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
-                msg = f"No improvement after {it} iterations for {self.policy}."
-                msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
-                if verbose:
-                    print(msg)
-                logging.debug(msg)
-                if not USE_ALL_GPUS:
-                    logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.")
-                    break
-                switch_objective = True
-
-            prev_metric = new_metric
-
-            models = apply_action(best_action, models=models)
-
-            models = simplify_model_allocations(models)
-
-            remaining_gpus.clear()
-            for gpu_type in num_gpus.keys():
-                remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
-
-            if verbose:
-                self._print_iteration(it, models, num_gpus)
-                print(f"{len(actions)} actions:")
-                for action in actions:
-                    if action == best_action:
-                        print(f"* {action} (best)")
-                    else:
-                        print(f"  {action}")
-                print(f"Metric: {new_metric:.2f}")
-                print("Remaining devices:")
-                for gpu_type in remaining_gpus.keys():
-                    print(f"  {remaining_gpus[gpu_type]} x {gpu_type.value}")
-
-            it += 1
-            if it > MAX_ITERATIONS:
-                logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). Stopping optimization loop.")
-                break
-
-        # Adjust for no disaggregation
-        if not self.policy.is_disaggregated(Model.HF):
-            for models_gpu in models.values():
-                for instance_id in range(len(models_gpu[Model.HF_VAE])):
-                    assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \
-                        "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
-        if not self.policy.is_disaggregated(Model.FT):
-            for models_gpu in models.values():
-                for instance_id in range(len(models_gpu[Model.FT_VAE])):
-                    assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \
-                        "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
-
-        # Final calculations
-        result = evaluate_model_allocation(
-            models=models,
-            num_gpus=num_gpus,
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            round_up_cost_to_server=True,
-        )
-
-        if verbose:
-            self._print_final_allocation(
-                models=models,
-                used_devices=result.gpus_used,
-                total_devices={
-                    gpu_type1: num_gpus.get(gpu_type1, 0),
-                    gpu_type2: num_gpus.get(gpu_type2, 0),
-                },
-                power_data=self.power_data,
-                total_time_s=result.total_time_s,
-                ttff_s=result.ttff_s,
-                first_chunk_time=result.first_chunk_time,
-                tbf_s=result.tbf_s,
-                total_energy=result.total_energy if self.power_data else 0.0,
-                cost=result.cost,
-            )
-
-        assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \
-            f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}"
-        assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \
-            f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}"
-
-        return Result(
-            total_time_s=result.total_time_s,
-            models=models,
-            gpus_used=result.gpus_used,
-            ttff_s=result.ttff_s,
-            tbf_s=result.tbf_s,
-            total_energy=result.total_energy if self.power_data else 0.0,
-            cost=result.cost,
-        )
-
-    def _pick_from_single_server(
-        self,
-        gpu_type: GPUType,
-        verbose: bool = False,
-    ) -> Result:
-        """
-        The minimal setup with a servers with a single server (8 GPUs or 4 for GB200).
-        No parallelism across scenes/subscenes.
-        """
-
-        # Number of devices
-        num_gpus = NUM_GPUS_PER_SERVER[gpu_type]
-        models = self._init_single_server_models(gpu_type)
-
-        result = evaluate_model_allocation(
-            models=models,
-            num_gpus={gpu_type: num_gpus},
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            round_up_cost_to_server=True,
-        )
-
-        if verbose:
-            model_device = models[gpu_type]
-            print_data = [
-                [Model.GEMMA.value, round(model_device[Model.GEMMA][0].time, 2)],
-                [Model.FLUX.value, round(model_device[Model.FLUX][0].time, 2)],
-                [Model.HF.value, round(model_device[Model.HF][0].time, 2)],
-                [Model.HF_VAE.value, round(model_device[Model.HF_VAE][0].time, 2)],
-                [Model.FT.value, round(model_device[Model.FT][0].time, 2)],
-                [Model.FT_VAE.value, round(model_device[Model.FT_VAE][0].time, 2)],
-            ]
-            if self.policy.use_upscaler:
-                print_data.append([Model.UPSCALER.value, round(model_device[Model.UPSCALER][0].time, 2)])
-            print(f"Total time: {result.total_time_s:.2f} seconds")
-            print(tabulate(
-                print_data,
-                headers=["Model", "Time (seconds)"],
-                tablefmt="pretty",
-                colalign=["left", "right"]
-            ))
-            self._print_final_allocation(
-                models=models,
-                used_devices={gpu_type: num_gpus},
-                total_devices={gpu_type: num_gpus},
-                power_data=self.power_data,
-                total_time_s=result.total_time_s,
-                ttff_s=result.ttff_s,
-                first_chunk_time=result.first_chunk_time,
-                tbf_s=result.tbf_s,
-                total_energy=result.total_energy if self.power_data else 0.0,
-                cost=result.cost,
-            )
-
-        return Result(
-            total_time_s=result.total_time_s,
-            models=models,
-            gpus_used={gpu_type: num_gpus},
-            ttff_s=result.ttff_s,
-            tbf_s=result.tbf_s,
-            total_energy=result.total_energy if self.power_data else 0.0,
-            cost=result.cost,
-        )
-
-    def _pick_from_single_device_mapping(
-        self,
-        num_gpus: int,
-        gpu_type: GPUType,
-        verbose: bool = False,
-        allow_removal: bool = False,
-        allow_merging: bool = False,
-        look_ahead_replicas: int = 3,
-    ) -> Result:
-        """
-        Calculate time and energy based on a single GPU type.
-        """
-        assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type]
-        latency_gpu_data = self.latency_data[gpu_type]
-        assert gpu_type == latency_gpu_data.gpu_type
-
-        if self.power_data is not None:
-            power_gpu_data = self.power_data[gpu_type]
-            assert gpu_type == power_gpu_data.gpu_type
-
-        # Initialize allocations
-        models = self._init_single_device_models(gpu_type)
-
-        remaining_gpus = num_gpus - calc_used_gpus(models)
-
-        assert 0 <= remaining_gpus <= num_gpus
-
-        # Optimization loop
-        it = 0
-        prev_metric = None
-        switch_objective = False
-        while remaining_gpus > 0:
-            # Calculate current iteration times
-            evaluate_model_allocation(
-                models=models,
-                num_gpus={gpu_type: num_gpus},
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-                round_up_cost_to_server=False,
-            )
-
-            # Calculate potential actions for each optimization option
-            actions = gen_actions(
-                num_gpus={gpu_type: num_gpus},
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                workflow=self.workflow,
-                models=models,
-                policy=self.policy,
-                allow_removal=allow_removal,
-                allow_merging=allow_merging,
-                look_ahead_replicas=look_ahead_replicas,
-            )
-
-            if not actions:
-                logging.debug(f"No more actions possible after {it} iterations for {self.policy}")
-                break
-
-            best_action = choose_action(
-                actions,
-                self.policy.objective,
-                switch_objective=switch_objective)
-
-            if not best_action:
-                logging.debug("No action selected.")
-                break
-
-            new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective)
-            if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
-                msg = f"No improvement from actions after {it} iterations for {self.policy}."
-                msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
-                if verbose:
-                    print(msg)
-                logging.debug(msg)
-                if not USE_ALL_GPUS:
-                    logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.")
-                    break
-                switch_objective = True
-
-            models = apply_action(best_action, models)
-
-            models = simplify_model_allocations(models)
-
-            remaining_gpus = num_gpus - calc_used_gpus(models)
-            prev_metric = new_metric
-
-            if verbose:
-                self._print_iteration(it, models, {gpu_type: num_gpus})
-                print(f"Metric: {new_metric:.2f}")
-                print(f"{len(actions)} actions:")
-                for action in actions:
-                    if action == best_action:
-                        print(f"  * {action} (best)")
-                    else:
-                        print(f"    {action}")
-                print(f"Applied: {best_action}")
-                print(f"Remaining devices: {remaining_gpus}x{gpu_type}")
-
-            it += 1
-            if it > MAX_ITERATIONS:
-                logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). Stopping optimization loop.")
-                break
-
-        result = evaluate_model_allocation(
-            models=models,
-            num_gpus={gpu_type: num_gpus},
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            round_up_cost_to_server=True,
-        )
-
-        if verbose:
-            self._print_final_allocation(
-                models=models,
-                used_devices=result.gpus_used,
-                total_devices={gpu_type: num_gpus},
-                power_data=self.power_data,
-                total_time_s=result.total_time_s,
-                ttff_s=result.ttff_s,
-                first_chunk_time=result.first_chunk_time,
-                tbf_s=result.tbf_s,
-                total_energy=result.total_energy if self.power_data else 0.0,
-                cost=result.cost,
-            )
-
-        if not self.policy.is_disaggregated(Model.HF):
-            if models[gpu_type][Model.HF_VAE]:
-                assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \
-                    "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
-        if not self.policy.is_disaggregated(Model.FT):
-            if models[gpu_type][Model.FT_VAE]:
-                assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \
-                    "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
-        num_gpus_used = result.gpus_used[gpu_type]
-        assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}"
-
-        return Result(
-            total_time_s=result.total_time_s,
-            models=models,
-            gpus_used={gpu_type: num_gpus_used},
-            gpus_total={gpu_type: num_gpus},
-            ttff_s=result.ttff_s,
-            tbf_s=result.tbf_s,
-            total_energy=result.total_energy if self.power_data else 0.0,
-            cost=result.cost,
-        )
-
-    def _print_iteration(
-        self,
-        it: int,
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-        num_gpus: dict[GPUType, int],
-    ) -> None:
-        print(f"--- Iteration {it} ---")
-
-        for gpu_type in models.keys():
-            total_gpus = calc_used_gpus({gpu_type: models[gpu_type]})
-            print(f"Current {gpu_type.value} allocation: {total_gpus}/{num_gpus[gpu_type]} GPUs")
-            for model in Model:
-                for model_instance in models[gpu_type][model]:
-                    if model_instance.get_num_gpus() > 0:
-                        print(f"  {model.value:10s}:\t{model_instance}")
-
-        # Find the bottleneck stage
-        stage_times: dict[Model, float] = {}
-        ttff_times: dict[Model, float] = {}
-        for model_name in Model:
-            times = []
-            times_first = []
-            for gpu_type in models.keys():
-                for model_alloc in models[gpu_type][model_name]:
-                    times.append(model_alloc.time)
-                    times_first.append(model_alloc.time_first)
-            stage_times[model_name] = max(times) if times else 0.0
-            ttff_times[model_name] = max(times_first) if times_first else 0.0
-
-        bottleneck_stage, bottleneck_time = max(
-            stage_times.items(),
-            key=itemgetter(1)
-        )
-        bottleneck_ttff_stage, bottleneck_ttff_time = max(
-            ttff_times.items(),
-            key=itemgetter(1)
-        )
-        print(f"Bottleneck: {bottleneck_stage} ({bottleneck_time:.2f}s)")
-        print(f"Bottleneck TTFF: {bottleneck_ttff_stage} ({bottleneck_ttff_time:.2f}s)")
-        # bottleneck stage is not necessarily the stage with the
-        # highest potential gain from scaling up/out
-
-    def _print_final_allocation(
-        self,
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-        used_devices: dict[GPUType, int],
-        total_devices: dict[GPUType, int],
-        power_data: Optional[PowerData],
-        total_time_s: float,
-        ttff_s: float,
-        first_chunk_time: float,
-        tbf_s: float,
-        total_energy: float,
-        cost: float,
-    ) -> None:
-        print("=== FINAL ALLOCATION ===")
-        print("Total devices used/available:")
-        for gpu_type, total_device in total_devices.items():
-            used_device = used_devices[gpu_type]
-            print(f"  {gpu_type.value}: {used_device}/{total_device}")
-        print("Model allocations:")
-        for gpu_type in models.keys():
-            print(f"  {gpu_type.value} ({used_devices[gpu_type]} used):")
-            for model in Model:
-                for model_alloc in models[gpu_type][model]:
-                    print(f"    {model.value:10s}:\t{model_alloc}")
-        print(f"Total time: {total_time_s:.2f} seconds ({total_time_s / SECONDS_IN_MINUTE:.2f} minutes)")
-        print(f"TTFF: {ttff_s:.2f} seconds")
-        print(f"First chunk time: {first_chunk_time:.2f} seconds")
-        print(f"TBF: {tbf_s:.2f} seconds")
-        print(f"Total cost: ${cost:.2f}")
-        if power_data is not None:
-            print(f"Total energy: {total_energy:.2f} Ws ({total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh)")
diff --git a/simulator/helix.py b/simulator/helix.py
deleted file mode 100644
index 5891538f..00000000
--- a/simulator/helix.py
+++ /dev/null
@@ -1,403 +0,0 @@
-"""
-Helix algorithm for the StreamWise workflow allocation problem.
-
-Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
-
-Helix optimizes models one-by-one following MODEL_ORDER, using MILP
-for each model's resource allocation.  After each model reaches convergence
-(solver optimality or per-model time limit), its allocation is fixed and the
-remaining GPU budget is passed to the next model.
-
-Design rationale:
-    HelixAllocator does NOT inherit from MILPAllocator because the parent's
-    allocate() builds a single joint MILP for all models simultaneously.
-    Instead, HelixAllocator extends ModelAllocator and *composes*
-    MILPAllocator instances — one per model in the workflow.
-
-    For each model, a per-model WorkflowConfig is created where only the
-    target model has non-zero work (all others set to 0).  The existing MILP
-    constraints (is_active <= work, gpus <= num_gpus * is_active) naturally
-    force 0 GPU allocation for those 0-work models, so no changes to
-    milp.py are required.
-"""
-
-from __future__ import annotations
-
-import logging
-
-from dataclasses import replace
-from typing import Optional
-
-from sim_types import Result
-from sim_types import GPUType
-from sim_types import WorkflowConfig
-from sim_types import PowerData
-from sim_types import LatencyData
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import Policy
-from sim_types import Solver
-from sim_types import MODEL_ORDER
-
-from model_allocator import ModelAllocator
-
-from evaluator import evaluate_model_allocation
-
-from milp import MILPAllocator
-
-from policies import HELIX_POLICY
-from policies import MAX_DEVICES
-
-from constants import DEVICE_OPTIONS
-
-
-# Default per-model MILP solver time limit in seconds.
-# Each model gets this long to converge before the solver moves on.
-DEFAULT_PER_MODEL_TIME_LIMIT = 30
-
-
-def _compute_per_model_gpu_budget(
-    model_order: list[Model],
-    num_gpus: dict[GPUType, int],
-    workflow: WorkflowConfig,
-) -> dict[Model, dict[GPUType, int]]:
-    """Compute a per-model GPU budget so every model gets a fair share.
-
-    Budget is proportional to each model's ``MAX_DEVICES`` weight (capped
-    by the model's actual maximum useful device count from ``DEVICE_OPTIONS``).
-    Models not in ``MAX_DEVICES`` (e.g. OTHERS, UPSCALER) receive a minimum
-    allocation of ``min(DEVICE_OPTIONS)`` GPUs.
-
-    The allocations are floored per model, and any remainder is distributed
-    round-robin starting from the first model.
-
-    Returns:
-        Mapping ``model -> {gpu_type -> max_gpus}`` that the model may use.
-    """
-    # Effective weight per model (max useful devices)
-    weights: dict[Model, int] = {}
-    for m in model_order:
-        if workflow.model_work.get(m, 0) == 0:
-            continue
-        if m in MAX_DEVICES:
-            weights[m] = MAX_DEVICES[m]
-        else:
-            # Models not in MAX_DEVICES (OTHERS, UPSCALER) get min allocation
-            weights[m] = min(DEVICE_OPTIONS.get(m, [1]))
-
-    total_weight = sum(weights.values())
-    if total_weight == 0:
-        # Fallback: equal split
-        total_weight = len(weights) or 1
-        weights = {m: 1 for m in weights}
-
-    budget: dict[Model, dict[GPUType, int]] = {}
-    for gpu_type, total in num_gpus.items():
-        # Floor allocation per model
-        allocated = 0
-        per_model: dict[Model, int] = {}
-        for m in model_order:
-            if m not in weights:
-                continue
-            share = int(total * weights[m] / total_weight)
-            # Ensure at least 1 GPU per model (if GPUs available)
-            share = max(share, 1) if total - allocated >= 1 else 0
-            per_model[m] = share
-            allocated += share
-
-        # Distribute remainder round-robin
-        remainder = total - allocated
-        idx = 0
-        models_list = [m for m in model_order if m in per_model]
-        while remainder > 0 and models_list:
-            m = models_list[idx % len(models_list)]
-            per_model[m] += 1
-            remainder -= 1
-            idx += 1
-
-        for m in model_order:
-            if m not in per_model:
-                continue
-            if m not in budget:
-                budget[m] = {}
-            budget[m][gpu_type] = per_model[m]
-
-    return budget
-
-
-class HelixAllocator(ModelAllocator):
-    """
-    Helix-style allocator that optimizes models one at a time
-    using MILP, sequentially following MODEL_ORDER.
-
-    Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
-
-    Key approach:
-    1. For each model in MODEL_ORDER, create a per-model MILP sub-problem
-       where only the target model has non-zero work.
-    2. Solve the MILP with the remaining GPU budget and a per-model time limit.
-    3. Fix the allocation for that model and subtract used GPUs.
-    4. Move to the next model with the remaining GPU budget.
-    5. Combine all per-model allocations into the final result.
-
-    The HelixAllocator uses composition (not inheritance) with MILPAllocator,
-    creating a separate MILPAllocator instance for each model's sub-problem.
-    This avoids modifying the joint MILP formulation and allows per-model
-    solver configurations.
-    """
-
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = HELIX_POLICY,
-    ) -> None:
-        super().__init__(
-            workflow,
-            latency_data,
-            power_data,
-            policy,
-        )
-        assert self.policy.solver == Solver.HELIX
-
-    def allocate(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-        per_model_time_limit: int = DEFAULT_PER_MODEL_TIME_LIMIT,
-        milp_solver: Solver = Solver.HIGHS,
-    ) -> Result:
-        """
-        Allocate resources model-by-model following MODEL_ORDER.
-
-        For each model, a MILPAllocator is created with a workflow where
-        only the target model has non-zero work.  The MILP solver optimizes
-        the allocation for that model within the remaining GPU budget.
-
-        Args:
-            num_gpus: Available GPUs per type.
-            verbose: If True, print per-model allocation details.
-            per_model_time_limit: Time limit (seconds) for each per-model MILP solve.
-            milp_solver: MILP solver backend to use (GUROBI or HIGHS).
-
-        Returns:
-            Combined Result across all models.
-        """
-        assert milp_solver in (Solver.GUROBI, Solver.HIGHS), \
-            f"milp_solver must be GUROBI or HIGHS, got {milp_solver}"
-
-        model_order = self.workflow.get_model_order()
-        if not self.policy.use_upscaler and Model.UPSCALER in model_order:
-            # Remove UPSCALER from model_order if not using upscaler to avoid unnecessary MILP solve
-            model_order.remove(Model.UPSCALER)
-        remaining_gpus = dict(num_gpus)
-
-        # ---- GPU budget partitioning ----
-        # Pre-compute a per-model GPU budget proportional to MAX_DEVICES
-        # so that early models cannot starve later ones.  Unused GPUs from
-        # one model roll over to subsequent models.
-        gpu_budget = _compute_per_model_gpu_budget(
-            model_order, num_gpus, self.workflow,
-        )
-
-        if verbose:
-            logging.info("Helix GPU budget per model:")
-            for m in model_order:
-                if m in gpu_budget:
-                    logging.info(f"  {m.value}: {gpu_budget[m]}")
-
-        # Accumulated per-model allocations and metrics
-        all_model_allocations: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}
-        total_makespan = 0.0
-        total_ttff = 0.0
-        total_cost = 0.0
-        total_energy = 0.0
-        total_gpus_used: dict[GPUType, int] = {gt: 0 for gt in num_gpus}
-
-        for model in model_order:
-            work = self.workflow.model_work.get(model, 0)
-            if work == 0:
-                continue
-
-            # Skip VAE models when disaggregation is disabled for the parent.
-            # Their latency is folded into the parent model's time calculation.
-            if model == Model.HF_VAE and not self.policy.is_disaggregated(Model.HF):
-                continue
-            if model == Model.FT_VAE and not self.policy.is_disaggregated(Model.FT):
-                continue
-
-            # Check if any GPUs remain
-            if all(v <= 0 for v in remaining_gpus.values()):
-                logging.warning(
-                    f"Helix: No GPUs remaining for {model.value}. Skipping.")
-                continue
-
-            # Filter out GPU types with 0 remaining.
-            # Cap per-model GPUs to the budget so later models are not starved.
-            model_budget = gpu_budget.get(model, {})
-            active_gpus = {
-                gt: min(count, model_budget.get(gt, count))
-                for gt, count in remaining_gpus.items()
-                if count > 0 and (gt not in model_budget or model_budget[gt] > 0)
-            }
-
-            if verbose:
-                logging.info(
-                    f"--- Helix: Optimizing {model.value} "
-                    f"(work={work}) with remaining GPUs: {active_gpus} ---"
-                )
-
-            # ---- build per-model workflow ----
-            # Only the target model has work; other models are excluded from
-            # model_work so the MILP only builds variables/constraints for it.
-            per_model_work = {model: self.workflow.model_work[model]}
-            per_model_workflow = replace(
-                self.workflow,
-                model_work=per_model_work,
-            )
-
-            # ---- build MILP-compatible policy ----
-            # The inner MILPAllocator requires solver ∈ {GUROBI, HIGHS}.
-            # Force disaggregation / use_upscaler flags so that the inner
-            # MILP's ``model_names`` list includes VAE / UPSCALER when those
-            # are the target model.  Without this, the MILP would construct
-            # an empty model set and produce a trivial (infeasible) problem.
-            disag = {}  # dict(self.policy.disaggregation)
-            if model == Model.HF_VAE and self.policy.is_disaggregated(Model.HF):
-                disag[Model.HF] = True
-            if model == Model.FT_VAE and self.policy.is_disaggregated(Model.FT):
-                disag[Model.FT] = True
-            milp_policy = Policy(
-                name=self.policy.name,
-                gpu_cost=self.policy.gpu_cost,
-                objective=self.policy.objective,
-                # disaggregation=self.policy.disaggregation or model == Model.HF_VAE,
-                disaggregation=disag,
-                use_upscaler=self.policy.use_upscaler or model == Model.UPSCALER,
-                hardware=self.policy.hardware,
-                solver=milp_solver,
-            )
-
-            # ---- solve per-model MILP ----
-            milp_allocator = MILPAllocator(
-                workflow=per_model_workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=milp_policy,
-            )
-
-            result = milp_allocator.allocate(
-                num_gpus=active_gpus,
-                verbose=verbose,
-                time_limit=per_model_time_limit,
-                # Use running_cost=True for linear cost formulation (HiGHS-compatible)
-                running_cost=(milp_solver == Solver.HIGHS),
-                # Skip server constraint: per-model allocations don't need
-                # to be multiples of NUM_GPUS_PER_SERVER.
-                skip_server_constraint=True,
-            )
-
-            if result.total_time_s == 0.0 and not result.models:
-                logging.warning(
-                    f"Helix: MILP failed for {model.value}. Skipping.")
-                continue
-
-            # ---- record allocations & snap devices to DEVICE_OPTIONS ----
-            # The MILP constrains devices to DEVICE_OPTIONS, but floating-point
-            # precision in the solver can occasionally produce off-by-one values
-            # (e.g. 31 instead of 32).  Snap each replica to the nearest valid
-            # option, adjusting the GPU accounting so we don't exceed the total
-            # budget passed to evaluate_model_allocation at the end.
-            for gpu_type, model_dict in result.models.items():
-                if gpu_type not in all_model_allocations:
-                    all_model_allocations[gpu_type] = {}
-                for m_name, allocs in model_dict.items():
-                    for alloc in allocs:
-                        valid_devices = DEVICE_OPTIONS.get(m_name, [1])
-                        if alloc.devices not in valid_devices:
-                            nearest = min(valid_devices, key=lambda d: abs(d - alloc.devices))
-                            diff = nearest - alloc.devices  # positive = round up
-                            gpu_avail = remaining_gpus.get(gpu_type, 0) - result.gpus_used.get(gpu_type, 0)
-                            if diff > 0 and gpu_avail < diff:
-                                # Not enough spare GPUs to round up; round down instead
-                                nearest = max(
-                                    (d for d in valid_devices if d <= alloc.devices),
-                                    default=valid_devices[0],
-                                )
-                                diff = nearest - alloc.devices
-                            logging.info(
-                                f"Helix: snapping {m_name.value} from "
-                                f"{alloc.devices} to {nearest} devices "
-                                f"(solver precision fix, diff={diff:+d})")
-                            # Adjust GPU accounting for this model's result
-                            result.gpus_used[gpu_type] = result.gpus_used.get(gpu_type, 0) + diff
-                            alloc.devices = nearest
-                    all_model_allocations[gpu_type][m_name] = allocs
-
-            # ---- accumulate metrics ----
-            total_makespan += result.total_time_s
-            total_ttff += result.ttff_s
-            total_cost += result.cost
-            total_energy += result.total_energy
-            if verbose:
-                print(f'Model {model.value} - Time: {result.total_time_s:.2f}s,'
-                      f'TTFF: {result.ttff_s:.2f}s, Cost: ${result.cost:.2f}')
-                print(f'Total cost so far: ${total_cost:.2f}, Total time so far: {total_makespan:.2f}s,'
-                      f'Total TTFF so far: {total_ttff:.2f}s')
-                print(f'GPUs allocated for {model.value}: {result.gpus_used}')
-
-            # ---- subtract used GPUs ----
-            for gpu_type, used in result.gpus_used.items():
-                remaining_gpus[gpu_type] = remaining_gpus.get(gpu_type, 0) - used
-                total_gpus_used[gpu_type] = total_gpus_used.get(gpu_type, 0) + used
-
-            # ---- roll over unused budget to next models ----
-            # If this model used fewer GPUs than its budget, the surplus
-            # is distributed evenly among the remaining models.
-            remaining_models = [
-                m for m in model_order
-                if m in gpu_budget and MODEL_ORDER.get(m, 0) > MODEL_ORDER.get(model, 0)
-            ]
-            if remaining_models:
-                for gpu_type in num_gpus:
-                    budget_for_model = model_budget.get(gpu_type, 0)
-                    used_by_model = result.gpus_used.get(gpu_type, 0)
-                    surplus = budget_for_model - used_by_model
-                    if surplus > 0:
-                        per_model_extra = surplus // len(remaining_models)
-                        leftover = surplus % len(remaining_models)
-                        for i, rm in enumerate(remaining_models):
-                            extra = per_model_extra + (1 if i < leftover else 0)
-                            gpu_budget[rm][gpu_type] = gpu_budget[rm].get(gpu_type, 0) + extra
-
-            if verbose:
-                print(
-                    f"Helix: {model.value} allocated.  "
-                    f"Time: {result.total_time_s:.2f}s, "
-                    f"TTFF: {result.ttff_s:.2f}s, "
-                    f"GPUs used: {result.gpus_used}, "
-                    f"Remaining: {remaining_gpus}"
-                )
-
-        result = evaluate_model_allocation(
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            models=all_model_allocations,
-            num_gpus=num_gpus,
-        )
-
-        if verbose:
-            print(
-                f"=== Helix final: "
-                f"Makespan={result.total_time_s:.2f}s, "
-                f"TTFF={result.ttff_s:.2f}s, "
-                f"TBF={result.tbf_s:.4f}s, "
-                f"Cost=${result.cost:.2f}, "
-                f"Energy={result.total_energy:.2f}Ws, "
-                f"GPUs used={result.gpus_used} ==="
-            )
-
-        return result
diff --git a/simulator/hexgen.py b/simulator/hexgen.py
deleted file mode 100644
index 64c64160..00000000
--- a/simulator/hexgen.py
+++ /dev/null
@@ -1,629 +0,0 @@
-"""
-HexGen algorithm for the StreamWise workflow allocation problem.
-
-Reference: https://arxiv.org/abs/2311.11514
-
-HexGen treats each model in the workflow as an independent component for optimization.
-It tracks metrics per model and optimizes models sequentially according to MODEL_ORDER.
-When a model's metric converges (stops dropping), it moves to the next model.
-After the last model converges, it cycles back to the first model and allocates
-remaining GPUs until exhausted.
-"""
-
-from __future__ import annotations
-import logging
-from typing import Optional
-
-from sim_types import Result
-from sim_types import GPUType
-from sim_types import WorkflowConfig
-from sim_types import PowerData
-from sim_types import LatencyData
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import Policy
-from sim_types import Solver
-from sim_types import MODEL_ORDER
-
-from utils import simplify_model_allocations
-
-from evaluator import calc_used_gpus
-from evaluator import evaluate_model_allocation
-
-from greedy import GreedyAllocator
-
-from actions import gen_actions
-from actions import choose_action
-from actions import apply_action
-
-from policies import HEXGEN_POLICY
-from policies import MAX_ITERATIONS
-from policies import USE_ALL_GPUS
-
-
-def _get_model_order(workflow: WorkflowConfig) -> list[Model]:
-    """Get ordered list of models in the workflow, sorted by MODEL_ORDER."""
-    return sorted(
-        [m for m in workflow.models if m in MODEL_ORDER],
-        key=lambda m: MODEL_ORDER[m],
-    )
-
-
-class HexGenAllocator(GreedyAllocator):
-    """
-    HexGen-style allocator that optimizes models one at a time,
-    sequentially following MODEL_ORDER.
-
-    Reference: https://arxiv.org/abs/2311.11514
-
-    Key differences from GreedyAllocator:
-    1. Each model is treated as an independent optimization target.
-    2. Per-model metrics are tracked separately.
-    3. Models are optimized in MODEL_ORDER sequence. When a model's metric
-       converges, it moves to the next model. After the last model converges,
-       it cycles back to the first and allocates remaining GPUs.
-    """
-
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = HEXGEN_POLICY,
-    ) -> None:
-        super().__init__(
-            workflow,
-            latency_data,
-            power_data,
-            policy,
-        )
-        assert self.policy.solver == Solver.HEXGEN
-
-    def _pick_from_single_device_mapping(
-        self,
-        num_gpus: int,
-        gpu_type: GPUType,
-        verbose: bool = False,
-        allow_removal: bool = False,
-        allow_merging: bool = False,
-        look_ahead_replicas: int = 3,
-    ) -> Result:
-        """
-        HexGen-style allocation for a single GPU type (>8 GPUs).
-        Optimizes models one at a time following MODEL_ORDER.
-        """
-        from constants import NUM_GPUS_PER_SERVER
-
-        assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type]
-
-        # Initialize allocations (same as GreedyAllocator)
-        models = self._init_single_device_models(gpu_type)
-
-        remaining_gpus = num_gpus - calc_used_gpus(models)
-        assert 0 <= remaining_gpus <= num_gpus
-
-        # --- HexGen per-model sequential optimization ---
-        model_order = _get_model_order(self.workflow)
-        per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order}
-
-        it = 0
-        current_model_idx = 0
-        cycles_without_progress = 0  # track full cycles without any improvement
-        total_models = len(model_order)
-
-        while remaining_gpus > 0:
-            if current_model_idx >= total_models:
-                # Completed a full cycle, wrap around
-                current_model_idx = 0
-                cycles_without_progress += 1
-                if cycles_without_progress >= 1:
-                    logging.debug(
-                        f"HexGen: No progress after {cycles_without_progress} full cycles.")
-                    break
-
-            current_model = model_order[current_model_idx]
-
-            if verbose:
-                print(f"--- HexGen: Optimizing {current_model.value} "
-                      f"(model {current_model_idx + 1}/{total_models}) ---")
-
-            # Inner loop: keep optimizing current model until convergence
-            inner_it = 0
-            while remaining_gpus > 0:
-                # Evaluate current state
-                evaluate_model_allocation(
-                    models=models,
-                    num_gpus={gpu_type: num_gpus},
-                    workflow=self.workflow,
-                    latency_data=self.latency_data,
-                    power_data=self.power_data,
-                    policy=self.policy,
-                    round_up_cost_to_server=False,
-                )
-
-                # Generate actions only for the current model
-                all_actions = gen_actions(
-                    num_gpus={gpu_type: num_gpus},
-                    latency_data=self.latency_data,
-                    power_data=self.power_data,
-                    workflow=self.workflow,
-                    models=models,
-                    policy=self.policy,
-                )
-
-                # Filter to actions targeting the current model only
-                model_actions = [a for a in all_actions if a.model == current_model]
-
-                if not model_actions:
-                    logging.debug(
-                        f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.")
-                    break
-
-                best_action = choose_action(model_actions, self.policy.objective)
-
-                if not best_action:
-                    logging.debug(f"HexGen: No action selected for {current_model.value}.")
-                    break
-
-                new_metric = best_action.get_metric(self.policy.objective)
-                prev_metric = per_model_metrics[current_model]
-
-                if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
-                    msg = (
-                        f"HexGen: {current_model.value} converged after {inner_it} inner iterations. "
-                        f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
-                    )
-                    if verbose:
-                        print(msg)
-                    logging.debug(msg)
-                    break
-
-                per_model_metrics[current_model] = new_metric
-
-                models = apply_action(best_action, models=models)
-                models = simplify_model_allocations(models)
-
-                remaining_gpus = num_gpus - calc_used_gpus(models)
-
-                if verbose:
-                    self._print_iteration(it, models, {gpu_type: num_gpus})
-                    print(f"HexGen: Applied action for {current_model.value}, "
-                          f"metric: {new_metric:.2f}, remaining: {remaining_gpus}")
-
-                it += 1
-                inner_it += 1
-
-                if it > MAX_ITERATIONS:
-                    logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
-                    break
-
-            if it > MAX_ITERATIONS:
-                break
-
-            current_model_idx += 1
-
-        # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER ---
-        remaining_gpus = num_gpus - calc_used_gpus(models)
-        if USE_ALL_GPUS and remaining_gpus > 0:
-            models = self._fill_remaining_gpus_single(
-                models=models,
-                num_gpus=num_gpus,
-                gpu_type=gpu_type,
-                model_order=model_order,
-                it=it,
-                verbose=verbose,
-            )
-
-        # Final evaluation
-        result = evaluate_model_allocation(
-            models=models,
-            num_gpus={gpu_type: num_gpus},
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            round_up_cost_to_server=True,
-        )
-
-        if verbose:
-            self._print_final_allocation(
-                models=models,
-                used_devices=result.gpus_used,
-                total_devices={gpu_type: num_gpus},
-                power_data=self.power_data,
-                total_time_s=result.total_time_s,
-                ttff_s=result.ttff_s,
-                first_chunk_time=result.first_chunk_time,
-                tbf_s=result.tbf_s,
-                total_energy=result.total_energy if self.power_data else 0.0,
-                cost=result.cost,
-            )
-
-        if not self.policy.is_disaggregated(Model.HF):
-            if models[gpu_type][Model.HF_VAE]:
-                assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \
-                    "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
-        if not self.policy.is_disaggregated(Model.FT):
-            if models[gpu_type][Model.FT_VAE]:
-                assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \
-                    "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
-
-        num_gpus_used = result.gpus_used[gpu_type]
-        assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}"
-
-        return Result(
-            total_time_s=result.total_time_s,
-            models=models,
-            gpus_used={gpu_type: num_gpus_used},
-            gpus_total={gpu_type: num_gpus},
-            ttff_s=result.ttff_s,
-            tbf_s=result.tbf_s,
-            total_energy=result.total_energy if self.power_data else 0.0,
-            cost=result.cost,
-        )
-
-    def _pick_from_both_devices_mapping(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-        allow_removal: bool = False,
-        allow_merging: bool = False,
-        look_ahead_replicas: int = 3,
-    ) -> Result:
-        """
-        HexGen-style allocation for two GPU types.
-        Optimizes models one at a time following MODEL_ORDER.
-        """
-        from constants import NUM_GPUS_PER_SERVER
-
-        gpu_types = list(num_gpus.keys())
-        assert len(gpu_types) == 2
-        gpu_type1 = gpu_types[0]
-        gpu_type2 = gpu_types[1]
-        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
-        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]
-
-        # Initialize allocations (same as GreedyAllocator)
-        models = self._init_both_devices_models(gpu_type1, gpu_type2)
-
-        remaining_gpus: dict[GPUType, int] = {}
-        for gpu_type in num_gpus.keys():
-            remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
-
-        # --- HexGen per-model sequential optimization ---
-        model_order = _get_model_order(self.workflow)
-        per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order}
-
-        if verbose:
-            evaluate_model_allocation(
-                models=models,
-                num_gpus=num_gpus,
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-                round_up_cost_to_server=True,
-            )
-            self._print_iteration(0, models, num_gpus)
-
-        it = 1
-        current_model_idx = 0
-        cycles_without_progress = 0
-        total_models = len(model_order)
-
-        while sum(remaining_gpus.values()) > 0:
-            if current_model_idx >= total_models:
-                current_model_idx = 0
-                cycles_without_progress += 1
-                if cycles_without_progress >= 1:
-                    logging.debug(
-                        f"HexGen: No progress after {cycles_without_progress} full cycles.")
-                    break
-
-            current_model = model_order[current_model_idx]
-
-            if verbose:
-                print(f"--- HexGen: Optimizing {current_model.value} "
-                      f"(model {current_model_idx + 1}/{total_models}) ---")
-
-            inner_it = 0
-
-            while sum(remaining_gpus.values()) > 0:
-                evaluate_model_allocation(
-                    models=models,
-                    num_gpus=num_gpus,
-                    workflow=self.workflow,
-                    latency_data=self.latency_data,
-                    power_data=self.power_data,
-                    policy=self.policy,
-                    round_up_cost_to_server=False,
-                )
-
-                all_actions = gen_actions(
-                    workflow=self.workflow,
-                    latency_data=self.latency_data,
-                    power_data=self.power_data,
-                    num_gpus=num_gpus,
-                    models=models,
-                    policy=self.policy,
-                )
-
-                # Filter to current model
-                model_actions = [a for a in all_actions if a.model == current_model]
-
-                if not model_actions:
-                    logging.debug(
-                        f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.")
-                    break
-
-                best_action = choose_action(model_actions, self.policy.objective)
-
-                if not best_action:
-                    logging.debug(f"HexGen: No action selected for {current_model.value}.")
-                    break
-
-                new_metric = best_action.get_metric(self.policy.objective)
-                prev_metric = per_model_metrics[current_model]
-
-                if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
-                    msg = (
-                        f"HexGen: {current_model.value} converged. "
-                        f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
-                    )
-                    if verbose:
-                        print(msg)
-                    logging.debug(msg)
-                    break
-
-                per_model_metrics[current_model] = new_metric
-
-                models = apply_action(best_action, models=models)
-                models = simplify_model_allocations(models)
-
-                remaining_gpus.clear()
-                for gpu_type in num_gpus.keys():
-                    remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
-
-                if verbose:
-                    self._print_iteration(it, models, num_gpus)
-                    print(f"HexGen: Applied action for {current_model.value}, "
-                          f"metric: {new_metric:.2f}")
-                    print("Remaining devices:")
-                    for gt in remaining_gpus:
-                        print(f"  {remaining_gpus[gt]} x {gt.value}")
-
-                it += 1
-                inner_it += 1
-
-                if it > MAX_ITERATIONS:
-                    logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
-                    break
-
-            if it > MAX_ITERATIONS:
-                break
-
-            current_model_idx += 1
-
-        # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER ---
-        remaining_gpus_total = sum(
-            num_gpus[gt] - calc_used_gpus({gt: models[gt]})
-            for gt in num_gpus
-        )
-        if USE_ALL_GPUS and remaining_gpus_total > 0:
-            models = self._fill_remaining_gpus_multi(
-                models=models,
-                num_gpus=num_gpus,
-                model_order=model_order,
-                it=it,
-                verbose=verbose,
-            )
-
-        # Adjust for no disaggregation
-        if not self.policy.is_disaggregated(Model.HF):
-            for models_gpu in models.values():
-                for instance_id in range(len(models_gpu[Model.HF_VAE])):
-                    assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \
-                        "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
-        if not self.policy.is_disaggregated(Model.FT):
-            for models_gpu in models.values():
-                for instance_id in range(len(models_gpu[Model.FT_VAE])):
-                    assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \
-                        "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
-
-        # Final evaluation
-        result = evaluate_model_allocation(
-            models=models,
-            num_gpus=num_gpus,
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            round_up_cost_to_server=True,
-        )
-
-        if verbose:
-            self._print_final_allocation(
-                models=models,
-                used_devices=result.gpus_used,
-                total_devices={
-                    gpu_type1: num_gpus.get(gpu_type1, 0),
-                    gpu_type2: num_gpus.get(gpu_type2, 0),
-                },
-                power_data=self.power_data,
-                total_time_s=result.total_time_s,
-                ttff_s=result.ttff_s,
-                first_chunk_time=result.first_chunk_time,
-                tbf_s=result.tbf_s,
-                total_energy=result.total_energy if self.power_data else 0.0,
-                cost=result.cost,
-            )
-
-        assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \
-            f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}"
-        assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \
-            f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}"
-
-        return Result(
-            total_time_s=result.total_time_s,
-            models=models,
-            gpus_used=result.gpus_used,
-            ttff_s=result.ttff_s,
-            tbf_s=result.tbf_s,
-            total_energy=result.total_energy if self.power_data else 0.0,
-            cost=result.cost,
-        )
-
-    def _fill_remaining_gpus_single(
-        self,
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-        num_gpus: int,
-        gpu_type: GPUType,
-        model_order: list[Model],
-        it: int = 0,
-        verbose: bool = False,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """
-        Fill remaining GPUs by cycling through MODEL_ORDER (single GPU type).
-        Applies any available action per model, ignoring metric convergence.
-        Stops when all GPUs are used or no model can accept more.
-        """
-        remaining_gpus = num_gpus - calc_used_gpus(models)
-        total_models = len(model_order)
-        model_idx = 0
-        models_exhausted: set[Model] = set()
-
-        if verbose:
-            print(f"--- HexGen: USE_ALL_GPUS fill phase, {remaining_gpus} remaining ---")
-
-        while remaining_gpus > 0 and len(models_exhausted) < total_models:
-            current_model = model_order[model_idx % total_models]
-            model_idx += 1
-
-            if current_model in models_exhausted:
-                continue
-
-            evaluate_model_allocation(
-                models=models,
-                num_gpus={gpu_type: num_gpus},
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-                round_up_cost_to_server=False,
-            )
-
-            all_actions = gen_actions(
-                num_gpus={gpu_type: num_gpus},
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                workflow=self.workflow,
-                models=models,
-                policy=self.policy,
-            )
-            model_actions = [a for a in all_actions if a.model == current_model]
-
-            if not model_actions:
-                models_exhausted.add(current_model)
-                logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).")
-                continue
-
-            best_action = choose_action(model_actions, self.policy.objective)
-            if not best_action:
-                models_exhausted.add(current_model)
-                logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).")
-                continue
-
-            models = apply_action(best_action, models=models)
-            models = simplify_model_allocations(models)
-            remaining_gpus = num_gpus - calc_used_gpus(models)
-
-            if verbose:
-                self._print_iteration(it, models, {gpu_type: num_gpus})
-                print(f"HexGen fill: Allocated to {current_model.value}, remaining: {remaining_gpus}")
-
-            it += 1
-            if it > MAX_ITERATIONS:
-                logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
-                break
-
-        return models
-
-    def _fill_remaining_gpus_multi(
-        self,
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-        num_gpus: dict[GPUType, int],
-        model_order: list[Model],
-        it: int = 0,
-        verbose: bool = False,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """
-        Fill remaining GPUs by cycling through MODEL_ORDER (multi GPU type).
-        Applies any available action per model, ignoring metric convergence.
-        Stops when all GPUs are used or no model can accept more.
-        """
-        total_remaining = sum(
-            num_gpus[gt] - calc_used_gpus({gt: models[gt]})
-            for gt in num_gpus
-        )
-        total_models = len(model_order)
-        model_idx = 0
-        models_exhausted: set[Model] = set()
-
-        if verbose:
-            print(f"--- HexGen: USE_ALL_GPUS fill phase, {total_remaining} remaining ---")
-
-        while total_remaining > 0 and len(models_exhausted) < total_models:
-            current_model = model_order[model_idx % total_models]
-            model_idx += 1
-
-            if current_model in models_exhausted:
-                continue
-
-            evaluate_model_allocation(
-                models=models,
-                num_gpus=num_gpus,
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                policy=self.policy,
-                round_up_cost_to_server=False,
-            )
-
-            all_actions = gen_actions(
-                workflow=self.workflow,
-                latency_data=self.latency_data,
-                power_data=self.power_data,
-                num_gpus=num_gpus,
-                models=models,
-                policy=self.policy,
-            )
-            model_actions = [a for a in all_actions if a.model == current_model]
-
-            if not model_actions:
-                models_exhausted.add(current_model)
-                logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).")
-                continue
-
-            best_action = choose_action(model_actions, self.policy.objective)
-            if not best_action:
-                models_exhausted.add(current_model)
-                logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).")
-                continue
-
-            models = apply_action(best_action, models=models)
-            models = simplify_model_allocations(models)
-            total_remaining = sum(
-                num_gpus[gt] - calc_used_gpus({gt: models[gt]})
-                for gt in num_gpus
-            )
-
-            if verbose:
-                self._print_iteration(it, models, num_gpus)
-                print(f"HexGen fill: Allocated to {current_model.value}, remaining: {total_remaining}")
-
-            it += 1
-            if it > MAX_ITERATIONS:
-                logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
-                break
-
-        return models
diff --git a/simulator/milp.py b/simulator/milp.py
deleted file mode 100644
index 7a84e754..00000000
--- a/simulator/milp.py
+++ /dev/null
@@ -1,1070 +0,0 @@
-"""
-MILP formulation for the StreamWise workflow allocation problem.
-"""
-
-from __future__ import annotations
-
-import json
-import logging
-
-from typing import Callable
-from typing import Optional
-
-from pyomo.environ import ConcreteModel
-from pyomo.environ import Var
-from pyomo.environ import Set
-from pyomo.environ import Objective as OptObjective
-from pyomo.environ import Binary
-from pyomo.environ import NonNegativeIntegers
-from pyomo.environ import NonNegativeReals
-from pyomo.environ import minimize
-from pyomo.environ import SolverFactory
-from pyomo.environ import ConstraintList
-
-from sim_types import GPUType
-from sim_types import Model
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import Result
-from sim_types import Policy
-from sim_types import ModelAllocation
-from sim_types import Objective
-from sim_types import Solver
-
-from models import get_model_allocation
-
-from model_allocator import ModelAllocator
-
-from constants import DEVICE_OPTIONS
-from constants import NUM_GPUS_PER_SERVER
-from constants import SECONDS_IN_HOUR
-
-from policies import STREAMWISE_MILP_POLICY
-
-
-MAX_INSTANCES = 16
-
-# Maximum time it can take: 24 hours in seconds
-# Used for big-M constraints to link TTFF and makespan to instance variables
-MAX_TIME = 24 * SECONDS_IN_HOUR
-
-
-# Allocators that require quadratic (bilinear) objectives - need Gurobi
-QUADRATIC_OBJECTIVES = [
-    Objective.TTFF_COST,
-    Objective.TIME_ENERGY,
-    Objective.ENERGY_COST,
-]
-
-
-def idx(
-    gpu_type: GPUType,
-    model_name: Model,
-    instance_id: int
-) -> tuple[str, str, int]:
-    """Helper to convert enum to index key for instance variables."""
-    return (gpu_type.value, model_name.value, instance_id)
-
-
-def dev_idx(
-    gpu_type: GPUType,
-    model_name: Model,
-    instance_id: int,
-    num_devices: int
-) -> tuple[str, str, int, int]:
-    """Helper to convert enum to index key for device variables."""
-    return (gpu_type.value, model_name.value, instance_id, num_devices)
-
-
-class MILPAllocator(ModelAllocator):
-    """
-    MILP-based allocator that computes the optimal model allocation.
-    """
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = STREAMWISE_MILP_POLICY,
-    ) -> None:
-        super().__init__(
-            workflow,
-            latency_data,
-            power_data,
-            policy,
-        )
-        assert self.policy.solver in [Solver.GUROBI, Solver.HIGHS]
-
-    def allocate(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-        running_cost: bool = False,  # If True, cost = active time only; False = makespan x GPUs
-        max_cost: Optional[float] = None,  # If set, adds a constraint to limit cost
-        max_ttff: Optional[float] = None,  # If set, adds a constraint to limit TTFF
-        max_makespan: Optional[float] = None,  # If set, adds a constraint to limit makespan
-        time_limit: Optional[int] = None,  # Time limit for the solver in seconds
-        save_solution_path: Optional[str] = None,  # If set, saves the solution to a JSON file
-        warm_start_path: Optional[str] = None,  # If set, loads a warm start solution from a JSON file
-        force_num_gpus: bool = False,  # If True, adds constraints to force the use of all available GPUs
-        skip_server_constraint: bool = False,  # If True, skips the GPU-per-server constraint
-    ) -> Result:
-        """
-        Calculate the optimal model allocation and resulting metrics using MILP formulation.
-        """
-        m = ConcreteModel()
-
-        # Options: "gurobi", "highs"
-        solver_name = self.policy.solver.value
-
-        # Define index sets
-        gpu_types = list(num_gpus.keys())
-
-        model_names = [
-            Model.GEMMA,
-            Model.FLUX,
-            Model.HF,
-            # Model.HF_VAE,
-            Model.FT,
-            # Model.FT_VAE,
-            # Model.UPSCALER,
-            Model.OTHERS,
-        ]
-        if self.policy.use_upscaler:
-            model_names.append(Model.UPSCALER)
-        if self.policy.is_disaggregated(Model.HF):
-            model_names.append(Model.HF_VAE)
-        if self.policy.is_disaggregated(Model.FT):
-            model_names.append(Model.FT_VAE)
-
-        # Remove models not in the workflow
-        model_names = [
-            model_name
-            for model_name in model_names
-            if model_name in self.workflow.models
-        ]
-
-        instance_ids = list(range(MAX_INSTANCES))
-
-        # The units of work that each model has to do
-        work: dict[Model, int] = self.workflow.work
-
-        # Create Pyomo Sets
-        m.GPU_TYPES = Set(initialize=[g.value for g in gpu_types])
-        m.MODEL_NAMES = Set(initialize=[mn.value for mn in model_names])
-        m.INSTANCES = Set(initialize=instance_ids)
-
-        # Create index set for device choices: (gpu_type, model_name, instance_id, device_count)
-        device_index_set = [
-            (gpu_type.value, model_name.value, instance_id, num_devices)
-            for gpu_type in gpu_types
-            for model_name in model_names
-            for instance_id in instance_ids
-            for num_devices in [0] + DEVICE_OPTIONS[model_name]
-        ]
-        m.DEVICE_INDEX = Set(initialize=device_index_set)
-
-        # Create index set for instance variables: (gpu_type, model_name, instance_id)
-        instance_index_set = [
-            (gpu_type.value, model_name.value, instance_id)
-            for gpu_type in gpu_types
-            for model_name in model_names
-            for instance_id in instance_ids
-        ]
-        m.INSTANCE_INDEX = Set(initialize=instance_index_set)
-
-        # Define indexed variables
-        m.device_choice = Var(m.DEVICE_INDEX, domain=Binary)
-        m.work_device = Var(m.DEVICE_INDEX, domain=NonNegativeIntegers)  # Linearization: work per device choice
-        m.gpus = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers)
-        m.is_active = Var(m.INSTANCE_INDEX, domain=Binary)
-        m.is_min = Var(m.INSTANCE_INDEX, domain=Binary)
-        m.work = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers)
-        m.time = Var(m.INSTANCE_INDEX, domain=NonNegativeReals)
-        m.ttff = Var(m.INSTANCE_INDEX, domain=NonNegativeReals)
-
-        # Objective variables
-        m.makespan = Var(domain=NonNegativeReals)
-        m.ttff_user = Var(domain=NonNegativeReals)
-        m.ttff_min = Var(m.MODEL_NAMES, domain=NonNegativeReals)  # Per-model minimum TTFF
-        m.time_max = Var(m.MODEL_NAMES, domain=NonNegativeReals)  # Per-model maximum time
-        m.cost = Var(domain=NonNegativeReals)
-        m.energy = Var(domain=NonNegativeReals)
-
-        # Constraint list for dynamic constraints
-        m.constraints = ConstraintList()
-
-        for gpu_type in gpu_types:
-            for model_name in model_names:
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-
-                    # GPUs used = sum of num_devices * device_choice[num_devices]
-                    m.constraints.add(
-                        m.gpus[key] == sum(
-                            num_devices * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in [0] + DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-                    # Cannot select inactive instance as min
-                    m.constraints.add(m.is_min[key] <= m.is_active[key])
-                    # If active = 0 -> GPUs = 0
-                    m.constraints.add(m.gpus[key] <= num_gpus[gpu_type] * m.is_active[key])
-                    # If active = 1 -> GPUs ≥ 1
-                    m.constraints.add(m.gpus[key] >= m.is_active[key])
-                    # If work = 0 -> active = 0 -> GPUs = 0
-                    m.constraints.add(m.is_active[key] <= m.work[key])
-
-                    # If device = 0 -> work = 0
-                    dev_idx_0 = dev_idx(gpu_type, model_name, instance_id, 0)
-                    m.constraints.add(
-                        m.work[key]
-                        <= work[model_name] * (1 - m.device_choice[dev_idx_0])
-                    )
-
-                    # Linearization: work_device links device_choice and work
-                    # work = sum(work_device[d] for d in devices) - excludes 0 GPUs since they can't do work
-                    m.constraints.add(
-                        m.work[key] == sum(
-                            m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-                    # If any non-zero device is selected, work must be >= 1
-                    m.constraints.add(
-                        m.work[key] >= sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-                    # work_device[d] <= TOTAL_WORK * device_choice[d]
-                    for num_devices in [0] + DEVICE_OPTIONS[model_name]:
-                        didx = dev_idx(gpu_type, model_name, instance_id, num_devices)
-                        m.constraints.add(
-                            m.work_device[didx] <= work[model_name] * m.device_choice[didx]
-                        )
-
-                    # Link instance time to per-model max time
-                    m.constraints.add(m.time[key] <= m.time_max[model_name.value])
-
-                    # Link TTFF to per-model TTFF min
-                    # If selected → ttff_min[model] == ttff_var
-                    m.constraints.add(m.ttff_min[model_name.value] >= m.ttff[key] - MAX_TIME * (1 - m.is_min[key]))
-                    m.constraints.add(m.ttff_min[model_name.value] <= m.ttff[key] + MAX_TIME * (1 - m.is_active[key]))
-
-                # One device per instance
-                for instance_id in instance_ids:
-                    m.constraints.add(
-                        sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in [0] + DEVICE_OPTIONS[model_name]
-                        ) == 1
-                    )
-
-                # Symmetry breaking (fill earlier instances first)
-                for instance_id in range(MAX_INSTANCES - 1):
-                    m.constraints.add(
-                        m.gpus[idx(gpu_type, model_name, instance_id)]
-                        >= m.gpus[idx(gpu_type, model_name, instance_id + 1)]
-                    )
-
-        # Makespan is the sum of max times per model (models run sequentially)
-        m.constraints.add(m.makespan == sum(m.time_max[model_name.value] for model_name in model_names))
-
-        # User TTFF definition: sum of min TTFF per model
-        m.constraints.add(m.ttff_user >= sum(m.ttff_min[model_name.value] for model_name in model_names))
-        m.constraints.add(m.ttff_user >= m.makespan - self.workflow.total_video_seconds)
-
-        # Select exactly 1 instance as the min TTFF instance per model
-        for model_name in model_names:
-            m.constraints.add(
-                sum(
-                    m.is_min[idx(gpu_type, model_name, instance_id)]
-                    for gpu_type in gpu_types
-                    for instance_id in instance_ids
-                ) == 1
-            )
-
-        # Resolution scaling factor for HF/VAE/FT
-        latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler)
-
-        # Time constraints
-        # Each model block is guarded by membership in model_names so that
-        # the MILP can be built for a subset of models (e.g. Helix per-model).
-        for gpu_type in gpu_types:
-            # Gemma
-            if Model.GEMMA in model_names and work[Model.GEMMA] > 0:
-                model_name = Model.GEMMA
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Makespan is the max time across all instances
-                    # Linearized: use work_device instead of device_choice * work
-                    if work[model_name] > 1:
-                        # Parallel: each work unit = 1 scene
-                        # Time for w scenes
-                        # = gemma_first_scene + gemma_per_scene * (w - 1)
-                        # = (gemma_first_scene - gemma_per_scene) * is_active + gemma_per_scene * work
-                        # Using linearized variables:
-                        # = (gemma_first_scene[d] - gemma_per_scene[d]) * \
-                        # device_choice[d] + gemma_per_scene[d] * work_device[d]
-                        m.constraints.add(
-                            m.time[key] == sum(
-                                (
-                                    self.latency_data[gpu_type].gemma_first_scene[num_devices]
-                                    - self.latency_data[gpu_type].gemma_per_scene[num_devices]
-                                )
-                                * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                                + self.latency_data[gpu_type].gemma_per_scene[num_devices]
-                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                                for num_devices in DEVICE_OPTIONS[model_name]
-                            )
-                        )
-                    else:
-                        m.constraints.add(
-                            m.time[key] == sum(
-                                (
-                                    self.latency_data[gpu_type].gemma_first_scene[num_devices]
-                                    + self.latency_data[gpu_type].gemma_per_scene[num_devices]
-                                    * (self.workflow.total_scenes - 1)
-                                )
-                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                                for num_devices in DEVICE_OPTIONS[model_name]
-                            )
-                        )
-                    # TTFF is for 1 work unit
-                    m.constraints.add(
-                        m.ttff[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.latency_data[gpu_type].gemma_first_scene[num_devices]
-                            * 1  # TTFF for tokens in first scene
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-            # Flux
-            if Model.FLUX in model_names and work[Model.FLUX] > 0:
-                model_name = Model.FLUX
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Makespan is the max time across all instances
-                    # Linearized: use work_device instead of device_choice * work
-                    if work[model_name] > 1:
-                        # Parallel: each work unit = 1 scene
-                        # Time for w scenes = latency * num_steps_flux * w
-                        m.constraints.add(
-                            m.time[key] == sum(
-                                self.latency_data[gpu_type][model_name, num_devices]
-                                * self.workflow.num_steps[model_name]
-                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                                for num_devices in DEVICE_OPTIONS[model_name]
-                            )
-                        )
-                    else:
-                        # Non-parallel: single work unit covers all scenes
-                        m.constraints.add(
-                            m.time[key] == sum(
-                                self.latency_data[gpu_type][model_name, num_devices]
-                                * self.workflow.num_steps[model_name]
-                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                                for num_devices in DEVICE_OPTIONS[model_name]
-                            )
-                        )
-                    # TTFF is for 1 work unit
-                    m.constraints.add(
-                        m.ttff[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.latency_data[gpu_type][model_name, num_devices]
-                            * self.workflow.num_steps[model_name]
-                            * 1  # TTFF for first work unit
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-            # Hunyuan FramePack
-            if Model.HF in model_names and work[Model.HF] > 0:
-                model_name = Model.HF
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-
-                    """
-                    from models import HFModelAllocation
-                    HFModelAllocation(
-                        gpu_type,
-                        num_devices,
-                        replicas=1,
-                    )._calc_time_per_subscene(
-                        self.policy,
-                        self.workflow,
-                        self.latency_data[gpu_type]
-                    )
-                    """
-
-                    # Makespan is the max time across all instances
-                    # Linearized: use work_device instead of device_choice * work
-                    hf_time_expr = sum(
-                        self.workflow.per_subscene_frames[model_name]
-                        / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-                        * self.latency_data[gpu_type][model_name, num_devices]
-                        * latency_ratio
-                        * self.workflow.num_steps[model_name]
-                        * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                        for num_devices in DEVICE_OPTIONS[model_name]
-                    )
-                    # When not disaggregated, VAE runs on the same instance
-                    if not self.policy.is_disaggregated(Model.HF):
-                        hf_vae_time_per_work = (
-                            self.latency_data[gpu_type][Model.HF_VAE, 1]
-                            * latency_ratio
-                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-                        )
-                        hf_time_expr += hf_vae_time_per_work * m.work[key]
-                    m.constraints.add(m.time[key] == hf_time_expr)
-                    # TTFF is for first chunk (can be smaller than subscene when disaggregated)
-                    ttff_frames_hf = min(
-                        self.workflow.hf_frames[0],
-                        self.workflow.per_subscene_frames[model_name])
-                    hf_ttff_expr = sum(
-                        m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                        * ttff_frames_hf
-                        / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-                        * self.latency_data[gpu_type][model_name, num_devices]
-                        * latency_ratio
-                        * self.workflow.num_steps[model_name]
-                        * 1  # TTFF for first chunk
-                        for num_devices in DEVICE_OPTIONS[model_name]
-                    )
-                    # When not disaggregated, add VAE decode time for first chunk
-                    if not self.policy.is_disaggregated(Model.HF):
-                        hf_vae_ttff = (
-                            ttff_frames_hf
-                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-                            * self.latency_data[gpu_type][Model.HF_VAE, 1]
-                            * latency_ratio
-                        )
-                        hf_ttff_expr += hf_vae_ttff * m.is_active[key]
-                    m.constraints.add(m.ttff[key] == hf_ttff_expr)
-
-            # Hunyuan FramePack VAE
-            if Model.HF_VAE in model_names and work[Model.HF_VAE] > 0:
-                model_name = Model.HF_VAE
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Makespan is the max time across all instances
-                    # Linearized: use work_device instead of device_choice * work
-                    m.constraints.add(
-                        m.time[key] == sum(
-                            self.latency_data[gpu_type][model_name, num_devices]
-                            * latency_ratio
-                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-                            * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-                    # TTFF is for 1 subscene
-                    m.constraints.add(
-                        m.ttff[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.workflow.per_subscene_frames[Model.HF]
-                            * self.latency_data[gpu_type][model_name, num_devices]
-                            * latency_ratio
-                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]  # frames_per_step_hf
-                            * 1  # TTFF for first subscene
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-            # Fantasy Talking
-            if Model.FT in model_names and work[Model.FT] > 0:
-                model_name = Model.FT
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Makespan is the max time across all instances
-                    # Linearized: use work_device instead of device_choice * work
-                    ft_time_expr = sum(
-                        self.workflow.per_subscene_frames[model_name]
-                        / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-                        * self.latency_data[gpu_type][model_name, num_devices]
-                        * latency_ratio
-                        * self.workflow.num_steps[model_name]
-                        * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                        for num_devices in DEVICE_OPTIONS[model_name]
-                    )
-                    # When not disaggregated, VAE runs on the same instance
-                    if not self.policy.is_disaggregated(Model.FT):
-                        ft_vae_time_per_work = (
-                            self.latency_data[gpu_type][Model.FT_VAE, 1]
-                            * latency_ratio
-                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-                        )
-                        ft_time_expr += ft_vae_time_per_work * m.work[key]
-                    m.constraints.add(m.time[key] == ft_time_expr)
-                    # TTFF is for 1 work unit (e.g., subscene)
-                    ft_ttff_expr = sum(
-                        m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                        * self.workflow.per_subscene_frames[model_name]
-                        / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-                        * self.latency_data[gpu_type][model_name, num_devices]
-                        * latency_ratio
-                        * self.workflow.num_steps[model_name]
-                        * 1  # TTFF for first subscene
-                        for num_devices in DEVICE_OPTIONS[model_name]
-                    )
-                    # When not disaggregated, add VAE decode time for first subscene
-                    if not self.policy.is_disaggregated(Model.FT):
-                        ft_vae_ttff = (
-                            self.workflow.per_subscene_frames[Model.FT]
-                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-                            * self.latency_data[gpu_type][Model.FT_VAE, 1]
-                            * latency_ratio
-                        )
-                        ft_ttff_expr += ft_vae_ttff * m.is_active[key]
-                    m.constraints.add(m.ttff[key] == ft_ttff_expr)
-
-            # Fantasy Talking VAE
-            if Model.FT_VAE in model_names and work[Model.FT_VAE] > 0:
-                model_name = Model.FT_VAE
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Makespan is the max time across all instances
-                    # Linearized: use work_device instead of device_choice * work
-                    m.constraints.add(
-                        m.time[key] == sum(
-                            self.latency_data[gpu_type][model_name, num_devices]
-                            * latency_ratio
-                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-                            * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-                    # TTFF is for 1 subscene
-                    m.constraints.add(
-                        m.ttff[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.workflow.per_subscene_frames[Model.FT]
-                            * self.latency_data[gpu_type][model_name, num_devices]
-                            * latency_ratio
-                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]  # frames_per_step_ft
-                            * 1  # TTFF for first subscene
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-            # Upscaler
-            if Model.UPSCALER in model_names and work[Model.UPSCALER] > 0 and self.policy.use_upscaler:
-                model_name = Model.UPSCALER
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Linearized: use work_device instead of device_choice * work
-                    m.constraints.add(
-                        m.time[key] == sum(
-                            self.latency_data[gpu_type][model_name, num_devices]
-                            * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-                    # TTFF is for 1 work unit (e.g., subscene)
-                    m.constraints.add(
-                        m.ttff[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.latency_data[gpu_type][model_name, num_devices]
-                            * self.workflow.per_subscene_frames[Model.FT]
-                            * 1  # TTFF is for first subscene
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-            # Others
-            if Model.OTHERS in model_names and work[Model.OTHERS] > 0:
-                model_name = Model.OTHERS
-                for instance_id in instance_ids:
-                    key = idx(gpu_type, model_name, instance_id)
-                    # Makespan is the max time across all instances
-                    m.constraints.add(
-                        m.time[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.latency_data[gpu_type][model_name, num_devices]
-                            * self.workflow.total_scenes
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-                    # TTFF is for 1 work unit
-                    m.constraints.add(
-                        m.ttff[key] == sum(
-                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                            * self.latency_data[gpu_type][model_name, num_devices]
-                            * 1  # TTFF is for first scene
-                            for num_devices in DEVICE_OPTIONS[model_name]
-                        )
-                    )
-
-        # Total work to do for each model
-        for model_name in model_names:
-            m.constraints.add(
-                sum(
-                    m.work[idx(gpu_type, model_name, instance_id)]
-                    for gpu_type in gpu_types
-                    for instance_id in instance_ids
-                ) == work[model_name]
-            )
-
-        # Number of GPUs per type
-        # Add a variable to represent the number of servers for each GPU type
-        m.num_servers = Var(m.GPU_TYPES, domain=NonNegativeIntegers)
-
-        for gpu_type in gpu_types:
-            total_gpus = sum(
-                m.gpus[idx(gpu_type, model_name, instance_id)]
-                for model_name in model_names
-                for instance_id in instance_ids
-            )
-            if force_num_gpus:
-                m.constraints.add(total_gpus == num_gpus[gpu_type])
-            else:
-                m.constraints.add(total_gpus <= num_gpus[gpu_type])
-
-            # GPUs used must be a multiple of NUM_GPUS_PER_SERVER
-            if not skip_server_constraint:
-                m.constraints.add(total_gpus == m.num_servers[gpu_type.value] * NUM_GPUS_PER_SERVER[gpu_type])
-
-        # Cost calculation
-        # running_cost=True: cost based only on active model running time
-        if running_cost:
-            cost_expr = sum(
-                self._get_latency_per_work(
-                    gpu_type,
-                    model_name,
-                    num_devices,
-                )
-                * num_devices
-                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR
-                for gpu_type in gpu_types
-                for model_name in model_names
-                for instance_id in instance_ids
-                for num_devices in DEVICE_OPTIONS[model_name]
-            )
-        # running_cost=False: cost = makespan × total_GPUs_used (GPUs allocated for full job duration)
-        else:
-            cost_expr = m.makespan * sum(
-                m.gpus[idx(gpu_type, model_name, instance_id)]
-                * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR
-                for gpu_type in gpu_types
-                for model_name in model_names
-                for instance_id in instance_ids
-            )
-        m.constraints.add(m.cost == cost_expr)
-
-        # Energy: model-specific power * active time + idle power * (makespan - active time)
-        if self.power_data is None:
-            energy_expr = 0.0
-        else:
-            # Active energy: Use model-specific power values (not TDP)
-            energy_expr = sum(
-                self._get_latency_per_work(
-                    gpu_type,
-                    model_name,
-                    num_devices,
-                )
-                * num_devices
-                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
-                * (
-                    self._get_power_per_work(
-                        gpu_type,
-                        model_name,
-                        num_devices,
-                    ) - self.power_data[gpu_type]["idle"]
-                )
-                for gpu_type in gpu_types
-                for model_name in model_names
-                for instance_id in instance_ids
-                for num_devices in DEVICE_OPTIONS[model_name]
-            )
-            # Idle energy: idle power * num_gpus * makespan
-            energy_expr += sum(
-                self.power_data[gpu_type]["idle"] * num_gpus[gpu_type] * m.makespan
-                for gpu_type in gpu_types
-            )
-        m.constraints.add(m.energy == energy_expr)
-
-        # Bounds
-        if max_cost is not None:
-            m.constraints.add(m.cost <= max_cost)
-        if max_ttff is not None:
-            m.constraints.add(m.ttff_user <= max_ttff)
-        if max_makespan is not None:
-            m.constraints.add(m.makespan <= max_makespan)
-
-        # Objective functions
-        obj = get_objective(
-            m=m,
-            allocator=self.policy.objective,
-            solver_name=solver_name,
-        )
-        if obj is not None:
-            m.objective = obj
-
-        # Solve
-        solver = SolverFactory(solver_name)
-        if solver_name == "gurobi" and time_limit:
-            solver.options["TimeLimit"] = time_limit
-        if solver_name == "highs" and time_limit:
-            solver.options["time_limit"] = time_limit
-        if self.policy.objective in QUADRATIC_OBJECTIVES and solver_name == "gurobi":
-            solver.options['NonConvex'] = 2  # Option for bilinear objectives
-        if solver_name == "highs":
-            solver.options["time_limit"] = 50  # seconds
-
-        if warm_start_path is not None:
-            _load_warm_start(m, warm_start_path)
-
-        if solver_name == "gurobi":
-            opt_result = solver.solve(
-                m,
-                tee=verbose,
-                warmstart=warm_start_path is not None,
-            )
-        else:
-            opt_result = solver.solve(m, tee=verbose)
-
-        if opt_result.solver.status != "ok":
-            logging.error(f"Solver failed with status: {opt_result.solver.status}")
-
-        if save_solution_path is not None:
-            _save_solution(m, save_solution_path)
-
-        models = milp_to_models_dict(
-            m=m,
-            gpu_types=gpu_types,
-            model_names=model_names,
-            instance_ids=instance_ids,
-            idx=idx,
-            workflow=self.workflow,
-            power_data=self.power_data,
-            policy=self.policy,
-        )
-
-        if not self._is_valid_result(m):
-            return Result()
-
-        tbf_s = 0.0
-        if m.makespan.value and self.workflow.num_frames > 0:
-            tbf_s = m.makespan.value / self.workflow.num_frames
-        return Result(
-            models=models,
-            gpus_used=self._get_num_gpus(m, gpu_types, model_names, instance_ids),
-            total_time_s=m.makespan.value,
-            ttff_s=m.ttff_user.value,
-            tbf_s=tbf_s,
-            cost=m.cost.value,
-            total_energy=m.energy.value,
-        )
-
-    def _is_valid_result(self, m: ConcreteModel) -> bool:
-        for gpu_type in m.GPU_TYPES:
-            for model_name in m.MODEL_NAMES:
-                for instance_id in m.INSTANCES:
-                    if m.gpus[gpu_type, model_name, instance_id].value is None:
-                        return False
-        return True
-
-    def _get_num_gpus(
-        self,
-        m: ConcreteModel,
-        gpu_types: list[GPUType],
-        model_names: list[Model],
-        instance_ids: list[int],
-    ) -> dict[GPUType, int]:
-        if not self._is_valid_result(m):
-            return {}
-        return {
-            gpu_type: sum(
-                # round() snaps solver float to nearest int (e.g. 1.9999 -> 2)
-                int(round(m.gpus[idx(gpu_type, model_name, instance_id)].value))
-                for model_name in model_names
-                for instance_id in instance_ids
-                if m.gpus[idx(gpu_type, model_name, instance_id)].value is not None
-            )
-            for gpu_type in gpu_types
-        }
-
-    def _get_latency_per_work(
-        self,
-        gpu_type: GPUType,
-        model_name: Model,
-        num_devices: int,
-    ) -> float:
-        """
-        Cost per unit of work for a given model and GPU type, based on latency data.
-        Cost: Linearized - sum of (latency * work_device * num_devices * ratio)
-        This replaces the bilinear makespan * GPUs.
-        """
-        # Resolution scaling factor for HF/VAE/FT
-        latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler)
-
-        if model_name == Model.GEMMA:
-            return (
-                self.latency_data[gpu_type].gemma_first_scene[num_devices]
-                + self.latency_data[gpu_type].gemma_per_scene[num_devices] * (self.workflow.total_scenes - 1)
-            )
-
-        if model_name == Model.FLUX:
-            return (
-                self.latency_data[gpu_type][model_name, num_devices]
-                * self.workflow.num_steps[Model.FLUX]
-            )
-
-        if model_name == Model.HF:
-            time_per_work = (
-                self.workflow.per_subscene_frames[Model.HF]
-                / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-                * self.latency_data[gpu_type][model_name, num_devices]
-                * latency_ratio
-                * self.workflow.num_steps[Model.HF]
-            )
-            if not self.policy.is_disaggregated(Model.HF):
-                time_per_work += self._get_latency_per_work(
-                    gpu_type,
-                    Model.HF_VAE,
-                    1,  # VAE is single-device only in current policy
-                )
-            return time_per_work
-
-        if model_name == Model.HF_VAE:
-            return (
-                self.latency_data[gpu_type][model_name, num_devices]
-                * latency_ratio
-                / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
-            )
-
-        if model_name == Model.FT:
-            time_per_work = (
-                self.workflow.per_subscene_frames[Model.FT]
-                / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-                * self.latency_data[gpu_type][model_name, num_devices]
-                * latency_ratio
-                * self.workflow.num_steps[Model.FT]
-            )
-            if not self.policy.is_disaggregated(Model.FT):
-                time_per_work += self._get_latency_per_work(
-                    gpu_type,
-                    Model.FT_VAE,
-                    1,  # VAE is single-device only in current policy
-                )
-            return time_per_work
-
-        if model_name == Model.FT_VAE:
-            return (
-                self.latency_data[gpu_type][model_name, num_devices]
-                * latency_ratio
-                / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
-            )
-
-        if model_name == Model.UPSCALER:
-            return self.latency_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.OTHERS:
-            return self.latency_data[gpu_type][model_name, num_devices] * self.workflow.total_scenes
-
-        raise ValueError(f"Unknown model_name {model_name}")
-
-    def _get_power_per_work(
-        self,
-        gpu_type: GPUType,
-        model_name: Model,
-        num_devices: int,
-    ) -> float:
-        """
-        Average power per unit of work for a given model and GPU type.
-        Returns the time-weighted average power consumption in watts.
-        For energy calculation:
-        energy = _get_latency_per_work(...) * _get_power_per_work(...) * num_devices * work
-        """
-        if self.power_data is None:
-            return 0.0
-
-        if model_name == Model.GEMMA:
-            # For Gemma, power varies between first scene and subsequent scenes
-            # Compute energy then divide by total time to get average power
-            power_first = self.power_data[gpu_type].gemma_first_scene[num_devices]
-            power_per_scene = self.power_data[gpu_type].gemma_per_scene[num_devices]
-            latency_first = self.latency_data[gpu_type].gemma_first_scene[num_devices]
-            latency_per_scene = self.latency_data[gpu_type].gemma_per_scene[num_devices]
-
-            total_energy = (
-                power_first * latency_first
-                + power_per_scene * latency_per_scene * (self.workflow.total_scenes - 1)
-            )
-            total_time = latency_first + latency_per_scene * (self.workflow.total_scenes - 1)
-
-            return total_energy / total_time if total_time > 0 else power_first
-
-        if model_name == Model.FLUX:
-            return self.power_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.HF:
-            return self.power_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.HF_VAE:
-            return self.power_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.FT:
-            return self.power_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.FT_VAE:
-            return self.power_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.UPSCALER:
-            return self.power_data[gpu_type][model_name, num_devices]
-
-        if model_name == Model.OTHERS:
-            # OTHERS model uses minimal GPU power (mostly idle)
-            # See models.py OthersModelAllocation.calculate_energy - only uses idle power
-            return self.power_data[gpu_type]["idle"]
-
-        raise ValueError(f"Unknown model_name {model_name}")
-
-
-def milp_to_models_dict(
-    m: ConcreteModel,
-    gpu_types: list[GPUType],
-    model_names: list[Model],
-    instance_ids: list[int],
-    idx: Callable[[GPUType, Model, int], tuple[str, str, int]],
-    workflow: WorkflowConfig,
-    power_data: Optional[PowerData],
-    policy: Policy,
-) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-    """
-    MILP result to models dictionary.
-    """
-    if m is None:
-        return {}
-
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}
-    for gpu_type in gpu_types:
-        models[gpu_type] = {}
-        for model_name in model_names:
-            models[gpu_type][model_name] = []
-            for instance_id in instance_ids:
-                key = idx(gpu_type, model_name, instance_id)
-                gpus_val = m.gpus[key].value
-                work_val = m.work[key].value
-                if gpus_val is None or work_val is None:
-                    continue
-                # round() snaps solver floats to nearest int (e.g. 1.9999 -> 2);
-                # banker's rounding is irrelevant here since MILP values can be
-                # near-integer, like 1.999 and 2.001
-                gpus = int(round(gpus_val))
-                work = int(round(work_val))
-                if gpus > 0 and work > 0:
-                    model_allocation = get_model_allocation(
-                        model=model_name,
-                        gpu_type=gpu_type,
-                        devices=gpus,
-                        replicas=1,
-                    )
-                    model_allocation.work = work
-                    model_allocation.time = m.time[key].value
-                    model_allocation.time_first = m.ttff[key].value
-                    model_allocation.calculate_energy(
-                        workflow=workflow,
-                        power_data=power_data,
-                        total_time_s=m.makespan.value
-                    )
-                    model_allocation.calculate_cost(
-                        policy,
-                        total_time_s=m.makespan.value
-                    )
-                    models[gpu_type][model_name].append(model_allocation)
-    merged_models = models  # coalesce_models(models)
-    return merged_models
-
-
-def get_objective(
-    m: ConcreteModel,
-    allocator: Objective,
-    solver_name: str,
-) -> Optional[OptObjective]:
-    if allocator == Objective.TIME:
-        return OptObjective(expr=m.makespan, sense=minimize)
-
-    if allocator == Objective.TTFF:
-        return OptObjective(expr=m.ttff_user, sense=minimize)
-
-    if allocator == Objective.TTFF_COST:
-        # Note: This creates a bilinear (nonconvex) objective - requires Gurobi
-        if solver_name == "gurobi":
-            return OptObjective(expr=m.ttff_user * m.cost, sense=minimize)
-        logging.warning("TTFF_COST using linear utility function.")
-        a = 1.0
-        b = 1.0
-        return OptObjective(expr=a * m.ttff_user + b * m.cost, sense=minimize)
-
-    if allocator == Objective.COST:
-        return OptObjective(expr=m.cost, sense=minimize)
-
-    if allocator == Objective.ENERGY:
-        return OptObjective(expr=m.energy, sense=minimize)
-
-    if allocator == Objective.TIME_ENERGY:
-        # Note: This creates a bilinear objective - requires Gurobi
-        if solver_name == "gurobi":
-            return OptObjective(expr=m.makespan * m.energy, sense=minimize)
-        logging.warning("TIME_ENERGY using linear utility function.")
-        a = 1.0
-        b = 1.0
-        return OptObjective(expr=a * m.makespan + b * m.energy, sense=minimize)
-
-    if allocator == Objective.ENERGY_COST:
-        if solver_name == "gurobi":
-            return OptObjective(expr=m.energy * m.cost, sense=minimize)
-        logging.warning("ENERGY_COST using linear utility function.")
-        a = 1.0
-        b = 1.0
-        return OptObjective(expr=a * m.energy + b * m.cost, sense=minimize)
-
-    if allocator == Objective.FIFO:
-        logging.error("FIFO not implemented in MILP")
-
-    if allocator == Objective.RANDOM:
-        return None  # No objective, just find a feasible solution
-
-    if allocator == Objective.NONE:
-        return None
-
-    return OptObjective(expr=m.makespan, sense=minimize)
-
-
-def _save_solution(
-    m: ConcreteModel,
-    save_solution_path: str,
-) -> None:
-    solution = {
-        var.name: var.value
-        for var in m.component_data_objects(Var, active=True)
-        if var.value is not None
-    }
-    with open(save_solution_path, "w", encoding="utf-8") as output_file:
-        json.dump(solution, output_file, indent=2)
-
-
-def _load_warm_start(
-    m: ConcreteModel,
-    warm_start_path: str,
-) -> None:
-    """Load warm start values from a JSON file and apply them to the model variables."""
-    with open(warm_start_path, "r", encoding="utf-8") as input_file:
-        warm_start_values = json.load(input_file)
-
-    warm_start_applied = 0
-    for var in m.component_data_objects(Var, active=True):
-        if var.name in warm_start_values:
-            var.set_value(warm_start_values[var.name])
-            warm_start_applied += 1
-
-    logging.info(
-        f"Warm start loaded from {warm_start_path}. "
-        f"Applied values to {warm_start_applied} variables."
-    )
diff --git a/simulator/model_allocator.py b/simulator/model_allocator.py
deleted file mode 100644
index ab1c7e39..00000000
--- a/simulator/model_allocator.py
+++ /dev/null
@@ -1,282 +0,0 @@
-"""
-Defines the ModelAllocator abstract base class and its interface for model allocation strategies.
-"""
-
-from __future__ import annotations
-
-from typing import Optional
-
-from abc import ABC
-from abc import abstractmethod
-
-from sim_types import GPUType
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import Policy
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import Result
-
-from models import FluxModelAllocation
-from models import GemmaModelAllocation
-from models import HFModelAllocation
-from models import HFVAEModelAllocation
-from models import FTModelAllocation
-from models import FTVAEModelAllocation
-from models import UpscalerModelAllocation
-from models import OthersModelAllocation
-
-from policies import NAIVE_POLICY
-
-
-class ModelAllocator(ABC):
-    """
-    Abstract base class for model allocators.
-    """
-
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = NAIVE_POLICY,
-    ) -> None:
-        self.workflow = workflow
-        self.latency_data = latency_data
-        self.power_data = power_data
-        self.policy = policy
-
-    @abstractmethod
-    def allocate(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-    ) -> Result:
-        """Allocate models to GPUs and return the provisioning result."""
-        ...
-
-    def _init_single_server_models(
-        self,
-        gpu_type: GPUType,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """
-        Initialize model allocations for a single server (8 GPUs or fewer).
-        Each model gets a single allocation entry.
-        """
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
-            gpu_type: {
-                Model.GEMMA: [
-                    GemmaModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1)
-                ],
-                Model.FLUX: [
-                    FluxModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1)
-                ],
-                Model.HF: [
-                    HFModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=2)
-                ],
-                Model.HF_VAE: [
-                    HFVAEModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1)
-                ],
-                Model.FT: [
-                    FTModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1)
-                ],
-                Model.FT_VAE: [
-                    FTVAEModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1)
-                ],
-                Model.UPSCALER: [
-                    UpscalerModelAllocation(
-                        gpu_type=gpu_type)
-                ],
-                Model.OTHERS: [
-                    OthersModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1)  # + 1 for Kokoro/YOLO
-                ],
-            },
-        }
-
-        if self.policy.use_upscaler:
-            # HF -> UPSCALER
-            models[gpu_type][Model.HF][0].replicas -= 1
-            models[gpu_type][Model.UPSCALER][0].replicas += 1
-
-        if not self.policy.is_disaggregated(Model.HF):
-            # HF_VAE -> HF
-            models[gpu_type][Model.HF_VAE][0].replicas -= 1
-            models[gpu_type][Model.HF][0].replicas += 1
-        if not self.policy.is_disaggregated(Model.FT):
-            # FT_VAE -> FT
-            models[gpu_type][Model.FT_VAE][0].replicas -= 1
-            models[gpu_type][Model.FT][0].replicas += 1
-
-        self._zero_out_unused_models(models)
-        return models
-
-    def _init_single_device_models(
-        self,
-        gpu_type: GPUType,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """
-        Initialize model allocations for a single GPU type with >8 GPUs.
-        Each model gets two allocation entries (active and inactive).
-        """
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
-            gpu_type: {
-                Model.GEMMA: [
-                    GemmaModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1),
-                    GemmaModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.FLUX: [
-                    FluxModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1),
-                    FluxModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.HF: [
-                    HFModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1),
-                    HFModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.HF_VAE: [
-                    HFVAEModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1),
-                    HFVAEModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.FT: [
-                    FTModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=2, replicas=1),
-                    FTModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.FT_VAE: [
-                    FTVAEModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1),
-                    FTVAEModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.UPSCALER: [
-                    UpscalerModelAllocation(
-                        gpu_type=gpu_type),
-                    UpscalerModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-                Model.OTHERS: [
-                    OthersModelAllocation(
-                        gpu_type=gpu_type,
-                        devices=1, replicas=1),
-                    OthersModelAllocation(
-                        gpu_type=gpu_type),
-                ],
-            },
-        }
-
-        if self.policy.use_upscaler:
-            models[gpu_type][Model.UPSCALER][0].replicas = 1
-
-        if not self.policy.is_disaggregated(Model.HF):
-            # HF_VAE -> HF
-            models[gpu_type][Model.HF_VAE][0].replicas -= 1
-            models[gpu_type][Model.HF][0].replicas += 1
-        if not self.policy.is_disaggregated(Model.FT):
-            # FT_VAE -> FT
-            models[gpu_type][Model.FT_VAE][0].replicas -= 1
-            models[gpu_type][Model.FT][0].replicas += 1
-
-        self._zero_out_unused_models(models)
-        return models
-
-    def _init_both_devices_models(
-        self,
-        gpu_type1: GPUType,
-        gpu_type2: GPUType,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """
-        Initialize model allocations for two GPU types.
-        gpu_type1 gets GEMMA, FLUX, OTHERS; gpu_type2 gets HF, VAE, FT, UPSCALER.
-        """
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
-            gpu_type1: {
-                Model.GEMMA: [GemmaModelAllocation(
-                    gpu_type=gpu_type1,
-                    devices=1, replicas=1)],
-                Model.FLUX: [FluxModelAllocation(
-                    gpu_type=gpu_type1,
-                    devices=1, replicas=1)],
-                Model.HF: [],
-                Model.HF_VAE: [],
-                Model.FT: [],
-                Model.FT_VAE: [],
-                Model.UPSCALER: [],
-                Model.OTHERS: [OthersModelAllocation(
-                    gpu_type=gpu_type1,
-                    devices=1, replicas=1)],  # + 1 for Kokoro/YOLO
-            },
-            gpu_type2: {
-                Model.GEMMA: [],
-                Model.FLUX: [],
-                Model.HF: [HFModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=1, replicas=1)],
-                Model.HF_VAE: [HFVAEModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=1, replicas=1)],
-                Model.FT: [FTModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=2, replicas=1)],
-                Model.FT_VAE: [FTVAEModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=1, replicas=1)],
-                Model.UPSCALER: [UpscalerModelAllocation(
-                    gpu_type=gpu_type2)],
-                Model.OTHERS: [],
-            },
-        }
-
-        if not self.policy.is_disaggregated(Model.HF):
-            # HF_VAE -> HF
-            models[gpu_type2][Model.HF_VAE][0].replicas -= 1
-            models[gpu_type2][Model.HF][0].replicas += 1
-        if not self.policy.is_disaggregated(Model.FT):
-            # FT_VAE -> FT
-            models[gpu_type2][Model.FT_VAE][0].replicas -= 1
-            models[gpu_type2][Model.FT][0].replicas += 1
-
-        if self.policy.use_upscaler:
-            models[gpu_type2][Model.UPSCALER][0].replicas = 1
-
-        self._zero_out_unused_models(models)
-        return models
-
-    def _zero_out_unused_models(
-        self,
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-    ) -> None:
-        """Zero out replicas for models not in the workflow."""
-        for gpu_type in models:
-            for model in Model:
-                if model not in self.workflow.models:
-                    for alloc in models[gpu_type][model]:
-                        alloc.replicas = 0
diff --git a/simulator/models.py b/simulator/models.py
deleted file mode 100644
index 9a56ab79..00000000
--- a/simulator/models.py
+++ /dev/null
@@ -1,811 +0,0 @@
-"""
-Contains the definition for each model.
-It includes the calculations for time, energy, and cost.
-"""
-from __future__ import annotations
-
-import math
-
-from typing import override
-from typing import Callable
-from typing import Optional
-from typing import Type
-from typing import ClassVar
-
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import ModelAllocation
-from sim_types import Model
-from sim_types import Policy
-from sim_types import QualityLevel
-from sim_types import WorkflowConfig
-from sim_types import GPUType
-
-from constants import TOTAL_INPUT_TOKENS
-
-
-# ModelAllocation Factory
-ModelAllocationCls = Type[ModelAllocation]
-
-_MODEL_ALLOCATION_REGISTRY: dict[Model, ModelAllocationCls] = {}
-
-
-def register_model(
-    model: Model
-) -> Callable[[ModelAllocationCls], ModelAllocationCls]:
-    """Register a ModelAllocation class for the factory."""
-    def decorator(cls: ModelAllocationCls) -> ModelAllocationCls:
-        _MODEL_ALLOCATION_REGISTRY[model] = cls
-        return cls
-    return decorator
-
-
-def get_model_allocation(
-    *,
-    model: Model,
-    gpu_type: GPUType,
-    devices: int = 1,
-    replicas: int = 0,
-) -> ModelAllocation:
-    """Factory to get the ModelAllocation instance for a specific model."""
-    if model not in _MODEL_ALLOCATION_REGISTRY:
-        raise ValueError(f"No ModelAllocation for model {model}")
-    cls = _MODEL_ALLOCATION_REGISTRY[model]
-    return cls(
-        gpu_type=gpu_type,
-        devices=devices,
-        replicas=replicas,
-    )
-
-
-def _calculate_total_time(
-    total_work: float,
-    num_replicas: int,
-    time_per_work: float,
-) -> float:
-    """Calculate total time given work, replicas, and time per work unit."""
-    if num_replicas <= 0:
-        return 0.0
-    total_time = (total_work / num_replicas) * time_per_work
-    if total_time < time_per_work:  # We cannot go faster than single work unit time
-        total_time = time_per_work
-    return total_time
-
-
-def assert_pixel_config(
-    workflow: WorkflowConfig
-) -> None:
-    """Verify that the workflow's pixel configuration is valid for upscaling."""
-    from sim_types import RESOLUTION_PIXELS
-    assert 0 < RESOLUTION_PIXELS[QualityLevel.MEDIUM] < RESOLUTION_PIXELS[QualityLevel.HIGH]
-
-
-@register_model(Model.GEMMA)
-class GemmaModelAllocation(ModelAllocation):
-    """Gemma model allocation."""
-    model: ClassVar[Model] = Model.GEMMA
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.GEMMA, 1)
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-        latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices]
-        latency_per_scene = latency_data[self.gpu_type].gemma_per_scene[self.devices]
-        latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
-        latency_per_scene *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
-        total_work = workflow.model_work.get(Model.GEMMA, 1)
-        if total_work > 1:
-            num_scenes = math.ceil(work_pct * total_work)
-            total_time_per_scene = latency_first + latency_per_scene * (num_scenes - 1)
-            self.time = _calculate_total_time(
-                num_scenes,
-                self.replicas,
-                total_time_per_scene / num_scenes)
-        else:
-            self.time = latency_first + latency_per_scene * (workflow.total_scenes - 1)
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-        latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices]
-        latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
-        self.time_first = latency_first
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        # Gemma energy
-        latency_first = self.time_first
-        latency_per_scene = max(0.0, self.time - latency_first)
-        power_first = power_data[self.gpu_type].gemma_first_scene[self.devices]
-        power_per_scene = power_data[self.gpu_type].gemma_per_scene[self.devices]
-        self.energy = \
-            power_first * latency_first + \
-            power_per_scene * latency_per_scene * (workflow.total_scenes - 1)
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-
-@register_model(Model.FLUX)
-class FluxModelAllocation(ModelAllocation):
-    """Flux model allocation."""
-    model: ClassVar[Model] = Model.FLUX
-
-    def _calc_time_per_scene(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        return (
-            latency_data[self.gpu_type][self.model, self.devices]
-            * workflow.num_steps[Model.FLUX]
-        )
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.FLUX, 1)
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-        time_per_scene = self._calc_time_per_scene(
-            policy,
-            workflow,
-            latency_data,
-        )
-        total_work = workflow.model_work.get(Model.FLUX, 1)
-        if total_work > 1:
-            num_scenes = math.ceil(work_pct * total_work)
-            self.time = _calculate_total_time(
-                num_scenes,
-                self.replicas,
-                time_per_scene)
-        else:
-            self.time = time_per_scene
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-        time_per_scene = self._calc_time_per_scene(
-            policy,
-            workflow,
-            latency_data,
-        )
-        self.time_first = time_per_scene
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        power_flux = power_data[self.gpu_type][Model.FLUX, self.devices]
-        self.energy = power_flux * self.time * self.replicas
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-
-@register_model(Model.HF)
-class HFModelAllocation(ModelAllocation):
-    """HunyuanFramePack model allocation."""
-    model: ClassVar[Model] = Model.HF
-
-    def _calc_time_per_frame(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        return (
-            latency_data[self.gpu_type][self.model, self.devices]
-            * workflow.get_resolution_scale(policy.use_upscaler)
-            * workflow.num_steps[Model.HF]
-        )
-
-    def _calc_time_per_subscene(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        return (
-            workflow.per_subscene_frames[Model.HF]
-            / workflow.hf_frames[workflow.frames_per_step_idx]
-            * latency_data[self.gpu_type][self.model, self.devices]
-            * workflow.get_resolution_scale(policy.use_upscaler)  # latency_ratio
-            * workflow.num_steps[Model.HF]
-        )
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-
-        hf_time_per_subscene = self._calc_time_per_subscene(
-            policy,
-            workflow,
-            latency_data,
-        )
-        self.time = _calculate_total_time(
-            math.ceil(work_pct * workflow.total_subscenes),
-            self.replicas,
-            hf_time_per_subscene)
-
-        if not policy.is_disaggregated(Model.HF):
-            # Include VAE time in the same GPU when disaggregation is disabled
-            hf_vae_time_per_frame = (
-                latency_data[self.gpu_type][Model.HF_VAE, 1]  # VAE is single-device only in current policy
-                * workflow.get_resolution_scale(policy.use_upscaler)
-                / workflow.hf_frames[workflow.frames_per_step_idx]
-            )
-            self.time += _calculate_total_time(
-                math.ceil(work_pct * workflow.total_frames[Model.HF]),
-                self.replicas,
-                hf_vae_time_per_frame)
-
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-
-        if policy.is_disaggregated(Model.HF):
-            # HF for the first chunk
-            self.time_first = min(
-                # Option 1: the first few frames until the first chunk is done
-                workflow.hf_frames[0]
-                / workflow.hf_frames[workflow.frames_per_step_idx]
-                * self._calc_time_per_frame(
-                    policy,
-                    workflow,
-                    latency_data
-                ),
-                # Option 2: the full subscene
-                self._calc_time_per_subscene(
-                    policy,
-                    workflow,
-                    latency_data
-                ),
-            )
-        else:
-            # HF + VAE for the full subscene
-            hf_time_per_subscene = self._calc_time_per_subscene(
-                policy,
-                workflow,
-                latency_data)
-            hf_vae_time_per_subscene = (
-                workflow.per_subscene_frames[Model.HF]
-                / workflow.hf_frames[workflow.frames_per_step_idx]
-                * latency_data[self.gpu_type][Model.HF_VAE, 1]  # VAE is single-device only in current policy
-                * workflow.get_resolution_scale(policy.use_upscaler)
-            )
-            self.time_first = hf_time_per_subscene + hf_vae_time_per_subscene
-
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        power_hf = power_data[self.gpu_type][Model.HF, self.devices]
-        self.energy = power_hf * self.time * self.replicas
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.HF, 1)
-
-
-@register_model(Model.HF_VAE)
-class HFVAEModelAllocation(ModelAllocation):
-    """HunyuanFramePack VAE model allocation."""
-    model: ClassVar[Model] = Model.HF_VAE
-
-    def _calc_time_per_frame(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        return (
-            latency_data[self.gpu_type][Model.HF_VAE, self.devices]
-            * workflow.get_resolution_scale(policy.use_upscaler)
-            / workflow.hf_frames[workflow.frames_per_step_idx]
-        )
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if not policy.is_disaggregated(Model.HF):
-            assert self.get_num_gpus() == 0
-            self.time = 0.0
-            return self.time
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-
-        vae_time_per_frame = self._calc_time_per_frame(
-            policy,
-            workflow,
-            latency_data
-        )
-        self.time = _calculate_total_time(
-            math.ceil(workflow.total_frames[Model.HF] * work_pct),
-            self.replicas,
-            vae_time_per_frame)
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if not policy.is_disaggregated(Model.HF):
-            assert self.get_num_gpus() == 0
-            self.time_first = 0.0
-            return self.time_first
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-
-        vae_time_per_frame = self._calc_time_per_frame(
-            policy,
-            workflow,
-            latency_data,
-        )
-        num_frames = workflow.per_subscene_frames[Model.HF]
-        self.time_first = num_frames * vae_time_per_frame
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        self.energy = power_data[self.gpu_type][Model.HF_VAE, self.devices] * self.time * self.replicas
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.HF_VAE, 1)
-
-
-@register_model(Model.FT)
-class FTModelAllocation(ModelAllocation):
-    """FantasyTalking model allocation."""
-    model: ClassVar[Model] = Model.FT
-
-    def _calc_time_per_subscene(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        return (
-            workflow.per_subscene_frames[Model.FT]
-            / workflow.ft_frames[workflow.frames_per_step_idx]
-            * latency_data[self.gpu_type][Model.FT, self.devices]
-            * workflow.get_resolution_scale(policy.use_upscaler)
-            * workflow.num_steps[Model.FT]
-        )
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-
-        ft_time_per_subscene = self._calc_time_per_subscene(
-            policy,
-            workflow,
-            latency_data,
-        )
-        self.time = _calculate_total_time(
-            math.ceil(work_pct * workflow.total_subscenes),
-            self.replicas,
-            ft_time_per_subscene)
-
-        if not policy.is_disaggregated(Model.FT):
-            # Include VAE time in the same GPU when disaggregation is disabled
-            # Note: VAE latency uses devices=1 as VAE processing is not parallelized
-            # across multiple devices in the same way as the main FT diffusion
-            ft_vae_time_per_frame = (
-                latency_data[self.gpu_type][Model.FT_VAE, 1]
-                * workflow.get_resolution_scale(policy.use_upscaler)
-                / workflow.ft_frames[workflow.frames_per_step_idx]
-            )
-            self.time += _calculate_total_time(
-                math.ceil(work_pct * workflow.total_frames[Model.FT]),
-                self.replicas,
-                ft_vae_time_per_frame)
-
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-
-        ft_time_per_subscene = self._calc_time_per_subscene(
-            policy,
-            workflow,
-            latency_data,
-        )
-        self.time_first = ft_time_per_subscene
-
-        if not policy.is_disaggregated(Model.FT):
-            # Include VAE time_first when FT-VAE is not disaggregated
-            # Note: VAE latency uses devices=1 (see note in calculate_time)
-            ft_vae_time_per_subscene = (
-                workflow.per_subscene_frames[Model.FT]
-                / workflow.ft_frames[workflow.frames_per_step_idx]
-                * latency_data[self.gpu_type][Model.FT_VAE, 1]
-                * workflow.get_resolution_scale(policy.use_upscaler)
-            )
-            self.time_first += ft_vae_time_per_subscene
-
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        power_ft = power_data[self.gpu_type][Model.FT, self.devices]
-        self.energy = power_ft * self.time * self.replicas
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.FT, 1)
-
-
-@register_model(Model.FT_VAE)
-class FTVAEModelAllocation(ModelAllocation):
-    """FantasyTalking VAE model allocation."""
-    model: ClassVar[Model] = Model.FT_VAE
-
-    def _calc_time_per_frame(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        return (
-            latency_data[self.gpu_type][Model.FT_VAE, self.devices]
-            * workflow.get_resolution_scale(policy.use_upscaler)
-            / workflow.ft_frames[workflow.frames_per_step_idx]
-        )
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if not policy.is_disaggregated(Model.FT):
-            assert self.get_num_gpus() == 0
-            self.time = 0.0
-            return self.time
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-
-        vae_time_per_frame = self._calc_time_per_frame(
-            policy,
-            workflow,
-            latency_data,
-        )
-        self.time = _calculate_total_time(
-            math.ceil(workflow.total_frames[Model.FT] * work_pct),
-            self.replicas,
-            vae_time_per_frame)
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if not policy.is_disaggregated(Model.FT):
-            assert self.get_num_gpus() == 0
-            self.time_first = 0.0
-            return self.time_first
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-
-        vae_time_per_frame = self._calc_time_per_frame(
-            policy,
-            workflow,
-            latency_data,
-        )
-        num_frames = workflow.per_subscene_frames[Model.FT]
-        self.time_first = num_frames * vae_time_per_frame
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        self.energy = power_data[self.gpu_type][Model.FT_VAE, self.devices] * self.time * self.replicas
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.FT_VAE, 1)
-
-
-@register_model(Model.UPSCALER)
-class UpscalerModelAllocation(ModelAllocation):
-    """Upscaler model allocation."""
-    model: ClassVar[Model] = Model.UPSCALER
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-        self.time = _calculate_total_time(
-            math.ceil(work_pct * workflow.total_frames[Model.FT]),
-            self.replicas,
-            latency_data[self.gpu_type][self.model, self.devices])
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if not policy.use_upscaler:
-            assert self.get_num_gpus() == 0
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-
-        self.time_first = (
-            workflow.per_subscene_frames[Model.FT]
-            * latency_data[self.gpu_type][self.model, self.devices]
-        )
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        # Assumes a single device and multiple replicas
-        self.energy = power_data[self.gpu_type][self.model, self.devices] * self.time * self.replicas
-        # Idle energy
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        time_idle = total_time_s - self.time
-        if time_idle > 0:
-            self.energy += power_idle * time_idle
-        return self.energy
-
-    @override
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        return workflow.model_work.get(Model.UPSCALER, 1)
-
-
-@register_model(Model.OTHERS)
-class OthersModelAllocation(ModelAllocation):
-    """Others: Kokoro + YOLO."""
-    model: ClassVar[Model] = Model.OTHERS
-
-    @override
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time = 0.0
-            return self.time
-
-        self.time = (
-            workflow.total_scenes
-            * latency_data[self.gpu_type][self.model, self.devices]
-        )
-        return self.time
-
-    @override
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        if self.get_num_gpus() == 0:
-            self.time_first = 0.0
-            return self.time_first
-
-        self.time_first = latency_data[self.gpu_type][self.model, self.devices]
-        return self.time_first
-
-    @override
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        if self.get_num_gpus() == 0 or power_data is None:
-            self.energy = 0.0
-            return self.energy
-        # Idle energy; not much GPU usage
-        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
-        self.energy = power_idle * self.time
-        return self.energy
diff --git a/simulator/multirequests.py b/simulator/multirequests.py
index 4fee5d55..82957c8f 100644
--- a/simulator/multirequests.py
+++ b/simulator/multirequests.py
@@ -4,23 +4,23 @@
 import os
 from dataclasses import replace
 
-from sim_types import GPUType
-from sim_types import Model
-from sim_types import QualityLevel
-from sim_types import RESOLUTION_PIXELS
-from sim_types import Result
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
+from model_provisioner.sim_types import GPUType
+from model_provisioner.sim_types import Model
+from model_provisioner.sim_types import QualityLevel
+from model_provisioner.sim_types import RESOLUTION_PIXELS
+from model_provisioner.sim_types import Result
+from model_provisioner.sim_types import WorkflowConfig
+from model_provisioner.sim_types import LatencyData
 
-from data_loading import load_latency_data
-from data_loading import load_power_data
-from data_loading import load_adaptive_quality_data
+from model_provisioner.data_loading import load_latency_data
+from model_provisioner.data_loading import load_power_data
+from model_provisioner.data_loading import load_adaptive_quality_data
 
-from workflows import PODCAST_WORKFLOW
+from model_provisioner.workflows import PODCAST_WORKFLOW
 
-from policies import STREAMWISE_POLICY
+from model_provisioner.policies import STREAMWISE_POLICY
 
-from auto_model_allocator import AutoModelAllocator
+from model_provisioner.auto_model_allocator import AutoModelAllocator
 
 
 # Queries per minute
diff --git a/simulator/naive_baseline.py b/simulator/naive_baseline.py
deleted file mode 100644
index 9f9c550c..00000000
--- a/simulator/naive_baseline.py
+++ /dev/null
@@ -1,484 +0,0 @@
-"""
-Naive baseline for the StreamWise workflow allocation problem.
-"""
-
-from __future__ import annotations
-
-from typing import Optional
-
-from constants import NUM_GPUS_PER_SERVER
-from constants import DEVICE_OPTIONS
-
-from sim_types import Result
-from sim_types import GPUType
-from sim_types import WorkflowConfig
-from sim_types import LatencyData
-from sim_types import PowerData
-from sim_types import Policy
-from sim_types import Solver
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import Objective
-
-from models import FluxModelAllocation
-from models import GemmaModelAllocation
-from models import HFModelAllocation
-from models import HFVAEModelAllocation
-from models import FTModelAllocation
-from models import FTVAEModelAllocation
-from models import UpscalerModelAllocation
-from models import OthersModelAllocation
-
-from evaluator import evaluate_model_allocation
-
-from policies import NAIVE_POLICY
-from policies import MAX_DEVICES
-
-from model_allocator import ModelAllocator
-
-
-class NaiveAllocator(ModelAllocator):
-    """
-    Naive allocator that implements a simple heuristic.
-    """
-    def __init__(
-        self,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        policy: Policy = NAIVE_POLICY,
-    ) -> None:
-        super().__init__(
-            workflow,
-            latency_data,
-            power_data,
-            policy,
-        )
-        assert self.policy.solver == Solver.NAIVE
-        assert self.policy.objective == Objective.TTFF
-
-    def allocate(
-        self,
-        num_gpus: dict[GPUType, int],
-        verbose: bool = False,
-    ) -> Result:
-        total_gpus = sum(num_gpus.values())
-        assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})"
-
-        gpu_types = [
-            gpu_type
-            for gpu_type, count in num_gpus.items()
-            if count > 0
-        ]
-        assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})"
-        gpu_type1 = gpu_types[0]
-
-        if len(gpu_types) == 1:
-            models = self._naive_single(
-                num_gpus.get(gpu_type1, 0),
-                gpu_type=gpu_type1,
-            )
-        else:
-            # Mixed setup of GPU types (e.g., A100 and H100)
-            models = self._naive_two(num_gpus)
-
-        result = evaluate_model_allocation(
-            models=models,
-            num_gpus=num_gpus,
-            workflow=self.workflow,
-            latency_data=self.latency_data,
-            power_data=self.power_data,
-            policy=self.policy,
-            round_up_cost_to_server=True,
-        )
-        return result
-
-    def _naive_single(
-        self,
-        num_gpus: int,
-        gpu_type: GPUType,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """Naive allocation for single GPU type."""
-        return self._naive_parallelism_allocation(gpu_type, num_gpus)
-
-    def _naive_two(
-        self,
-        num_gpus: dict[GPUType, int],
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """Naive allocation for two GPU types."""
-        gpu_types = list(num_gpus.keys())
-        assert len(gpu_types) == 2
-        assert len(num_gpus) == 2
-        gpu_type1 = gpu_types[0]
-        gpu_type2 = gpu_types[1]
-        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
-        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]
-
-        # Initialize allocations with minimal setup
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
-            gpu_type1: {  # 3 x A100s (type1)
-                Model.GEMMA: [GemmaModelAllocation(
-                    gpu_type=gpu_type1,
-                    devices=1, replicas=1)],
-                Model.FLUX: [FluxModelAllocation(
-                    gpu_type=gpu_type1,
-                    devices=1, replicas=1)],
-                Model.HF: [],
-                Model.HF_VAE: [],
-                Model.FT: [],
-                Model.FT_VAE: [],
-                Model.UPSCALER: [],
-                Model.OTHERS: [OthersModelAllocation(
-                    gpu_type=gpu_type1,
-                    devices=1, replicas=1)],  # + 1 for Kokoro/YOLO
-            },
-            gpu_type2: {  # 4 (+1) X H100 GPUs (type2)
-                Model.GEMMA: [],
-                Model.FLUX: [],
-                Model.HF: [HFModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=1, replicas=1)],
-                Model.HF_VAE: [HFVAEModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=1, replicas=1)],
-                Model.FT: [FTModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=2, replicas=1)],
-                Model.FT_VAE: [FTVAEModelAllocation(
-                    gpu_type=gpu_type2,
-                    devices=1, replicas=1)],
-                Model.UPSCALER: [UpscalerModelAllocation(
-                    gpu_type=gpu_type2)],
-                Model.OTHERS: [],
-            },
-        }
-
-        # Calculate remaining: starting - assigned
-        if not self.policy.is_disaggregated(Model.HF):
-            models[gpu_type2][Model.HF][0].replicas = 2
-            models[gpu_type2][Model.HF_VAE][0].replicas = 0
-        if not self.policy.is_disaggregated(Model.FT):
-            models[gpu_type2][Model.FT_VAE][0].replicas = 0
-
-        if self.policy.use_upscaler:
-            models[gpu_type2][Model.UPSCALER][0].replicas = 1
-
-        models_gpu_type1 = self._naive_parallelism_allocation(
-            gpu_type1,
-            num_gpus.get(gpu_type1, 0),
-        )
-        models_gpu_type2 = self._naive_parallelism_allocation(
-            gpu_type2,
-            num_gpus.get(gpu_type2, 0),
-            # Already allocated in first GPU type
-            skip_non_paralelizable_models=True,
-        )
-        models[gpu_type1] = models_gpu_type1[gpu_type1]
-        models[gpu_type2] = models_gpu_type2[gpu_type2]
-
-        # Apply per-GPU-type overrides after allocation
-        if self.policy.use_upscaler:
-            models[gpu_type2][Model.UPSCALER][0].replicas = 1
-
-        return models
-
-    def _naive_parallelism_allocation(
-        self,
-        gpu_type: GPUType,
-        num_devices: int,
-        skip_non_paralelizable_models: bool = False,
-    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-        """
-        Device allocation for naive parallelism.
-        Max devices for each model.
-        Allocate devices to each model proportional to their max devices.
-        """
-        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
-            gpu_type: {
-                Model.GEMMA: [GemmaModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1)],
-                Model.FLUX: [FluxModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1)],
-                Model.HF: [HFModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1)],
-                Model.HF_VAE: [HFVAEModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1 if self.policy.is_disaggregated(Model.HF) else 0)],
-                Model.FT: [FTModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=4)],
-                Model.FT_VAE: [FTVAEModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1 if self.policy.is_disaggregated(Model.FT) else 0)],
-                Model.OTHERS: [OthersModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1)],  # + 1 for Kokoro/YOLO
-                Model.UPSCALER: [UpscalerModelAllocation(
-                    gpu_type=gpu_type,
-                    replicas=1 if self.policy.use_upscaler else 0)],
-            },
-        }
-
-        # Zero out replicas for models not in workflow
-        for model in Model:
-            if model not in self.workflow.models:
-                for alloc in models[gpu_type][model]:
-                    alloc.replicas = 0
-
-        # Zero out replicas for models that are not parallelizable when skip_non_paralelizable_models is True
-        if skip_non_paralelizable_models:
-            for model in Model:
-                if not self.workflow.is_parallelizable(model):
-                    for alloc in models[gpu_type][model]:
-                        alloc.replicas = 0
-
-        # Assert only 1 allocation instance per model for naive parallelism
-        for model in Model:
-            assert len(models[gpu_type][model]) == 1, \
-                f"Expected only 1 allocation instance for {model}, got {len(models[gpu_type][model])}"
-
-        alloc_id = 0
-        model_gemma = models[gpu_type][Model.GEMMA][alloc_id]
-        model_flux = models[gpu_type][Model.FLUX][alloc_id]
-        model_hf = models[gpu_type][Model.HF][alloc_id]
-        model_vae = models[gpu_type][Model.HF_VAE][alloc_id]
-        model_ft = models[gpu_type][Model.FT][alloc_id]
-        model_ft_vae = models[gpu_type][Model.FT_VAE][alloc_id]
-        model_upscaler = models[gpu_type][Model.UPSCALER][alloc_id]
-
-        # TODO do we need to do something for Model.OTHERS
-
-        if num_devices == 8:
-            # single server case, use fixed allocation
-            if Model.FT in self.workflow.models:
-                model_ft.replicas = 4
-            if self.policy.use_upscaler and Model.UPSCALER in self.workflow.models:
-                model_upscaler.replicas = 1
-                if Model.FT in self.workflow.models:
-                    model_ft.replicas -= 1
-            if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models:
-                model_vae.replicas = 1
-                if Model.FT in self.workflow.models:
-                    model_ft.replicas -= 1
-            if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models:
-                model_ft_vae.replicas = 1
-                if Model.FT in self.workflow.models:
-                    model_ft.replicas -= 1
-            return models
-
-        init_num_devices = sum([
-            model[0].devices * model[0].replicas
-            for model in models[gpu_type].values()
-        ])
-
-        # Allocate devices proportional to each model's max devices
-        max_devices = MAX_DEVICES
-        models_in_workflow = [
-            model
-            for model in max_devices.keys()
-            if model in self.workflow.models
-        ]
-        if skip_non_paralelizable_models:
-            for model in max_devices.keys():
-                if not self.workflow.is_parallelizable(model):
-                    models_in_workflow.remove(model)
-
-        total_max_devices = sum([
-            max_devices[model]
-            for model in models_in_workflow
-        ])
-        for model in models_in_workflow:
-            # Calculate the number of devices to allocate for the model, proportional to its max devices among others
-            alloc_devices = int((num_devices - init_num_devices) * max_devices[model] / total_max_devices)
-            if model == Model.GEMMA:
-                max_devices_gemma = max_devices[Model.GEMMA]
-                if self.latency_data:
-                    max_devices_gemma = min(max_devices_gemma, self.latency_data[gpu_type].get_max_parallelism(model))
-                model_gemma.devices += min(alloc_devices, max_devices_gemma)
-                # Round down nearest in DEVICE_OPTIONS_GEMMA
-                num_gemma_devices = max([
-                    d
-                    for d in DEVICE_OPTIONS[Model.GEMMA]
-                    if d <= model_gemma.devices
-                ])
-                model_gemma.devices = num_gemma_devices
-            elif model == Model.FLUX:
-                max_devices_flux = max_devices[Model.FLUX]
-                if self.latency_data:
-                    max_devices_flux = min(max_devices_flux, self.latency_data[gpu_type].get_max_parallelism(model))
-                model_flux.devices += min(alloc_devices, max_devices_flux)
-                # Round down nearest in DEVICE_OPTIONS_FLUX
-                model_flux.devices = max([
-                    d
-                    for d in DEVICE_OPTIONS[Model.FLUX]
-                    if d <= model_flux.devices
-                ])
-            elif model == Model.HF:
-                max_devices_hf = max_devices[Model.HF]
-                if self.latency_data:
-                    max_devices_hf = min(max_devices_hf, self.latency_data[gpu_type].get_max_parallelism(model))
-                model_hf.replicas += min(alloc_devices, max_devices_hf)
-            elif model == Model.HF_VAE:
-                if self.policy.is_disaggregated(Model.HF):
-                    max_devices_vae = max_devices[Model.HF_VAE]
-                    if self.latency_data:
-                        max_devices_vae = min(max_devices_vae, self.latency_data[gpu_type].get_max_parallelism(model))
-                    model_vae.replicas += min(alloc_devices, max_devices_vae)
-            elif model == Model.FT:
-                max_devices_ft = max_devices[Model.FT]
-                if self.latency_data:
-                    max_devices_ft = min(max_devices_ft, self.latency_data[gpu_type].get_max_parallelism(model))
-                model_ft.replicas += min(alloc_devices, max_devices_ft)
-            elif model == Model.FT_VAE:
-                if self.policy.is_disaggregated(Model.FT):
-                    max_devices_ft_vae = max_devices[Model.FT_VAE]
-                    if self.latency_data:
-                        max_devices_ft_vae = min(
-                            max_devices_ft_vae, self.latency_data[gpu_type].get_max_parallelism(model)
-                        )
-                    model_ft_vae.replicas += min(alloc_devices, max_devices_ft_vae)
-            else:
-                raise ValueError(f"Unrecognized model {model}")
-
-        remaining_devices = num_devices
-        for model_name in models[gpu_type].keys():
-            for model_alloc in models[gpu_type][model_name]:
-                remaining_devices -= model_alloc.get_num_gpus()
-
-        # Distribute remaining devices to parallelizable models
-        distribute_models = self.workflow.filter_parallelizable_models(
-            models_in_workflow,
-            disaggregation=self.policy.disaggregation,
-        )
-        # Prioritise models that already hold more GPUs
-        distribute_models.sort(
-            key=lambda m: models[gpu_type][m][alloc_id].get_num_gpus(),
-            reverse=True,
-        )
-        num_distribute = len(distribute_models)
-        if num_distribute > 0 and remaining_devices > 0:
-            made_progress = True
-            while remaining_devices > 0 and made_progress:
-                made_progress = False
-                for model_name in distribute_models:
-                    gpus_per_replica = models[gpu_type][model_name][alloc_id].devices
-                    if gpus_per_replica <= 0 or remaining_devices < gpus_per_replica:
-                        continue
-                    models[gpu_type][model_name][alloc_id].replicas += 1
-                    remaining_devices -= gpus_per_replica
-                    made_progress = True
-                    if remaining_devices <= 0:
-                        break
-
-        remaining_devices = num_devices
-        for model_name in models[gpu_type].keys():
-            for model_alloc in models[gpu_type][model_name]:
-                remaining_devices -= model_alloc.get_num_gpus()
-
-        # TODO we should try to assign all resources
-        # assert remaining_devices == 0, \
-        assert remaining_devices >= 0, \
-            f"remaining={remaining_devices} != 0: " \
-            f"gpu={gpu_type.value} total={num_devices} remaining={remaining_devices}"
-
-        # Update replicas based on total devices
-        # Gemma (when parallelizable)
-        if self.workflow.is_parallelizable(Model.GEMMA) and Model.GEMMA in models_in_workflow:
-            model_gemma.devices, model_gemma.replicas, remaining_devices = _calculate_naive_num_devices(
-                model_gemma.devices,
-                model_gemma.replicas,
-                remaining_devices,
-                device_options=DEVICE_OPTIONS[Model.GEMMA],
-                replica_upper_bound=self.workflow.total_scenes)
-
-        # Flux (when parallelizable)
-        if self.workflow.is_parallelizable(Model.FLUX) and Model.FLUX in models_in_workflow:
-            model_flux.devices, model_flux.replicas, remaining_devices = _calculate_naive_num_devices(
-                model_flux.devices,
-                model_flux.replicas,
-                remaining_devices,
-                device_options=DEVICE_OPTIONS[Model.FLUX],
-                replica_upper_bound=self.workflow.total_scenes)
-
-        # Hunyuan FramePack
-        if Model.HF in self.workflow.models:
-            model_hf.devices, model_hf.replicas, remaining_devices = _calculate_naive_num_devices(
-                model_hf.devices,
-                model_hf.replicas,
-                remaining_devices,
-                device_options=DEVICE_OPTIONS[Model.HF],
-                replica_upper_bound=self.workflow.total_scenes)
-
-        # Hunyuan FramePack VAE
-        if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models:
-            model_vae.devices, model_vae.replicas, remaining_devices = _calculate_naive_num_devices(
-                model_vae.devices,
-                model_vae.replicas,
-                remaining_devices,
-                device_options=None,
-                replica_upper_bound=self.workflow.total_frames[Model.HF],
-            )
-
-        # Fantasy Talking
-        if Model.FT in self.workflow.models:
-            model_ft.devices, model_ft.replicas, remaining_devices = _calculate_naive_num_devices(
-                model_ft.devices,
-                model_ft.replicas,
-                remaining_devices,
-                device_options=DEVICE_OPTIONS[Model.FT],
-                replica_upper_bound=self.workflow.total_subscenes,
-            )
-
-        # Fantasy Talking VAE
-        if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models:
-            model_ft_vae.devices, model_ft_vae.replicas, remaining_devices = _calculate_naive_num_devices(
-                model_ft_vae.devices,
-                model_ft_vae.replicas,
-                remaining_devices,
-                device_options=None,
-                replica_upper_bound=self.workflow.total_frames[Model.FT],
-            )
-
-        return models
-
-
-def _calculate_naive_num_devices(
-    num_devices: int,
-    num_replicas: int,
-    remaining_devices: int,
-    device_options: Optional[list[int]] = [1],
-    replica_upper_bound: Optional[int] = None,
-) -> tuple[int, int, int]:
-    """Find the parallelism that maximizes the device usage."""
-    assert remaining_devices >= 0
-
-    model_quota = num_devices * num_replicas
-
-    if device_options:
-        best_product = 0
-        best_devices_per_replica = 1
-        best_replicas = 1
-        for devices_per_replica in device_options:
-            if devices_per_replica > model_quota:
-                continue
-            max_replicas = model_quota // devices_per_replica
-            if replica_upper_bound and max_replicas > replica_upper_bound:
-                max_replicas = replica_upper_bound
-            product = devices_per_replica * max_replicas
-            if product > best_product:
-                best_product = product
-                best_devices_per_replica = devices_per_replica
-                best_replicas = max_replicas
-    else:
-        # start with parallelism=1 instead
-        best_devices_per_replica = 1
-        best_replicas = model_quota
-
-    num_devices = best_devices_per_replica
-    num_replicas = best_replicas
-    remaining_devices += model_quota - num_replicas * num_devices
-
-    return num_devices, num_replicas, remaining_devices
diff --git a/simulator/plot_utils.py b/simulator/plot_utils.py
index 4b0d5849..2ec13de9 100644
--- a/simulator/plot_utils.py
+++ b/simulator/plot_utils.py
@@ -10,12 +10,12 @@
 
 from typing import Optional
 
-from utils import get_pareto_frontier
+from model_provisioner.utils import get_pareto_frontier
 
-from sim_types import ProvisioningResult
-from sim_types import GPUType
-from sim_types import Model
-from sim_types import QualityLevel
+from model_provisioner.sim_types import ProvisioningResult
+from model_provisioner.sim_types import GPUType
+from model_provisioner.sim_types import Model
+from model_provisioner.sim_types import QualityLevel
 
 
 FIG_SIZE = (7, 5)
diff --git a/simulator/policies.py b/simulator/policies.py
deleted file mode 100644
index 3f670f93..00000000
--- a/simulator/policies.py
+++ /dev/null
@@ -1,252 +0,0 @@
-from __future__ import annotations
-
-from sim_types import Objective
-from sim_types import Policy
-from sim_types import GPUType
-from sim_types import Model
-from sim_types import Solver
-
-from constants import GPU_RESERVED_COST
-from constants import GPU_SPOT_COST
-
-
-# Max devices for each model
-# the logic is to allocate devices to each model proportional to their max devices
-MAX_DEVICES = {
-    Model.GEMMA: 8,
-    Model.FLUX: 16,
-    Model.HF: 40,
-    Model.HF_VAE: 1,
-    Model.FT: 40,
-    Model.FT_VAE: 1,
-}
-
-# Max iterations for the optimization loop to prevent infinite loops in case of non-monotonic allocators or other issues
-MAX_ITERATIONS = 100
-
-# Set to True if we want to use up all GPUs if there's no further improvements in the greedy optimization loop
-USE_ALL_GPUS = True
-
-# Default StreamWise policy configuration
-# TODO: Add a meta policy that picks the best among disaggregation options for HF/FT
-STREAMWISE_POLICY = Policy(
-    name="streamwise",
-    gpu_cost=GPU_SPOT_COST,
-    objective=Objective.TTFF_COST,
-    disaggregation={
-        Model.HF: True,
-        Model.FT: False,
-    },
-    use_upscaler=True,
-    hardware=list(GPUType),
-)
-
-STREAMWISE_MILP_POLICY = Policy(
-    name="streamwise",
-    gpu_cost=GPU_SPOT_COST,
-    objective=Objective.TTFF_COST,
-    disaggregation={
-        Model.HF: True,
-        Model.FT: False,
-    },
-    use_upscaler=True,
-    hardware=list(GPUType),
-    solver=Solver.GUROBI,
-)
-
-
-"""
-HexGen policy configuration.
-"""
-HEXGEN_POLICY = Policy(
-    name="hexgen",
-    gpu_cost=GPU_RESERVED_COST,
-    objective=Objective.TTFF,  # Does not account for cost
-    disaggregation={
-        Model.HF: True,
-        Model.FT: False,
-    },  # Dissagregation
-    use_upscaler=False,
-    hardware=[  # Multiple hardware
-        GPUType.A100,
-        GPUType.H100,
-        GPUType.H200,
-        GPUType.GB200,
-    ],
-    solver=Solver.HEXGEN,
-)
-
-
-"""
-Helix policy configuration.
-Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
-Optimizes models one-by-one following MODEL_ORDER using MILP.
-"""
-HELIX_POLICY = Policy(
-    name="helix",
-    gpu_cost=GPU_RESERVED_COST,
-    objective=Objective.TTFF,  # Does not account for cost
-    disaggregation={
-        Model.HF: True,
-        Model.FT: False,
-    },
-    use_upscaler=False,
-    hardware=list(GPUType),
-    solver=Solver.HELIX,
-)
-
-
-"""
-DDiT policy configuration.
-Reference: https://arxiv.org/html/2506.13497v1
-"""
-DDIT_POLICY = Policy(
-    name="ddit",
-    gpu_cost=GPU_RESERVED_COST,
-    objective=Objective.TTFF,
-    disaggregation={
-        Model.HF: True,
-        Model.FT: False,
-    },
-    use_upscaler=False,
-    hardware=list(GPUType),
-    solver=Solver.NAIVE,
-)
-
-
-STREAMWISE_ENERGY_POLICY = Policy(
-    name="streamwise energy",
-    gpu_cost=GPU_SPOT_COST,
-    objective=Objective.TIME_ENERGY,
-    disaggregation={
-        Model.HF: True,
-        Model.FT: False,
-    },
-    use_upscaler=True,
-    hardware=list(GPUType),
-)
-
-NAIVE_POLICY = Policy(
-    name="naive",
-    gpu_cost=GPU_RESERVED_COST,
-    objective=Objective.TTFF,
-    disaggregation={},
-    use_upscaler=False,
-    hardware=[GPUType.A100],
-    solver=Solver.NAIVE,
-)
-
-
-BASELINE_POLICIES = {
-    "naive": NAIVE_POLICY,
-    "naive disag": Policy(
-        "naive disag",
-        gpu_cost=GPU_RESERVED_COST,
-        objective=Objective.TTFF,
-        disaggregation={
-            Model.HF: True,
-            Model.FT: True,
-        },
-        use_upscaler=False,
-        hardware=[GPUType.A100],
-        solver=Solver.NAIVE,
-    ),
-    "naive upscaler": Policy(
-        "naive upscaler",
-        gpu_cost=GPU_RESERVED_COST,
-        objective=Objective.TTFF,
-        disaggregation={},
-        use_upscaler=True,  # Changed to True
-        hardware=[GPUType.A100],
-        solver=Solver.NAIVE,
-    ),
-    "naive spot": Policy(
-        "naive spot",
-        gpu_cost=GPU_SPOT_COST,  # Changed to SPOT_COST
-        objective=Objective.TTFF,
-        disaggregation={},
-        use_upscaler=False,
-        hardware=[GPUType.A100],
-        solver=Solver.NAIVE,
-    ),
-    "naive ttff*cost allocator": Policy(
-        "naive ttff*cost allocator",
-        GPU_RESERVED_COST,
-        objective=Objective.TTFF_COST,  # Changed to TTFF_COST
-        disaggregation={},
-        use_upscaler=False,
-        hardware=[GPUType.A100],
-        solver=Solver.GREEDY,
-    ),
-    "naive hardware": Policy(
-        "naive hardware",
-        GPU_RESERVED_COST,
-        objective=Objective.TTFF,
-        disaggregation={},
-        use_upscaler=False,
-        hardware=list(GPUType),  # Changed hardware
-        solver=Solver.NAIVE,
-    ),
-}
-
-
-STREAMWISE_POLICIES = {
-    "streamwise": STREAMWISE_POLICY,
-    "streamwise no disag": Policy(
-        name="streamwise no disag",
-        gpu_cost=GPU_SPOT_COST,
-        objective=Objective.TTFF_COST,
-        disaggregation={},
-        use_upscaler=True,
-        hardware=list(GPUType),
-        solver=Solver.GREEDY,
-    ),
-    "streamwise no upscaler": Policy(
-        name="streamwise no upscaler",
-        gpu_cost=GPU_SPOT_COST,
-        objective=Objective.TTFF_COST,
-        disaggregation={
-            Model.HF: True,
-            Model.FT: False,
-        },
-        use_upscaler=False,
-        hardware=list(GPUType),
-        solver=Solver.GREEDY,
-    ),
-    "streamwise no spot": Policy(
-        name="streamwise no spot",
-        gpu_cost=GPU_RESERVED_COST,
-        objective=Objective.TTFF_COST,
-        disaggregation={
-            Model.HF: True,
-            Model.FT: False,
-        },
-        use_upscaler=True,
-        hardware=list(GPUType),
-        solver=Solver.GREEDY,
-    ),
-    "streamwise naive allocator": Policy(
-        name="streamwise naive allocator",
-        gpu_cost=GPU_SPOT_COST,
-        objective=Objective.TTFF,
-        disaggregation={
-            Model.HF: True,
-            Model.FT: False,
-        },
-        use_upscaler=True,
-        hardware=list(GPUType),
-        solver=Solver.NAIVE,
-    ),
-    "streamwise A100": Policy(
-        name="streamwise single hardware",
-        gpu_cost=GPU_SPOT_COST,
-        objective=Objective.TTFF_COST,
-        disaggregation={
-            Model.HF: True,
-            Model.FT: False,
-        },
-        use_upscaler=True,
-        hardware=[GPUType.A100],
-        solver=Solver.NAIVE,
-    ),
-}
diff --git a/simulator/provisioning.py b/simulator/provisioning.py
index 43612b53..26e9c8a9 100644
--- a/simulator/provisioning.py
+++ b/simulator/provisioning.py
@@ -3,6 +3,18 @@
 """
 from __future__ import annotations
 
+import os
+import sys
+
+# Ensure the repo root, streamwise/ and simulator/ are on sys.path so that
+# model_provisioner imports work in child processes spawned by ProcessPoolExecutor.
+_REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
+_STREAMWISE_DIR = os.path.join(_REPO_ROOT, "streamwise")
+_SIMULATOR_DIR = os.path.dirname(os.path.abspath(__file__))
+for _p in (_REPO_ROOT, _STREAMWISE_DIR, _SIMULATOR_DIR):
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
+
 from tqdm.auto import tqdm
 
 import logging
@@ -18,24 +30,24 @@
 from concurrent.futures import TimeoutError
 from concurrent.futures import as_completed
 
-from sim_types import WorkflowConfig
-from sim_types import GPUType
-from sim_types import LatencyData
-from sim_types import Provision
-from sim_types import ProvisioningResult
-from sim_types import Model
-from sim_types import ModelAllocation
-from sim_types import PowerData
-from sim_types import QualityLevel
-from sim_types import Policy
-from sim_types import Result
-from sim_types import num_gpus_to_str
+from model_provisioner.sim_types import WorkflowConfig
+from model_provisioner.sim_types import GPUType
+from model_provisioner.sim_types import LatencyData
+from model_provisioner.sim_types import Provision
+from model_provisioner.sim_types import ProvisioningResult
+from model_provisioner.sim_types import Model
+from model_provisioner.sim_types import ModelAllocation
+from model_provisioner.sim_types import PowerData
+from model_provisioner.sim_types import QualityLevel
+from model_provisioner.sim_types import Policy
+from model_provisioner.sim_types import Result
+from model_provisioner.sim_types import num_gpus_to_str
 
-from auto_model_allocator import AutoModelAllocator
+from model_provisioner.auto_model_allocator import AutoModelAllocator
 
-from policies import STREAMWISE_POLICY
+from model_provisioner.policies import STREAMWISE_POLICY
 
-from constants import SECONDS_IN_HOUR
+from model_provisioner.constants import SECONDS_IN_HOUR
 
 
 GPU_PROVISIONS: list[int] = [
diff --git a/simulator/sim_types.py b/simulator/sim_types.py
deleted file mode 100644
index a83cec22..00000000
--- a/simulator/sim_types.py
+++ /dev/null
@@ -1,796 +0,0 @@
-from __future__ import annotations
-
-import pandas as pd
-import numpy as np
-
-from typing import Optional
-from typing import ClassVar
-
-from abc import ABC
-from abc import abstractmethod
-
-from dataclasses import dataclass
-from dataclasses import field
-
-from enum import Enum
-
-
-class GPUType(Enum):
-    A100 = "A100"
-    H100 = "H100"
-    H200 = "H200"
-    GB200 = "GB200"
-
-    def __lt__(self, other: object) -> bool:
-        if not isinstance(other, GPUType):
-            return NotImplemented
-        order = [GPUType.A100, GPUType.H100, GPUType.H200, GPUType.GB200]
-        return order.index(self) < order.index(other)
-
-
-class QualityLevel(Enum):
-    ORIGINAL = "original"
-    HIGH = "high"
-    MEDIUM = "medium"
-    LOW = "low"
-
-
-# Pixel counts per quality level (16:10 aspect ratio).
-# Latency data is profiled at MEDIUM resolution.
-RESOLUTION_PIXELS: dict[QualityLevel, int] = {
-    QualityLevel.HIGH: 1280 * 800,
-    QualityLevel.MEDIUM: 640 * 400,
-    QualityLevel.LOW: 320 * 200,
-}
-
-
-class Model(Enum):
-    GEMMA = "gemma"
-    FLUX = "flux"
-    HF = "hf"  # HunyuanFramePack
-    HF_VAE = "hf_vae"  # HunyuanFramePack VAE
-    FT = "ft"  # FantasyTalking
-    FT_VAE = "ft_vae"  # FantasyTalking VAE
-    UPSCALER = "upscaler"
-    OTHERS = "others"  # YOLO + Kokoro
-
-
-# Used for FIFO
-MODEL_ORDER: dict[Model, int] = {
-    Model.GEMMA: 0,
-    Model.FLUX: 1,
-    Model.OTHERS: 2,
-    Model.HF: 3,
-    Model.HF_VAE: 4,
-    Model.FT: 5,
-    Model.FT_VAE: 6,
-    Model.UPSCALER: 7,
-}
-
-
-@dataclass
-class ModelAllocation(ABC):
-    model: ClassVar[Model]
-
-    # policy TODO
-    # workflow TODO
-    gpu_type: GPUType
-    devices: int = 1
-    replicas: int = 0  # No replicas by default
-    work: int = 0
-    time: float = 0.0
-    time_first: float = 0.0
-    energy: float = 0.0
-    cost: float = 0.0
-
-    def __str__(self) -> str:
-        if self.replicas <= 0:
-            assert self.time == 0.0, f"time must be 0 when no replicas, got {self.time:.2f}"
-            assert self.energy == 0.0, f"energy must be 0 when no replicas, got {self.energy:.2f}"
-            return "--"
-        return \
-            f"devices={self.devices:2d}, " \
-            f"replicas={self.replicas}, " \
-            f"work={self.work}, " \
-            f"time={self.time:.2f} secs, " \
-            f"time_first={self.time_first:.2f} secs, " \
-            f"energy={self.energy / 60.0 / 60.0:.2f} Wh, " \
-            f"cost=${self.cost:.2f}"
-
-    def __repr__(self) -> str:
-        return self.__str__()
-
-    def __post_init__(self) -> None:
-        if self.replicas > 0:
-            return
-        if self.time != 0.0 or self.energy != 0.0:
-            raise ValueError(
-                f"time and energy must be 0.0 when no replicas, got time={self.time:.2f}, energy={self.energy:.2f}")
-
-    def get_num_gpus(self) -> int:
-        if self.replicas <= 0:
-            return 0
-        return self.devices * self.replicas
-
-    def disable(self) -> None:
-        self.devices = 0
-        self.replicas = 0
-        self.time = 0.0
-        self.time_first = 0.0
-        self.energy = 0.0
-
-    @abstractmethod
-    def calculate_time(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        work_pct: float = 1.0,
-    ) -> float:
-        ...
-
-    @abstractmethod
-    def calculate_time_first(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-    ) -> float:
-        ...
-
-    @abstractmethod
-    def calculate_energy(
-        self,
-        workflow: WorkflowConfig,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-    ) -> float:
-        ...
-
-    def calculate_cost(
-        self,
-        policy: Policy,
-        total_time_s: float = 0.0,
-    ) -> float:
-        """Calculate the cost for this model allocation."""
-        SECONDS_IN_HOUR = 60 * 60
-        gpu_cost = policy.gpu_cost[self.gpu_type]
-        self.cost = total_time_s * (self.get_num_gpus() * gpu_cost) / SECONDS_IN_HOUR
-        return self.cost
-
-    def calculate(
-        self,
-        policy: Policy,
-        workflow: WorkflowConfig,
-        latency_data: LatencyData,
-        power_data: Optional[PowerData] = None,
-        total_time_s: float = 0.0,
-        work_pct: float = 1.0,
-    ) -> None:
-        """Calculate all the values for this model allocation."""
-        self.calculate_time(policy, workflow, latency_data, work_pct)
-        self.calculate_time_first(policy, workflow, latency_data)
-        self.calculate_cost(policy, total_time_s)
-        self.calculate_energy(workflow, power_data, total_time_s)
-
-    def get_max_replicas(
-        self,
-        workflow: WorkflowConfig,
-    ) -> int:
-        """Get the maximum number of replicas that can leverage parallelism."""
-        return 1
-
-
-class Objective(Enum):
-    FIFO = "fifo"
-    TIME = "time"
-    TTFF = "ttff"
-    COST = "cost"
-    ENERGY = "energy"
-    TIME_COST = "time_cost"
-    TTFF_COST = "ttff_cost"
-    ENERGY_COST = "energy_cost"
-    TIME_ENERGY = "time_energy"
-    RANDOM = "random"
-    NONE = "none"
-
-    TTFF_THEN_TIME = "ttff_then_time"  # first minimize ttff, then minimize time
-
-    def is_monotonic(self) -> bool:
-        return self not in {Objective.RANDOM, Objective.FIFO}
-
-
-@dataclass
-class WorkflowConfig:
-    total_video_seconds: int
-    total_scenes: int
-    total_frames: dict[Model, int]
-    total_subscenes: int
-    per_subscene_frames: dict[Model, int]
-    # default per-frame number of denoising steps
-    num_steps: dict[Model, int]
-    # supported number of generation frames
-    hf_frames: list[int]
-    ft_frames: list[int]
-    frames_per_step_idx: int
-    # target output resolution (default: HIGH)
-    target_resolution: QualityLevel = QualityLevel.HIGH
-
-    # total input tokens
-    total_input_tokens: int = 0
-
-    # work per model (determines parallelism; work > 1 means parallelizable across replicas)
-    # models included in the workflow are derived from the keys of this dict
-    model_work: dict[Model, int] = field(default_factory=dict)
-
-    @property
-    def models(self) -> list[Model]:
-        """Models included in the workflow (derived from model_work keys)."""
-        return list(self.model_work.keys())
-
-    @property
-    def work(self) -> dict[Model, int]:
-        """Units of work per model (0 for models not in the workflow)."""
-        return {
-            model_name: self.model_work.get(model_name, 0)
-            for model_name in Model
-        }
-
-    def get_model_order(self) -> list[Model]:
-        """Get ordered list of models in the workflow, sorted by MODEL_ORDER."""
-        return sorted(
-            [m for m in self.models if m in MODEL_ORDER],
-            key=lambda m: MODEL_ORDER[m],
-        )
-
-    def get_resolution_scale(self, use_upscaler: bool) -> float:
-        """Compute latency scaling factor based on target resolution.
-
-        Latency data is profiled at MEDIUM resolution.  The scale factor
-        adjusts for the actual generation resolution:
-
-        1. Upscaler used, HIGH   → 1.0 (models generate at MEDIUM)
-        2. Upscaler used, MEDIUM → LOW / MEDIUM (models generate at LOW)
-        3. No upscaler, HIGH     → HIGH / MEDIUM  (scale up)
-        4. No upscaler, MEDIUM   → 1.0
-        5. No upscaler, LOW      → LOW / MEDIUM   (scale down)
-        """
-        if use_upscaler:
-            assert self.target_resolution in (QualityLevel.HIGH, QualityLevel.MEDIUM), \
-                "Upscaler can only be used when target resolution is HIGH or MEDIUM"
-            if self.target_resolution == QualityLevel.HIGH:
-                return 1.0
-            # MEDIUM target with upscaler: generate at LOW, upscale to MEDIUM
-            return RESOLUTION_PIXELS[QualityLevel.LOW] / RESOLUTION_PIXELS[QualityLevel.MEDIUM]
-        if self.target_resolution == QualityLevel.MEDIUM:
-            return 1.0
-        return RESOLUTION_PIXELS[self.target_resolution] / RESOLUTION_PIXELS[QualityLevel.MEDIUM]
-
-    def is_parallelizable(self, model: Model) -> bool:
-        """Whether the given model can be parallelized across multiple replicas."""
-        return self.model_work.get(model, 0) > 1
-
-    def filter_parallelizable_models(
-        self,
-        models: list[Model],
-        disaggregation: dict[Model, bool],
-    ) -> list[Model]:
-        filtered_models = [
-            model
-            for model in models
-            if self.is_parallelizable(model)
-        ]
-        # Remove VAE models when their parent model disaggregation is disabled
-        if not disaggregation.get(Model.HF, False):
-            filtered_models = [m for m in filtered_models if m != Model.HF_VAE]
-        if not disaggregation.get(Model.FT, False):
-            filtered_models = [m for m in filtered_models if m != Model.FT_VAE]
-        return filtered_models
-
-    def __post_init__(self) -> None:
-        assert self.total_frames[Model.HF] > self.per_subscene_frames[Model.HF]
-        assert self.total_frames[Model.FT] > self.per_subscene_frames[Model.FT]
-
-        # If no models specified, populate defaults for all models
-        if not self.model_work:
-            defaults: dict[Model, int] = {
-                Model.GEMMA: 1,
-                Model.FLUX: 1,
-                Model.HF: self.total_subscenes,
-                Model.HF_VAE: self.total_frames[Model.HF],
-                Model.FT: self.total_subscenes,
-                Model.FT_VAE: self.total_frames[Model.FT],
-                Model.UPSCALER: self.total_frames[Model.FT],
-                Model.OTHERS: 1,
-            }
-            for model, work in defaults.items():
-                self.model_work[model] = work
-        if self.target_resolution != QualityLevel.HIGH:
-            if Model.UPSCALER in self.model_work:
-                del self.model_work[Model.UPSCALER]
-
-    @property
-    def num_frames(self) -> int:
-        """Number of frames generated by the workflow."""
-        if Model.FT in self.total_frames:
-            return self.total_frames[Model.FT]
-        return 0
-
-
-class ActionName(Enum):
-    MERGE = "merge"
-    ADD_DEVICE = "add device"
-    ADD_REPLICA = "add replica"
-    ADD_DEVICE_REPLICA = "add device replica"
-    ADD_INSTANCE = "add instance"
-    REMOVE_DEVICE = "remove device"
-    REMOVE_REPLICA = "remove replica"
-
-
-@dataclass
-class Action:
-    """
-    Optimization action to take.
-    """
-    name: ActionName
-    model: Model
-    gpu_type: GPUType
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
-
-    action_result: Result = field(repr=False)
-
-    arrival_time_s: float = 0.0  # For FIFO scheduling
-
-    # Derived fields from action_result (not passed by caller)
-    time: float = field(init=False)  # Total execution time
-    ttff: float = field(init=False)  # Time to first frame
-    cost: float = field(init=False)  # Cost in $
-    energy: float = field(init=False)  # Energy in W*s
-
-    def __post_init__(self) -> None:
-        # ---- type checks ----
-        if not isinstance(self.model, Model):
-            raise ValueError(f"Model {self.model} [{type(self.model)}] not supported")
-        if not isinstance(self.name, ActionName):
-            raise ValueError(f"Action name {self.name} [{type(self.name)}] not supported")
-        if not isinstance(self.models, dict):
-            raise ValueError(f"models must be a dict, got {type(self.models)}")
-        if not isinstance(self.gpu_type, GPUType):
-            raise ValueError(f"Device type {self.gpu_type} [{type(self.gpu_type)}] not supported")
-        """
-        if not isinstance(self.allocation_id, int) or self.allocation_id < 0:
-            raise ValueError(f"Allocation ID {self.allocation_id} must be a non-negative integer")
-        if self.num_replicas <= 0:
-            raise ValueError(f"num_replicas {self.num_replicas} must be > 0")
-        if self.num_devices <= 0:
-            raise ValueError(f"num_devices {self.num_devices} must be > 0")
-        """
-        # ---- derive values ----
-        self.time = self.action_result.total_time_s
-        self.ttff = self.action_result.ttff_s
-        self.cost = self.action_result.cost
-        self.energy = self.action_result.total_energy
-        if self.cost < 0.0:
-            raise ValueError("cost must be >= 0")
-
-    def __str__(self) -> str:
-        return (
-            f"Action("
-            f"{self.name.value}, "
-            f"model={self.model.value}, "
-            f"gpu={self.gpu_type.value}, "
-            f"time={self.time:.2f} s, "
-            f"ttff={self.ttff:.2f} s, "
-            f"cost=${self.cost:.2f}, "
-            f"time*cost={self.time_cost():.2f}, "
-            f"ttff*cost={self.ttff_cost():.2f}, "
-            f"energy*cost={self.energy_cost():.2f}, "
-            f"time*energy={self.time_energy():.2f}, "
-            f"energy={self.energy:.2f} Ws, "
-            f"models={self.models}"
-            f")"
-        )
-
-    def time_cost(self) -> float:
-        """We use improvement in time * $."""
-        if self.time <= 0:
-            return self.cost
-        if self.cost <= 0:
-            return self.time
-        return self.time * self.cost
-
-    def ttff_cost(self) -> float:
-        """We use improvement in TTFF * $."""
-        if self.ttff <= 0:
-            return self.cost
-        if self.cost <= 0:
-            return self.ttff
-        return self.ttff * self.cost
-
-    def energy_cost(self) -> float:
-        """We use improvement in Wh * $."""
-        if self.cost <= 0:
-            return self.energy
-        if self.energy <= 0:
-            return self.cost
-        return self.energy * self.cost
-
-    def time_energy(self) -> float:
-        """We use improvement in TTFF * Wh."""
-        if self.energy <= 0:
-            return self.time
-        if self.time <= 0:
-            return self.energy
-        return self.time * self.energy
-
-    def get_order(self) -> int:
-        " ""For FIFO scheduling."" "
-        return MODEL_ORDER[self.model]
-
-    def get_metric(
-        self,
-        obj: Objective,
-        switch_objective: bool = False,
-    ) -> float:
-        if obj == Objective.RANDOM:
-            return 0.0
-        if obj == Objective.TIME:
-            return self.time
-        if obj == Objective.TTFF:
-            return self.ttff
-        if obj == Objective.COST:
-            return self.cost
-        if obj == Objective.ENERGY:
-            return self.energy
-        if obj == Objective.TIME_COST:
-            return self.time_cost()
-        if obj == Objective.TTFF_COST:
-            return self.ttff_cost()
-        if obj == Objective.ENERGY_COST:
-            return self.energy_cost()
-        if obj == Objective.TIME_ENERGY:
-            return self.time_energy()
-        if obj == Objective.FIFO:
-            # return self.get_order()
-            return 0  # TODO
-        if obj == Objective.TTFF_THEN_TIME:
-            if switch_objective:
-                return self.time
-            else:
-                return self.ttff
-        raise ValueError(f"Unknown objective {obj}")
-
-
-@dataclass
-class Result:
-    total_time_s: float = 0.0
-    first_chunk_time: float = 0.0  # Time to first chunk
-    ttff_s: float = 0.0  # Time to first frame (accounts for total time and workflow length)
-    tbf_s: float = 0.0  # Time between frames
-    total_energy: float = 0.0  # Watts x second
-    cost: float = 0.0  # Total $ cost
-    gpus_used: dict[GPUType, int] = field(default_factory=dict)
-    gpus_total: dict[GPUType, int] = field(default_factory=dict)
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]] = field(default_factory=dict)
-
-    def __post_init__(self) -> None:
-        assert self.total_time_s >= 0.0, f"total_time_s={self.total_time_s} must be >= 0.0"
-        assert self.first_chunk_time >= 0.0, f"first_chunk_time={self.first_chunk_time} must be >= 0.0"
-        assert self.ttff_s >= 0.0, f"ttff_s={self.ttff_s} must be >= 0.0"
-        assert self.tbf_s >= 0.0, f"tbf_s={self.tbf_s} must be >= 0.0"
-        assert self.total_energy >= 0.0, f"total_energy={self.total_energy} must be >= 0.0"
-        assert self.cost >= 0.0, f"cost={self.cost} must be >= 0.0"
-        assert len(self.gpus_used) >= 0, f"gpus_used cannot be empty: {self.gpus_used}"
-        for gpu_used in self.gpus_used.values():
-            assert gpu_used >= 0, f"all gpus_used value {self.gpus_used} must be >= 0"
-
-    def to_csv(self) -> str:
-        num_a100 = self.gpus_used.get(GPUType.A100, 0)
-        num_h100 = self.gpus_used.get(GPUType.H100, 0)
-        num_h200 = self.gpus_used.get(GPUType.H200, 0)
-        num_gb200 = self.gpus_used.get(GPUType.GB200, 0)
-        return (
-            f"{num_a100},{num_h100},{num_h200},{num_gb200},"
-            f"{self.ttff_s:.2f},{self.tbf_s:.2f},{self.cost:.2f},"
-            f"{self.total_time_s:.2f},{self.total_energy:.2f}"
-        )
-
-    def __str__(self) -> str:
-        SECONDS_IN_HOUR = 60 * 60
-        return (
-            f"Time:{self.total_time_s:.2f} s TTFF:{self.ttff_s:.2f} s "
-            f"Cost:${self.cost:.2f} TTFF*Cost:{self.ttff_s * self.cost:.2f} "
-            f"Energy:{self.total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh "
-            f"GPUS: {num_gpus_to_str(self.gpus_used)}"
-        )
-
-    def __repr__(self) -> str:
-        return self.__str__()
-
-
-@dataclass
-class LatencyGPUTypeData:
-    gpu_type: GPUType
-    # TP -> latency mappings
-    flux: dict[int, float] = field(default_factory=dict)
-    hf: dict[int, float] = field(default_factory=dict)
-    hf_high: dict[int, float] = field(default_factory=dict)
-    hf_vae: dict[int, float] = field(default_factory=dict)
-    hf_vae_high: dict[int, float] = field(default_factory=dict)
-    ft: dict[int, float] = field(default_factory=dict)
-    ft_high: dict[int, float] = field(default_factory=dict)
-    ft_vae: dict[int, float] = field(default_factory=dict)
-    ft_vae_high: dict[int, float] = field(default_factory=dict)
-    upscaler: dict[int, float] = field(default_factory=dict)
-    gemma_first_scene: dict[int, float] = field(default_factory=dict)
-    gemma_per_scene: dict[int, float] = field(default_factory=dict)
-    others: dict[int, float] = field(default_factory=dict)
-
-    def __getitem__(
-        self,
-        key: Model | tuple[Model, int]
-    ) -> float:
-        if isinstance(key, tuple):
-            assert isinstance(key[0], Model)
-            assert isinstance(key[1], int)
-            model, num_devices = key
-            if model == Model.FLUX:
-                return self.flux[num_devices]
-            if model == Model.HF:
-                return self.hf[num_devices]
-            if model == Model.HF_VAE:
-                return self.hf_vae[num_devices]
-            if model == Model.FT:
-                return self.ft[num_devices]
-            if model == Model.FT_VAE:
-                return self.ft_vae[num_devices]
-            if model == Model.GEMMA:
-                return self.gemma_first_scene[num_devices]
-            if model == Model.UPSCALER:
-                return self.upscaler[num_devices]
-            if model == Model.OTHERS:
-                return self.others[num_devices]
-        raise KeyError(f"Latency for model {key} not found")
-
-    def __contains__(self, key: Model | tuple[Model, int]) -> bool:
-        if isinstance(key, tuple):
-            assert isinstance(key[0], Model)
-            assert isinstance(key[1], int)
-            model, num_devices = key
-            if model == Model.GEMMA:
-                return num_devices in self.gemma_first_scene
-            if model == Model.FLUX:
-                return num_devices in self.flux
-            if model == Model.HF:
-                return num_devices in self.hf
-            if model == Model.HF_VAE:
-                return num_devices in self.hf_vae
-            if model == Model.FT:
-                return num_devices in self.ft
-            if model == Model.FT_VAE:
-                return num_devices in self.ft_vae
-            if model == Model.UPSCALER:
-                return num_devices in self.upscaler
-            if model == Model.HF_VAE:
-                return num_devices in self.hf_vae
-            if model == Model.OTHERS:
-                return num_devices in self.others
-        return False
-
-    def get_max_parallelism(self, model: Model) -> int:
-        """Max number of devices supported for the given model."""
-        if model == Model.FLUX:
-            return max(self.flux.keys())
-        if model == Model.HF:
-            return max(self.hf.keys())
-        if model == Model.FT:
-            return max(self.ft.keys())
-        if model == Model.FT_VAE:
-            return max(self.ft_vae.keys())
-        if model == Model.GEMMA:
-            return max(self.gemma_first_scene.keys())
-        if model == Model.UPSCALER:
-            return max(self.upscaler.keys())
-        if model == Model.HF_VAE:
-            return max(self.hf_vae.keys())
-        if model == Model.OTHERS:
-            return max(self.others.keys())
-        raise KeyError(f"Model {model} not found in latency data")
-
-
-@dataclass
-class PowerGPUTypeData:
-    gpu_type: GPUType
-    # TP -> power mappings
-    flux: dict[int, float] = field(default_factory=dict)
-    hf: dict[int, float] = field(default_factory=dict)
-    hf_high: dict[int, float] = field(default_factory=dict)
-    hf_vae: dict[int, float] = field(default_factory=dict)
-    hf_vae_high: dict[int, float] = field(default_factory=dict)
-    ft: dict[int, float] = field(default_factory=dict)
-    ft_high: dict[int, float] = field(default_factory=dict)
-    ft_vae: dict[int, float] = field(default_factory=dict)
-    ft_vae_high: dict[int, float] = field(default_factory=dict)
-    upscaler: dict[int, float] = field(default_factory=dict)
-    gemma_first_scene: dict[int, float] = field(default_factory=dict)
-    gemma_per_scene: dict[int, float] = field(default_factory=dict)
-    # Other values
-    idle: float = 0.0  # Idle power in Watts
-    tdp: float = 0.0  # TDP power in Watts
-
-    def __getitem__(
-        self,
-        key: Model | tuple[Model, int] | str
-    ) -> float:
-        if isinstance(key, tuple):
-            assert isinstance(key[0], Model)
-            assert isinstance(key[1], int)
-            model, devices = key
-            if model == Model.FLUX:
-                return self.flux[devices]
-            if model == Model.HF:
-                return self.hf[devices]
-            if model == Model.HF_VAE:
-                return self.hf_vae[devices]
-            if model == Model.FT:
-                return self.ft[devices]
-            if model == Model.FT_VAE:
-                return self.ft_vae[devices]
-            if model == Model.UPSCALER:
-                return self.upscaler[devices]
-        if isinstance(key, str):
-            if key == "idle":
-                return self.idle
-            if key == "tdp":
-                return self.tdp
-        raise KeyError(f"Power for {key} not found")
-
-
-@dataclass
-class LatencyData:
-    gpus: dict[GPUType, LatencyGPUTypeData]
-
-    def __getitem__(self, gpu_type: GPUType) -> LatencyGPUTypeData:
-        return self.gpus[gpu_type]
-
-    def __setitem__(
-        self,
-        gpu_type: GPUType,
-        latency_data: LatencyGPUTypeData
-    ) -> None:
-        self.gpus[gpu_type] = latency_data
-
-
-@dataclass
-class PowerData:
-    gpus: dict[GPUType, PowerGPUTypeData]
-
-    def __getitem__(self, gpu_type: GPUType) -> PowerGPUTypeData:
-        return self.gpus[gpu_type]
-
-    def __setitem__(
-        self,
-        gpu_type: GPUType,
-        power_data: PowerGPUTypeData
-    ) -> None:
-        self.gpus[gpu_type] = power_data
-
-
-def num_gpus_to_str(
-    provision: dict[GPUType, int]
-) -> str:
-    return "+".join([
-        f"{num_gpus}x{gpu_type.name}"
-        for gpu_type, num_gpus in provision.items()
-        if num_gpus > 0
-    ])
-
-
-@dataclass
-class Provision:
-    num_gpus: dict[GPUType, int] = field(default_factory=dict)
-
-    def __getitem__(self, gpu_type: GPUType) -> int:
-        return self.num_gpus[gpu_type]
-
-    def __str__(self) -> str:
-        return num_gpus_to_str(self.num_gpus)
-
-
-@dataclass
-class ProvisioningResult:
-    latencies: list[float]
-    costs: list[float]
-    ttffs: list[float]
-    tbfs: list[float]
-    actual_provision: list[dict[GPUType, int]]
-    config_provision: list[dict[GPUType, int]]
-    model_provision: list[dict[GPUType, dict[Model, list[ModelAllocation]]]]
-    qualities: list[float] = field(default_factory=list)
-    energies: list[float] = field(default_factory=list)
-
-    def save(
-        self,
-        policy_name: str,
-        results_dir: str,
-    ) -> None:
-        """Save the provisioning results to a CSV file."""
-        num_a100: list[int] = []
-        num_h100: list[int] = []
-        num_h200: list[int] = []
-        num_gb200: list[int] = []
-        for provision in self.actual_provision:
-            num_a100.append(provision.get(GPUType.A100, 0))
-            num_h100.append(provision.get(GPUType.H100, 0))
-            num_h200.append(provision.get(GPUType.H200, 0))
-            num_gb200.append(provision.get(GPUType.GB200, 0))
-        df_latency = pd.DataFrame({
-            'num_a100': num_a100,
-            'num_h100': num_h100,
-            'num_h200': num_h200,
-            'num_gb200': num_gb200,
-            'ttff_s': self.ttffs,
-            'tbf_s': self.tbfs,
-            'cost': self.costs,
-            'total_time': self.latencies,
-            'energy': self.energies,
-        })
-        df_latency[['ttff_s', 'tbf_s', 'cost', 'total_time', 'energy']] = (
-            df_latency[['ttff_s', 'tbf_s', 'cost', 'total_time', 'energy']].round(2)
-        )
-        policy_name_clean = policy_name.replace(" ", "_").replace("*", "x").replace("/", "_").lower()
-        file_name = results_dir + f"provisioning_{policy_name_clean}.csv"
-        df_latency.to_csv(file_name, index=False)
-
-    def get_pareto_frontier(
-        self,
-        max_x: Optional[float] = None,
-        max_y: Optional[float] = None,
-    ) -> np.ndarray:
-        from utils import get_pareto_frontier  # TODO this is a lazy fix, we need to reset
-        # points = np.array(list(zip(self.ttffs, self.costs)))
-        return get_pareto_frontier(
-            self.ttffs,
-            self.costs,
-            max_x=max_x,
-            max_y=max_y,
-        )
-
-
-class Solver(Enum):
-    GUROBI = "gurobi"
-    HIGHS = "highs"
-    GREEDY = "greedy"
-    NAIVE = "naive"
-    HEXGEN = "hexgen"
-    HELIX = "helix"
-
-
-@dataclass
-class Policy:
-    name: str
-    gpu_cost: dict[GPUType, float]
-    objective: Objective
-    disaggregation: dict[Model, bool]
-    use_upscaler: bool
-    hardware: list[GPUType] = field(default_factory=lambda: [GPUType.A100, GPUType.H100, GPUType.H200, GPUType.GB200])
-    solver: Solver = Solver.GREEDY
-
-    def is_disaggregated(self, model: Model) -> bool:
-        """Check if a model has disaggregation enabled."""
-        return self.disaggregation.get(model, False)
-
-    def __str__(self) -> str:
-        disag_str = {
-            model.value: disaggregated
-            for model, disaggregated in self.disaggregation.items()
-            if disaggregated
-        }
-        return (
-            f"Policy({self.name}, "
-            f"objective={self.objective}, "
-            f"disag={disag_str}, "
-            f"upscaler={self.use_upscaler}, "
-            f"cost={self.gpu_cost}, "
-            f"solver={self.solver})"
-        )
diff --git a/simulator/sim_types_json.py b/simulator/sim_types_json.py
deleted file mode 100644
index 9f5451ea..00000000
--- a/simulator/sim_types_json.py
+++ /dev/null
@@ -1,58 +0,0 @@
-from __future__ import annotations
-
-import json
-
-from dataclasses import asdict
-
-from sim_types import Model
-from sim_types import Policy
-from sim_types import GPUType
-from sim_types import ModelAllocation
-from sim_types import WorkflowConfig
-
-
-def models_to_json(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
-) -> str:
-    result = {}
-    for gpu_type, model_dict in models.items():
-        inner_result = {}
-        for model, allocation_list in model_dict.items():
-            for allocation in allocation_list:
-                alloc_dict = {
-                    'devices': allocation.devices,
-                    'replicas': allocation.replicas,
-                }
-                inner_result[model.value] = alloc_dict
-        result[gpu_type.name] = inner_result
-    return str(result).replace("}}, '", "}},'")
-
-
-def workflow_to_json(workflow: WorkflowConfig) -> str:
-    d = asdict(workflow)
-    # Convert Model enum keys in dict fields to string values
-    for dict_field in ('total_frames', 'per_subscene_frames', 'num_steps', 'model_work'):
-        if dict_field in d:
-            d[dict_field] = {
-                (k.value if hasattr(k, 'value') else k): v
-                for k, v in d[dict_field].items()
-            }
-    # Convert QualityLevel enum to string value
-    if 'target_resolution' in d and hasattr(d['target_resolution'], 'value'):
-        d['target_resolution'] = d['target_resolution'].value
-    return json.dumps(d)
-
-
-def policy_to_json(policy: Policy) -> str:
-    result = {
-        'name': policy.name,
-        'objective': str(policy.objective),
-        'disaggregation': {model.value: enabled for model, enabled in policy.disaggregation.items()},
-        'use_upscaler': policy.use_upscaler,
-        'hardware': [gpu.name for gpu in policy.hardware],
-    }
-    return json.dumps(result)
-
-
-def model_list_to_json(models: list[Model]) -> str:
-    return json.dumps(models, default=lambda o: o.value)
diff --git a/simulator/utils.py b/simulator/utils.py
deleted file mode 100644
index 29ffe7ab..00000000
--- a/simulator/utils.py
+++ /dev/null
@@ -1,297 +0,0 @@
-"""
-Utilities for the simulator.
-"""
-
-from __future__ import annotations
-
-from copy import deepcopy
-
-import pandas as pd
-import numpy as np
-
-from scipy.interpolate import interp1d
-
-from sim_types import ProvisioningResult
-from sim_types import GPUType
-from sim_types import Model
-from sim_types import ModelAllocation
-
-from typing import Optional
-
-
-def to_models_df(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
-) -> pd.DataFrame:
-    """
-    Convert the models dictionary to a pandas DataFrame for easier analysis and visualization.
-    """
-    records = []
-    for gpu_type, model_allocations in models.items():
-        for model, allocations in model_allocations.items():
-            for allocation in allocations:
-                if allocation is None or allocation.get_num_gpus() == 0:
-                    continue  # Ignoring empty allocations
-                record = {
-                    "GPU": gpu_type.value,
-                    "Model": model.value,
-                    "Devices": allocation.devices,
-                    "Replicas": allocation.replicas,
-                    "Work": allocation.work,
-                    "#GPUs": allocation.get_num_gpus(),
-                    "Time (s)": allocation.time,
-                    "TTFF (s)": allocation.time_first,
-                    "Energy (kWh)": allocation.energy / (60 * 60) / 1000.0,  # Convert to kWh
-                    "Cost ($)": allocation.cost,
-                }
-                records.append(record)
-    df = pd.DataFrame(records)
-    df = df.set_index(["GPU", "Model"])
-    df = df.round(2)
-
-    total = df.sum(numeric_only=True)
-    total["Time (s)"] = df["Time (s)"].groupby(level="Model").max().sum()
-    total["TTFF (s)"] = df["TTFF (s)"].groupby(level="Model").min().sum()
-    total.name = ("TOTAL", "")
-    df = pd.concat([df, total.to_frame().T])
-
-    df[["Devices", "Replicas", "#GPUs", "Work"]] = df[["Devices", "Replicas", "#GPUs", "Work"]].astype(int)
-
-    return df
-
-
-def coalesce_models(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
-) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-    """The models with the same parallelism and same work, should be accounted as replicas."""
-    merged: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}
-    for gpu_type, model_dict in models.items():
-        merged[gpu_type] = {}
-        for model_name, allocations in model_dict.items():
-            merged_allocations: list[ModelAllocation] = []
-            for alloc in allocations:
-                # Check if there's an existing allocation with the same devices and work
-                match = next((
-                    model_alloc
-                    for model_alloc in merged_allocations
-                    if model_alloc.devices == alloc.devices and model_alloc.work == alloc.work
-                ), None)
-                if match:
-                    # If found, increment replicas and aggregate energy/cost
-                    match.replicas += 1
-                    match.energy += alloc.energy
-                    match.cost += alloc.cost
-                else:
-                    # Otherwise, add as new allocation
-                    merged_allocations.append(deepcopy(alloc))
-            merged[gpu_type][model_name] = merged_allocations
-    return merged
-
-
-def simplify_model_allocations(
-    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
-) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
-    """
-    Simplify model allocations by merging replicas with the same number of devices.
-    This is to reduce the search space for the optimization loop.
-    """
-    new_models = deepcopy(models)
-    for gpu_type in new_models.keys():
-        for model in new_models[gpu_type].keys():
-            model_instances = new_models[gpu_type][model]
-            alloc_map: dict[int, ModelAllocation] = {}
-            for model_instance in model_instances:
-                if model_instance.get_num_gpus() == 0:
-                    continue
-                if model_instance.devices not in alloc_map:
-                    alloc_map[model_instance.devices] = deepcopy(model_instance)
-                else:
-                    alloc_map[model_instance.devices].replicas += model_instance.replicas
-            new_models[gpu_type][model] = list(alloc_map.values())
-    return new_models
-
-
-def find_fastest_provisioning(
-    provisioning: ProvisioningResult,
-) -> int:
-    """Find the fastest provisioning option."""
-    min_latency = min(provisioning.latencies)
-    min_latency_index = provisioning.latencies.index(min_latency)
-    return min_latency_index
-
-
-def find_fastest_ttff_provisioning(
-    provisioning: ProvisioningResult,
-) -> int:
-    """Find the fastest provisioning option."""
-    min_ttff = min(provisioning.ttffs)
-    min_ttff_index = provisioning.ttffs.index(min_ttff)
-    return min_ttff_index
-
-
-def find_cheapest_provisioning(
-    provisioning: ProvisioningResult,
-) -> int:
-    """Find the cheapest provisioning option."""
-    min_cost = min(provisioning.costs)
-    min_cost_index = provisioning.costs.index(min_cost)
-    return min_cost_index
-
-
-def find_most_cost_effective_provisioning(
-    provisioning: ProvisioningResult,
-) -> int:
-    """Find the most cost-effective provisioning option."""
-    min_cost = min(provisioning.costs)
-    min_latency = min(provisioning.latencies)
-    min_cost_index = provisioning.costs.index(min_cost)
-    min_latency_index = provisioning.latencies.index(min_latency)
-    if min_cost_index == min_latency_index:
-        return min_cost_index
-
-    # if the indices are different, return the provisioning option with the minimum cost*latency
-    cost_latency_list = [
-        cost * latency
-        for cost, latency in zip(provisioning.costs, provisioning.latencies)
-    ]
-    min_cost_latency = min(cost_latency_list)
-    min_cost_latency_index = cost_latency_list.index(min_cost_latency)
-    return min_cost_latency_index
-
-
-def find_most_energy_efficient_provisioning(
-    provisioning: ProvisioningResult,
-) -> int:
-    """Find the most energy-efficient provisioning option."""
-    min_energy = min(provisioning.energies)
-    min_latency = min(provisioning.latencies)
-    min_energy_index = provisioning.energies.index(min_energy)
-    min_latency_index = provisioning.latencies.index(min_latency)
-    if min_energy_index == min_latency_index:
-        return min_energy_index
-
-    # if the indices are different, return the provisioning option with the minimum energy*latency
-    energy_latency_list = [
-        energy * latency
-        for energy, latency in zip(provisioning.energies, provisioning.latencies)
-    ]
-    min_energy_latency = min(energy_latency_list)
-    min_energy_latency_index = energy_latency_list.index(min_energy_latency)
-    return min_energy_latency_index
-
-
-def find_pareto_frontier(
-    latency_list: list[float],
-    energy_list: list[float],
-    provision: list[float]
-) -> tuple[list[float], list[float], list[float]]:
-    pareto_provision = []
-    pareto_latency = []
-    pareto_energy = []
-    for i in range(len(latency_list)):
-        dominated = False
-        for j in range(len(latency_list)):
-            if i != j:
-                if latency_list[j] <= latency_list[i] and energy_list[j] <= energy_list[i]:
-                    if latency_list[j] < latency_list[i] or energy_list[j] < energy_list[i]:
-                        dominated = True
-                        break
-        if not dominated:
-            pareto_provision.append(provision[i])
-            pareto_latency.append(latency_list[i])
-            pareto_energy.append(energy_list[i])
-    return pareto_provision, pareto_latency, pareto_energy
-
-
-def get_pareto_frontier_paper(
-    points: np.ndarray,
-    max_y: Optional[float] = None,
-    max_x: Optional[float] = None,
-) -> np.ndarray:
-    """
-    Calculate the Pareto frontier from a set of data points
-    """
-    if points.size == 0:
-        return points.copy()
-
-    # points = points[np.argsort(points[:, 0])]
-    points = points[np.lexsort((points[:, 1], points[:, 0]))]
-
-    pareto_front = [points[0]]
-    for point in points[1:]:
-        if point[1] < pareto_front[-1][1]:
-            pareto_front.append(point)
-
-    # Add extreme points to the Pareto frontier
-    extreme_point_0 = [pareto_front[0][0], max(points[:, 1])]
-    extreme_point_1 = [max(points[:, 0]), pareto_front[-1][1]]
-    pareto_front.append(extreme_point_0)
-    pareto_front.append(extreme_point_1)
-
-    if max_x is not None:
-        candidate = np.array([max_x, min(points[:, 1])])
-        if candidate[0] > pareto_front[-1][0] and candidate[1] <= pareto_front[-1][1]:
-            pareto_front.append(candidate)
-    if max_y is not None:
-        candidate = np.array([min(points[:, 0]), max_y])
-        if candidate[1] > pareto_front[0][1] and candidate[0] <= pareto_front[0][0]:
-            pareto_front.append(candidate)
-
-    pareto_front_np = np.array(pareto_front)
-    pareto_front_np = pareto_front_np[np.lexsort((
-        -pareto_front_np[:, 1],
-        pareto_front_np[:, 0]))]
-
-    # Avoid repeated points
-    _, idx = np.unique(pareto_front_np, axis=0, return_index=True)
-    pareto_front_np = pareto_front_np[np.sort(idx)]
-
-    return pareto_front_np
-
-
-def get_pareto_frontier(
-    ttff_list: list[float],
-    costs: list[float],
-    max_y: Optional[float] = None,
-    max_x: Optional[float] = None,
-) -> np.ndarray:
-    points = np.array(list(zip(ttff_list, costs)))
-    return get_pareto_frontier_paper(
-        points,
-        max_x,
-        max_y,
-    )
-
-
-def clean_frontier(
-    frontier: np.ndarray
-) -> np.ndarray:
-    F = frontier[np.argsort(frontier[:, 0])]
-    xs = []
-    ys = []
-    i = 0
-    while i < len(F):
-        x = F[i, 0]
-        same_x = F[F[:, 0] == x]
-        xs.append(x)
-        ys.append(same_x[:, 1].min())
-        i += len(same_x)
-    return np.column_stack([xs, ys])
-
-
-def area_between_frontiers(
-    A: np.ndarray,
-    B: np.ndarray,
-    n: int = 5000
-) -> np.ndarray:
-    A = clean_frontier(A)
-    B = clean_frontier(B)
-    xmin = max(A[:, 0].min(), B[:, 0].min())
-    xmax = min(A[:, 0].max(), B[:, 0].max())
-    xs = np.linspace(xmin, xmax, n)
-    fA = interp1d(A[:, 0], A[:, 1], kind="linear")
-    fB = interp1d(B[:, 0], B[:, 1], kind="linear")
-    yA = fA(xs)
-    yB = fB(xs)
-    # return np.trapezoid(yB - yA, xs)
-    delta = yB - yA
-    return 100.0 * delta / yB
diff --git a/simulator/workflows.py b/simulator/workflows.py
deleted file mode 100644
index ba0caa46..00000000
--- a/simulator/workflows.py
+++ /dev/null
@@ -1,253 +0,0 @@
-from __future__ import annotations
-
-import math
-
-from typing import Optional
-
-from sim_types import WorkflowConfig
-from sim_types import Model
-from sim_types import QualityLevel
-
-from constants import FPS
-from constants import FRAMES_OPTIONS
-from constants import FRAMES_PER_STEP_IDX
-from constants import NUM_STEPS
-from constants import SECONDS_IN_HOUR, SECONDS_IN_MINUTE
-from constants import TOTAL_INPUT_TOKENS
-
-
-# Shared physical constants
-MAX_FT_FRAMES: int = 1 + 80
-SUBSCENE_SECONDS: float = MAX_FT_FRAMES / FPS[Model.FT]  # 81 frames @ 23 FPS → ~3.52 s
-SUBSCENES_PER_SCENE: int = 4  # default subscene grouping
-TOKENS_PER_FRAME = 500  # 1 frame generates around 500 tokens
-
-
-def _get_num_subscenes(total_video_seconds: int) -> int:
-    """Return the number of subscenes needed to cover the given video duration."""
-    return math.ceil(total_video_seconds / SUBSCENE_SECONDS)
-
-
-def _get_num_scenes(total_video_seconds: int) -> int:
-    """Return the number of scenes needed to cover the given video duration."""
-    return math.ceil(_get_num_subscenes(total_video_seconds) / SUBSCENES_PER_SCENE)
-
-
-def _get_num_frames(total_video_seconds: int, model: Model) -> int:
-    """Return the number of frames needed for the given video duration and model."""
-    return math.ceil(total_video_seconds * FPS[model])
-
-
-def _video_gen_work(
-    total_video_seconds: int,
-    num_scenes: int,
-    num_subscenes: int,
-    model_work_overrides: Optional[dict[Model, int | str | None]] = None,
-) -> dict[Model, int]:
-    """Standard model work for video-generation workflows (Podcast, Movie, etc.)."""
-    ret = {
-        Model.GEMMA: 1,
-        Model.FLUX: 1,
-        Model.HF: num_subscenes,
-        Model.HF_VAE: _get_num_frames(total_video_seconds, Model.HF),
-        Model.FT: num_subscenes,
-        Model.FT_VAE: _get_num_frames(total_video_seconds, Model.FT),
-        Model.UPSCALER: _get_num_frames(total_video_seconds, Model.FT),
-        Model.OTHERS: 1,
-    }
-    if model_work_overrides:
-        for model, value in model_work_overrides.items():
-            if value == "num_scenes":
-                ret[model] = num_scenes
-            elif value == "num_subscenes":
-                ret[model] = num_subscenes
-            elif isinstance(value, str):
-                raise ValueError(f"Invalid model_work override value: {value}")
-            elif value == 0 or value is None:
-                del ret[model]
-            else:
-                ret[model] = value
-    return ret
-
-
-class WorkOverrideType:
-    def __init__(self, value: int | str | None = None):
-        self.value = value
-
-
-def build_workflow_config(
-    total_video_seconds: int,
-    input_tokens: int,
-    model_work: dict[Model, int] | None = None,
-    *,
-    model_work_overrides: dict[Model, int | str | None] | None = None,
-    num_scenes_override: int | None = None,
-    num_steps_override: dict[Model, int] | None = None,
-    target_resolution: QualityLevel = QualityLevel.HIGH,
-) -> WorkflowConfig:
-    """Build a ``WorkflowConfig`` from base parameters, computing all derived values.
-
-    Parameters
-    ----------
-    model_work:
-        Explicit model-work dictionary.  When ``None`` (default), standard
-        video-generation work is auto-generated from the other parameters.
-    exclude_models:
-        Models to remove from auto-generated ``model_work``.
-    model_work_overrides:
-        Key-value overrides applied on top of auto-generated ``model_work``.
-        If a value is set to "num_scenes", it will be replaced with the number of scenes (i.e. per-scene work).
-    target_resolution:
-        The target output resolution for the workflow (default HIGH).
-        When not HIGH, UPSCALER is automatically removed from model_work.
-    """
-    num_subscenes = _get_num_subscenes(total_video_seconds)
-
-    num_scenes = _get_num_scenes(total_video_seconds)
-    if num_scenes_override is not None:
-        num_scenes = num_scenes_override
-
-    num_steps = dict(NUM_STEPS)
-    if num_steps_override:
-        num_steps.update(num_steps_override)
-
-    if model_work is None:
-        model_work = _video_gen_work(
-            total_video_seconds,
-            num_scenes,
-            num_subscenes,
-            model_work_overrides,
-        )
-
-    return WorkflowConfig(
-        total_video_seconds=total_video_seconds,
-        total_scenes=num_scenes,
-        total_subscenes=num_subscenes,
-        total_frames={
-            Model.HF: _get_num_frames(total_video_seconds, Model.HF),
-            Model.FT: _get_num_frames(total_video_seconds, Model.FT),
-        },
-        per_subscene_frames={
-            Model.HF: math.ceil(_get_num_frames(total_video_seconds, Model.HF) / num_subscenes),
-            Model.FT: math.ceil(_get_num_frames(total_video_seconds, Model.FT) / num_subscenes),
-        },
-        num_steps=num_steps,
-        hf_frames=FRAMES_OPTIONS[Model.HF],
-        ft_frames=FRAMES_OPTIONS[Model.FT],
-        frames_per_step_idx=FRAMES_PER_STEP_IDX,
-        target_resolution=target_resolution,
-        total_input_tokens=input_tokens,
-        model_work=model_work,
-    )
-
-
-WORKFLOW_DURATIONS = {  # in seconds
-    "podcast": int(10 * SECONDS_IN_MINUTE),
-    # TODO The input is two hours but the output should be shorter something like 1 or 2 minutes
-    "short": int(2 * SECONDS_IN_HOUR),
-    "movie": int(2 * SECONDS_IN_HOUR),
-    "story": int(10 * SECONDS_IN_MINUTE),
-    "lecture": int(5 * SECONDS_IN_MINUTE),
-    "slide": int(10 * SECONDS_IN_MINUTE),
-    "dubbing": int(10 * SECONDS_IN_MINUTE),
-    "editing": int(10 * SECONDS_IN_MINUTE),
-    "chat": 5,
-}
-
-
-# Podcast: 10-minute video from text/PDF input
-PODCAST_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["podcast"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-)
-
-# Shorts: short clips from a 2-hour input video
-_SHORTS_SECONDS = WORKFLOW_DURATIONS["short"]
-_SHORTS_SCENES = _SHORTS_SECONDS // 10  # 10-second scene segmentation → 720
-SHORTS_WORKFLOW = build_workflow_config(
-    total_video_seconds=_SHORTS_SECONDS,
-    input_tokens=int(_SHORTS_SECONDS * TOKENS_PER_FRAME),  # 1 fps × 500 tokens/frame
-    model_work={
-        Model.GEMMA: _SHORTS_SCENES,
-        Model.OTHERS: 1,  # TODO isn't this 1 by default?
-    },
-    num_scenes_override=_SHORTS_SCENES,
-)
-
-# Movie: 2-hour movie
-MOVIE_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["movie"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-    model_work_overrides={
-        Model.FLUX: "num_scenes",
-    },
-)
-
-# Animated Story: Podcast + 5% more HF denoising steps (LoRA overhead)
-OVERHEAD_PCT = 5
-ANIMATED_STORY_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["story"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-    num_steps_override={
-        Model.HF: int(NUM_STEPS[Model.HF] * 1 + (OVERHEAD_PCT / 100.0))
-    },
-)
-
-# Lecture: 5-minute video, Flux generates per-scene images
-LECTURE_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["lecture"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-    model_work_overrides={
-        Model.FLUX: "num_scenes",
-    },
-)
-
-# Slide Persona: same as Podcast but at low resolution, no upscaler
-SLIDE_PERSONA_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["slide"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-    target_resolution=QualityLevel.LOW,
-    model_work_overrides={
-        Model.UPSCALER: None,
-    },
-)
-
-# Dubbing: like Podcast but without Flux, and double the audio work
-DUBBING_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["dubbing"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-    model_work_overrides={
-        Model.FLUX: None,
-        Model.OTHERS: 2,  # Double audio work
-    },
-)
-
-# Editing: like Podcast but without GEMMA, FLUX, or OTHERS
-EDITING_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["editing"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-    model_work_overrides={
-        Model.GEMMA: None,
-        Model.FLUX: None,
-        Model.OTHERS: None,
-    }
-)
-
-# Video Chat: like Podcast but only 5 seconds of output video
-VIDEO_CHAT_WORKFLOW = build_workflow_config(
-    total_video_seconds=WORKFLOW_DURATIONS["chat"],
-    input_tokens=TOTAL_INPUT_TOKENS,
-)
-
-
-WORKFLOWS = {
-    "podcast": PODCAST_WORKFLOW,
-    "chat": VIDEO_CHAT_WORKFLOW,
-    "dubbing": DUBBING_WORKFLOW,
-    "editing": EDITING_WORKFLOW,
-    "lecture": LECTURE_WORKFLOW,
-    "movie": MOVIE_WORKFLOW,
-    "short": SHORTS_WORKFLOW,
-    "slide": SLIDE_PERSONA_WORKFLOW,
-    "story": ANIMATED_STORY_WORKFLOW,
-}
diff --git a/tests/simulator/test_auto_model_allocator.py b/tests/simulator/test_auto_model_allocator.py
index a9aa17d6..18ff1871 100644
--- a/tests/simulator/test_auto_model_allocator.py
+++ b/tests/simulator/test_auto_model_allocator.py
@@ -23,30 +23,30 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import GPUType
-    from sim_types import Model
-    from sim_types import QualityLevel
-    from sim_types import Solver
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import QualityLevel
+    from model_provisioner.sim_types import Solver
 
-    from constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
 
-    from data_loading import load_latency_data
+    from model_provisioner.data_loading import load_latency_data
 
-    from policies import STREAMWISE_POLICY
-    from policies import NAIVE_POLICY
-    from policies import HEXGEN_POLICY
-    from policies import HELIX_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import HEXGEN_POLICY
+    from model_provisioner.policies import HELIX_POLICY
 
-    from auto_model_allocator import AutoModelAllocator
+    from model_provisioner.auto_model_allocator import AutoModelAllocator
 
-    from greedy import GreedyAllocator
-    from naive_baseline import NaiveAllocator
-    from hexgen import HexGenAllocator
-    from helix import HelixAllocator
-    from milp import MILPAllocator
+    from model_provisioner.greedy import GreedyAllocator
+    from model_provisioner.naive_baseline import NaiveAllocator
+    from model_provisioner.hexgen import HexGenAllocator
+    from model_provisioner.helix import HelixAllocator
+    from model_provisioner.milp import MILPAllocator
 
-    from workflows import PODCAST_WORKFLOW
+    from model_provisioner.workflows import PODCAST_WORKFLOW
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py
index 129a2f3b..de883d35 100644
--- a/tests/simulator/test_data_loading.py
+++ b/tests/simulator/test_data_loading.py
@@ -11,12 +11,12 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import QualityLevel
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import QualityLevel
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
-    from data_loading import load_adaptive_quality_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
+    from model_provisioner.data_loading import load_adaptive_quality_data
 
 
 def test_latency() -> None:
diff --git a/tests/simulator/test_evaluator.py b/tests/simulator/test_evaluator.py
index a162e99b..6f3a5aa7 100644
--- a/tests/simulator/test_evaluator.py
+++ b/tests/simulator/test_evaluator.py
@@ -8,29 +8,29 @@
 from tests.test_utils import assert_equals_approx
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from constants import DEFAULT_WORKFLOW_CONFIG
-    from constants import SECONDS_IN_HOUR
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.constants import SECONDS_IN_HOUR
 
-    from sim_types import GPUType
-    from sim_types import Model
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
 
-    from evaluator import evaluate_model_allocation
+    from model_provisioner.evaluator import evaluate_model_allocation
 
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
-    from models import FluxModelAllocation
-    from models import GemmaModelAllocation
-    from models import HFModelAllocation
-    from models import HFVAEModelAllocation
-    from models import FTModelAllocation
-    from models import UpscalerModelAllocation
-    from models import OthersModelAllocation
+    from model_provisioner.models import FluxModelAllocation
+    from model_provisioner.models import GemmaModelAllocation
+    from model_provisioner.models import HFModelAllocation
+    from model_provisioner.models import HFVAEModelAllocation
+    from model_provisioner.models import FTModelAllocation
+    from model_provisioner.models import UpscalerModelAllocation
+    from model_provisioner.models import OthersModelAllocation
 
-    from utils import to_models_df
+    from model_provisioner.utils import to_models_df
 
 
 def test_empty() -> None:
diff --git a/tests/simulator/test_greedy.py b/tests/simulator/test_greedy.py
index c33d6991..786cc2c2 100644
--- a/tests/simulator/test_greedy.py
+++ b/tests/simulator/test_greedy.py
@@ -8,22 +8,22 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from constants import DEFAULT_WORKFLOW_CONFIG
-    from constants import SECONDS_IN_HOUR
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.constants import SECONDS_IN_HOUR
 
-    from workflows import WORKFLOWS
+    from model_provisioner.workflows import WORKFLOWS
 
-    from sim_types import GPUType
-    from sim_types import QualityLevel
-    from sim_types import WorkflowConfig
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import QualityLevel
+    from model_provisioner.sim_types import WorkflowConfig
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
 
-    from greedy import GreedyAllocator
+    from model_provisioner.greedy import GreedyAllocator
 
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
 
 def test_allocate_8A_8H() -> None:
diff --git a/tests/simulator/test_helix.py b/tests/simulator/test_helix.py
index a336595d..06ec8f3a 100644
--- a/tests/simulator/test_helix.py
+++ b/tests/simulator/test_helix.py
@@ -12,16 +12,16 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from constants import DEFAULT_WORKFLOW_CONFIG
-    from sim_types import GPUType
-    from sim_types import Model
-    from sim_types import MODEL_ORDER
-    from sim_types import Solver
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
-    from helix import HelixAllocator
-    from policies import HELIX_POLICY
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import MODEL_ORDER
+    from model_provisioner.sim_types import Solver
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
+    from model_provisioner.helix import HelixAllocator
+    from model_provisioner.policies import HELIX_POLICY
 
 
 def test_get_model_order() -> None:
diff --git a/tests/simulator/test_hexgen.py b/tests/simulator/test_hexgen.py
index 99e7eef5..3317a82e 100644
--- a/tests/simulator/test_hexgen.py
+++ b/tests/simulator/test_hexgen.py
@@ -7,13 +7,13 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from constants import DEFAULT_WORKFLOW_CONFIG
-    from sim_types import GPUType
-    from data_loading import load_latency_data
-    from hexgen import HexGenAllocator
-    from hexgen import _get_model_order
-    from sim_types import MODEL_ORDER
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.hexgen import HexGenAllocator
+    from model_provisioner.hexgen import _get_model_order
+    from model_provisioner.sim_types import MODEL_ORDER
 
 
 def test_get_model_order() -> None:
@@ -154,7 +154,7 @@ def test_no_gpus_error() -> None:
 
 def test_is_subclass_of_greedy() -> None:
     """HexGenAllocator should extend GreedyAllocator."""
-    from greedy import GreedyAllocator
+    from model_provisioner.greedy import GreedyAllocator
     latency_data = load_latency_data("simulator/data/")
     allocator = HexGenAllocator(
         workflow=DEFAULT_WORKFLOW_CONFIG,
diff --git a/tests/simulator/test_milp.py b/tests/simulator/test_milp.py
index 70c4bfa8..9b0e909e 100644
--- a/tests/simulator/test_milp.py
+++ b/tests/simulator/test_milp.py
@@ -13,29 +13,29 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import LatencyData
-    from sim_types import PowerData
-    from sim_types import GPUType
-    from sim_types import Objective
-    from sim_types import Solver
-    from sim_types import QualityLevel
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import LatencyData
+    from model_provisioner.sim_types import PowerData
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Objective
+    from model_provisioner.sim_types import Solver
+    from model_provisioner.sim_types import QualityLevel
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
 
-    from constants import DEFAULT_WORKFLOW_CONFIG
-    from constants import SECONDS_IN_HOUR
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.constants import SECONDS_IN_HOUR
 
-    from policies import STREAMWISE_MILP_POLICY
+    from model_provisioner.policies import STREAMWISE_MILP_POLICY
 
-    from workflows import WORKFLOWS
+    from model_provisioner.workflows import WORKFLOWS
 
-    from milp import MILPAllocator
+    from model_provisioner.milp import MILPAllocator
 
-    from evaluator import evaluate_model_allocation
+    from model_provisioner.evaluator import evaluate_model_allocation
 
-    from utils import to_models_df
+    from model_provisioner.utils import to_models_df
 
 
 def test_base() -> None:
diff --git a/tests/simulator/test_models.py b/tests/simulator/test_models.py
index 57e00a0a..c0171d99 100644
--- a/tests/simulator/test_models.py
+++ b/tests/simulator/test_models.py
@@ -16,34 +16,34 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import GPUType
-    from sim_types import Model
-    from sim_types import ModelAllocation
-    from sim_types import QualityLevel
-    from sim_types import LatencyData
-    from sim_types import PowerData
-
-    from constants import DEFAULT_WORKFLOW_CONFIG
-
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
-
-    from policies import STREAMWISE_POLICY
-    from policies import NAIVE_POLICY
-
-    from models import get_model_allocation
-    from models import _calculate_total_time
-    from models import assert_pixel_config
-    from models import _MODEL_ALLOCATION_REGISTRY
-    from models import GemmaModelAllocation
-    from models import FluxModelAllocation
-    from models import HFModelAllocation
-    from models import HFVAEModelAllocation
-    from models import FTModelAllocation
-    from models import FTVAEModelAllocation
-    from models import UpscalerModelAllocation
-    from models import OthersModelAllocation
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import ModelAllocation
+    from model_provisioner.sim_types import QualityLevel
+    from model_provisioner.sim_types import LatencyData
+    from model_provisioner.sim_types import PowerData
+
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
+
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+
+    from model_provisioner.models import get_model_allocation
+    from model_provisioner.models import _calculate_total_time
+    from model_provisioner.models import assert_pixel_config
+    from model_provisioner.models import _MODEL_ALLOCATION_REGISTRY
+    from model_provisioner.models import GemmaModelAllocation
+    from model_provisioner.models import FluxModelAllocation
+    from model_provisioner.models import HFModelAllocation
+    from model_provisioner.models import HFVAEModelAllocation
+    from model_provisioner.models import FTModelAllocation
+    from model_provisioner.models import FTVAEModelAllocation
+    from model_provisioner.models import UpscalerModelAllocation
+    from model_provisioner.models import OthersModelAllocation
 
 
 # ---------------------------------------------------------------------------
@@ -152,7 +152,7 @@ def test_assert_pixel_config() -> None:
     assert_pixel_config(DEFAULT_WORKFLOW_CONFIG)
 
     # Patching MEDIUM > HIGH violates the ordering constraint → AssertionError.
-    with patch.dict("sim_types.RESOLUTION_PIXELS",
+    with patch.dict("model_provisioner.sim_types.RESOLUTION_PIXELS",
                     {QualityLevel.MEDIUM: 1000, QualityLevel.HIGH: 500}):
         with pytest.raises(AssertionError):
             assert_pixel_config(DEFAULT_WORKFLOW_CONFIG)
diff --git a/tests/simulator/test_multirequests_derive.py b/tests/simulator/test_multirequests_derive.py
index 8e7ed798..d5286121 100644
--- a/tests/simulator/test_multirequests_derive.py
+++ b/tests/simulator/test_multirequests_derive.py
@@ -7,10 +7,10 @@
 from tests.test_utils import assert_equal_dict
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import GPUType
-    from sim_types import Model
-    from sim_types import QualityLevel
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import QualityLevel
 
     from multirequests import TIME_PER_REQ
     from multirequests import INIT_REPLICAS
diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py
index fc791151..d621cd33 100644
--- a/tests/simulator/test_simulator.py
+++ b/tests/simulator/test_simulator.py
@@ -13,23 +13,23 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import WorkflowConfig
-    from sim_types import Model
-    from sim_types import Objective
-    from sim_types import GPUType
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import WorkflowConfig
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import Objective
+    from model_provisioner.sim_types import GPUType
 
-    from constants import SECONDS_IN_HOUR
-    from constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.constants import SECONDS_IN_HOUR
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
 
-    from auto_model_allocator import AutoModelAllocator
-    from greedy import GreedyAllocator
+    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from model_provisioner.greedy import GreedyAllocator
 
-    from policies import STREAMWISE_POLICY
-    from policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
 
 
 def test_estimate_total_time() -> None:
diff --git a/tests/simulator/test_simulator_actions.py b/tests/simulator/test_simulator_actions.py
index dd3bf4fd..11efd7b2 100644
--- a/tests/simulator/test_simulator_actions.py
+++ b/tests/simulator/test_simulator_actions.py
@@ -7,12 +7,12 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import Action
-    from sim_types import ActionName
-    from sim_types import GPUType
-    from sim_types import Model
-    from sim_types import Result
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import Action
+    from model_provisioner.sim_types import ActionName
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import Result
 
 
 def test_action() -> None:
diff --git a/tests/simulator/test_simulator_baseline.py b/tests/simulator/test_simulator_baseline.py
index 64282777..24749ffb 100644
--- a/tests/simulator/test_simulator_baseline.py
+++ b/tests/simulator/test_simulator_baseline.py
@@ -11,28 +11,28 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import GPUType
-    from sim_types import Model
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
 
-    from constants import DEFAULT_WORKFLOW_CONFIG
-    from constants import SECONDS_IN_HOUR
-    from constants import POWER_GPU_IDLE
-    from constants import POWER_GPU_TDP
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from model_provisioner.constants import SECONDS_IN_HOUR
+    from model_provisioner.constants import POWER_GPU_IDLE
+    from model_provisioner.constants import POWER_GPU_TDP
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
 
-    from auto_model_allocator import AutoModelAllocator
-    from naive_baseline import NaiveAllocator
-    from greedy import GreedyAllocator
+    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from model_provisioner.naive_baseline import NaiveAllocator
+    from model_provisioner.greedy import GreedyAllocator
 
-    from policies import NAIVE_POLICY
-    from policies import BASELINE_POLICIES
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import BASELINE_POLICIES
+    from model_provisioner.policies import STREAMWISE_POLICY
 
-    from workflows import SHORTS_WORKFLOW
-    from workflows import WORKFLOWS
+    from model_provisioner.workflows import SHORTS_WORKFLOW
+    from model_provisioner.workflows import WORKFLOWS
 
 
 def test_baseline() -> None:
diff --git a/tests/simulator/test_simulator_energy.py b/tests/simulator/test_simulator_energy.py
index 16b6e8bf..a739f698 100644
--- a/tests/simulator/test_simulator_energy.py
+++ b/tests/simulator/test_simulator_energy.py
@@ -9,23 +9,23 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from constants import DEFAULT_WORKFLOW_CONFIG
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
 
-    from sim_types import GPUType
-    from sim_types import Model
-    from sim_types import Objective
-    from sim_types import Solver
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import Objective
+    from model_provisioner.sim_types import Solver
 
-    from data_loading import load_latency_data
-    from data_loading import load_power_data
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.data_loading import load_power_data
 
-    from auto_model_allocator import AutoModelAllocator
-    from greedy import GreedyAllocator
-    from naive_baseline import NaiveAllocator
+    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from model_provisioner.greedy import GreedyAllocator
+    from model_provisioner.naive_baseline import NaiveAllocator
 
-    from policies import NAIVE_POLICY
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
 
 def test_energy() -> None:
diff --git a/tests/simulator/test_simulator_multirequests.py b/tests/simulator/test_simulator_multirequests.py
index 972596ec..3d3e350a 100644
--- a/tests/simulator/test_simulator_multirequests.py
+++ b/tests/simulator/test_simulator_multirequests.py
@@ -7,7 +7,7 @@
 from tests.test_utils import assert_equals_approx
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from multirequests import QPM_LIST
     from multirequests import get_replicas
     from multirequests import get_costs
@@ -21,12 +21,12 @@
     from multirequests import TIME_PER_REQ_ADAPTIVE
     from multirequests import get_time_per_request_baseline
 
-    from data_loading import load_latency_data
-    from workflows import PODCAST_WORKFLOW
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.workflows import PODCAST_WORKFLOW
 
-    from constants import GPU_SPOT_COST
-    from sim_types import GPUType
-    from sim_types import Model
+    from model_provisioner.constants import GPU_SPOT_COST
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import Model
 
 
 def test_multirequests() -> None:
diff --git a/tests/simulator/test_simulator_plotutils.py b/tests/simulator/test_simulator_plotutils.py
index cee69368..2d3b35e2 100644
--- a/tests/simulator/test_simulator_plotutils.py
+++ b/tests/simulator/test_simulator_plotutils.py
@@ -6,7 +6,7 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
+with temp_sys_path("simulator", "streamwise"):
     from plot_utils import plot_ttff_vs_cost
     from plot_utils import plot_ttff_vs_energy
     from plot_utils import plot_adaptive_quality
@@ -14,10 +14,10 @@
     from plot_utils import plot_cost_vs_qpm
     from plot_utils import _get_time_ticklabels
 
-    from sim_types import ProvisioningResult
-    from sim_types import GPUType
-    from sim_types import QualityLevel
-    from sim_types import Model
+    from model_provisioner.sim_types import ProvisioningResult
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import QualityLevel
+    from model_provisioner.sim_types import Model
 
 
 def test_plot_ttff_vs_cost() -> None:
diff --git a/tests/simulator/test_simulator_policies.py b/tests/simulator/test_simulator_policies.py
index ffab5ba0..42bf69db 100644
--- a/tests/simulator/test_simulator_policies.py
+++ b/tests/simulator/test_simulator_policies.py
@@ -11,11 +11,11 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from policies import STREAMWISE_POLICY
-    from policies import BASELINE_POLICIES
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import BASELINE_POLICIES
 
-    from sim_types import Objective
+    from model_provisioner.sim_types import Objective
 
 
 def test_streamwise_policies() -> None:
diff --git a/tests/simulator/test_simulator_provisioning.py b/tests/simulator/test_simulator_provisioning.py
index 6bd142ae..d781bc2e 100644
--- a/tests/simulator/test_simulator_provisioning.py
+++ b/tests/simulator/test_simulator_provisioning.py
@@ -7,8 +7,8 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from constants import DEFAULT_WORKFLOW_CONFIG
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
 
     from provisioning import get_provisioning_results
     from provisioning import get_provisioning_adaptive_results
@@ -17,15 +17,15 @@
     from provisioning import GPU_PROVISIONS
     from provisioning import GPU_PROVISIONS_SHORT
 
-    from sim_types import GPUType
-    from sim_types import QualityLevel
-    from sim_types import Solver
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import QualityLevel
+    from model_provisioner.sim_types import Solver
 
-    from data_loading import load_latency_data
+    from model_provisioner.data_loading import load_latency_data
 
-    from policies import NAIVE_POLICY
-    from policies import STREAMWISE_POLICY
-    from policies import HEXGEN_POLICY
+    from model_provisioner.policies import NAIVE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
+    from model_provisioner.policies import HEXGEN_POLICY
 
 
 @pytest.mark.parametrize("gpu_type", [gpu_type for gpu_type in GPUType])
diff --git a/tests/simulator/test_simulator_types.py b/tests/simulator/test_simulator_types.py
index 8bfc292f..9e2384ed 100644
--- a/tests/simulator/test_simulator_types.py
+++ b/tests/simulator/test_simulator_types.py
@@ -8,21 +8,21 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import Model
-    from sim_types import GPUType
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import GPUType
 
-    from sim_types_json import models_to_json
-    from sim_types_json import workflow_to_json
-    from sim_types_json import policy_to_json
-    from sim_types_json import model_list_to_json
+    from model_provisioner.sim_types_json import models_to_json
+    from model_provisioner.sim_types_json import workflow_to_json
+    from model_provisioner.sim_types_json import policy_to_json
+    from model_provisioner.sim_types_json import model_list_to_json
 
-    from models import GemmaModelAllocation
-    from models import FluxModelAllocation
+    from model_provisioner.models import GemmaModelAllocation
+    from model_provisioner.models import FluxModelAllocation
 
-    from policies import STREAMWISE_POLICY
+    from model_provisioner.policies import STREAMWISE_POLICY
 
-    from workflows import PODCAST_WORKFLOW
+    from model_provisioner.workflows import PODCAST_WORKFLOW
 
 
 def test_serialize_models() -> None:
diff --git a/tests/simulator/test_simulator_utils.py b/tests/simulator/test_simulator_utils.py
index 9711a696..e1575d9a 100644
--- a/tests/simulator/test_simulator_utils.py
+++ b/tests/simulator/test_simulator_utils.py
@@ -6,19 +6,19 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import Model
-    from sim_types import GPUType
-    from sim_types import ModelAllocation
-    from sim_types import ProvisioningResult
-
-    from utils import get_pareto_frontier
-    from utils import find_most_cost_effective_provisioning
-    from utils import find_most_energy_efficient_provisioning
-    from utils import find_pareto_frontier
-    from utils import coalesce_models
-
-    from models import FTModelAllocation
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import Model
+    from model_provisioner.sim_types import GPUType
+    from model_provisioner.sim_types import ModelAllocation
+    from model_provisioner.sim_types import ProvisioningResult
+
+    from model_provisioner.utils import get_pareto_frontier
+    from model_provisioner.utils import find_most_cost_effective_provisioning
+    from model_provisioner.utils import find_most_energy_efficient_provisioning
+    from model_provisioner.utils import find_pareto_frontier
+    from model_provisioner.utils import coalesce_models
+
+    from model_provisioner.models import FTModelAllocation
 
 
 def test_get_pareto_frontier() -> None:
diff --git a/tests/simulator/test_workflows.py b/tests/simulator/test_workflows.py
index bff7ed56..19a7ff0c 100644
--- a/tests/simulator/test_workflows.py
+++ b/tests/simulator/test_workflows.py
@@ -15,9 +15,9 @@
 
 from tests.test_utils import temp_sys_path
 
-with temp_sys_path("simulator"):
-    from sim_types import WorkflowConfig, Model, QualityLevel, GPUType
-    from constants import (
+with temp_sys_path("simulator", "streamwise"):
+    from model_provisioner.sim_types import WorkflowConfig, Model, QualityLevel, GPUType
+    from model_provisioner.constants import (
         FPS,
         FRAMES_OPTIONS,
         FRAMES_PER_STEP_IDX,
@@ -26,10 +26,10 @@
         SECONDS_IN_MINUTE,
         TOTAL_INPUT_TOKENS,
     )
-    from data_loading import load_latency_data
-    from auto_model_allocator import AutoModelAllocator
-    from policies import STREAMWISE_POLICY, NAIVE_POLICY
-    from workflows import (
+    from model_provisioner.data_loading import load_latency_data
+    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from model_provisioner.policies import STREAMWISE_POLICY, NAIVE_POLICY
+    from model_provisioner.workflows import (
         MAX_FT_FRAMES,
         SUBSCENE_SECONDS,
         SUBSCENES_PER_SCENE,

From fc7f5e7d6013aff10f5fdf3eba17e7a67bcb8e98 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 15:21:38 -0700
Subject: [PATCH 2/9] Move 11 non-policy files from model_provisioner back to
 simulator

Move actions, auto_model_allocator, constants, data_loading, evaluator,
model_allocator, models, sim_types, sim_types_json, utils, and workflows
from streamwise/model_provisioner/ back to simulator/.

Only 6 policy files remain in model_provisioner: greedy, helix, hexgen,
milp, naive_baseline, and policies.

Import changes:
- Moved files use bare imports (from sim_types import ...) instead of
  relative imports (from .sim_types import ...)
- Policy files use bare imports for moved modules and keep relative
  imports for sibling policy modules
- simulator/ and streamwise/allocator_bridge.py updated accordingly
- All test files updated to match new import paths
- Added tests/simulator/conftest.py to set PYTHONPATH for child processes
  spawned by ProcessPoolExecutor

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 simulator/__init__.py                         |   14 +-
 simulator/actions.py                          |  737 ++++++++++++
 simulator/auto_model_allocator.py             |  109 ++
 simulator/constants.py                        |  142 +++
 simulator/data_loading.py                     |  300 +++++
 simulator/evaluator.py                        |  414 +++++++
 simulator/model_allocator.py                  |  282 +++++
 simulator/models.py                           |  811 +++++++++++++
 simulator/multirequests.py                    |   24 +-
 simulator/plot_utils.py                       |   10 +-
 simulator/provisioning.py                     |   39 +-
 simulator/sim_types.py                        |  796 ++++++++++++
 simulator/sim_types_json.py                   |   58 +
 simulator/utils.py                            |  297 +++++
 simulator/workflows.py                        |  253 ++++
 streamwise/allocator_bridge.py                |  256 ++++
 streamwise/model_provisioner/__init__.py      |   15 +
 streamwise/model_provisioner/greedy.py        |  573 +++++++++
 streamwise/model_provisioner/helix.py         |  403 +++++++
 streamwise/model_provisioner/hexgen.py        |  629 ++++++++++
 streamwise/model_provisioner/milp.py          | 1070 +++++++++++++++++
 .../model_provisioner/naive_baseline.py       |  484 ++++++++
 streamwise/model_provisioner/policies.py      |  252 ++++
 tests/simulator/conftest.py                   |   24 +
 tests/simulator/test_auto_model_allocator.py  |   16 +-
 tests/simulator/test_data_loading.py          |    8 +-
 tests/simulator/test_evaluator.py             |   30 +-
 tests/simulator/test_greedy.py                |   16 +-
 tests/simulator/test_helix.py                 |   14 +-
 tests/simulator/test_hexgen.py                |    8 +-
 tests/simulator/test_milp.py                  |   26 +-
 tests/simulator/test_models.py                |   44 +-
 tests/simulator/test_multirequests_derive.py  |    6 +-
 tests/simulator/test_simulator.py             |   18 +-
 tests/simulator/test_simulator_actions.py     |   10 +-
 tests/simulator/test_simulator_baseline.py    |   22 +-
 tests/simulator/test_simulator_energy.py      |   16 +-
 .../simulator/test_simulator_multirequests.py |   10 +-
 tests/simulator/test_simulator_plotutils.py   |    8 +-
 tests/simulator/test_simulator_policies.py    |    2 +-
 .../simulator/test_simulator_provisioning.py  |   10 +-
 tests/simulator/test_simulator_types.py       |   18 +-
 tests/simulator/test_simulator_utils.py       |   24 +-
 tests/simulator/test_workflows.py             |   10 +-
 tests/streamwise/test_allocator_bridge.py     |  280 +++++
 45 files changed, 8391 insertions(+), 197 deletions(-)
 create mode 100644 simulator/actions.py
 create mode 100644 simulator/auto_model_allocator.py
 create mode 100644 simulator/constants.py
 create mode 100644 simulator/data_loading.py
 create mode 100644 simulator/evaluator.py
 create mode 100644 simulator/model_allocator.py
 create mode 100644 simulator/models.py
 create mode 100644 simulator/sim_types.py
 create mode 100644 simulator/sim_types_json.py
 create mode 100644 simulator/utils.py
 create mode 100644 simulator/workflows.py
 create mode 100644 streamwise/allocator_bridge.py
 create mode 100644 streamwise/model_provisioner/__init__.py
 create mode 100644 streamwise/model_provisioner/greedy.py
 create mode 100644 streamwise/model_provisioner/helix.py
 create mode 100644 streamwise/model_provisioner/hexgen.py
 create mode 100644 streamwise/model_provisioner/milp.py
 create mode 100644 streamwise/model_provisioner/naive_baseline.py
 create mode 100644 streamwise/model_provisioner/policies.py
 create mode 100644 tests/simulator/conftest.py
 create mode 100644 tests/streamwise/test_allocator_bridge.py

diff --git a/simulator/__init__.py b/simulator/__init__.py
index 24058e01..263309ff 100644
--- a/simulator/__init__.py
+++ b/simulator/__init__.py
@@ -1,15 +1,15 @@
 """
-Simulator package.
+Simulator package — provisioning sweeps, multi-request analysis, and plotting
+on top of the model_provisioner allocation policies.
 
-The core allocation logic lives in ``streamwise.model_provisioner``.
-This package adds provisioning sweeps, multi-request analysis, and plotting
-on top of that shared foundation.
+The allocation policy implementations live in ``streamwise/model_provisioner/``.
 """
 import os
 import sys
 
-# Make model_provisioner importable for simulator modules and child processes.
-_STREAMWISE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise")
-_STREAMWISE_DIR = os.path.normpath(_STREAMWISE_DIR)
+# Make model_provisioner importable for simulator modules.
+_STREAMWISE_DIR = os.path.normpath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise")
+)
 if _STREAMWISE_DIR not in sys.path:
     sys.path.insert(0, _STREAMWISE_DIR)
diff --git a/simulator/actions.py b/simulator/actions.py
new file mode 100644
index 00000000..69af1618
--- /dev/null
+++ b/simulator/actions.py
@@ -0,0 +1,737 @@
+"""
+Actions for scaling models for the greedy allocator.
+"""
+
+from __future__ import annotations
+
+import random
+
+from collections import Counter
+
+from copy import deepcopy
+
+from typing import Optional
+
+from constants import DEVICE_OPTIONS
+from constants import SINGLE_INSTANCE_MODELS
+from constants import SINGLE_DEVICE_MODELS
+
+from sim_types import Action
+from sim_types import ActionName
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import GPUType
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import Objective
+from sim_types import Policy
+
+from model_provisioner.policies import STREAMWISE_POLICY
+
+from models import get_model_allocation
+
+from evaluator import evaluate_model_allocation
+from evaluator import calc_used_gpus
+
+
+def _is_single_instance(
+    model_name: Model,
+    workflow: Optional[WorkflowConfig] = None,
+) -> bool:
+    """Check if a model is single-instance, considering workflow parallelism settings."""
+    if model_name not in SINGLE_INSTANCE_MODELS:
+        return False
+    if workflow is not None and workflow.is_parallelizable(model_name):
+        return False
+    return True
+
+
+def find_next_devices(
+    device_options: list[int],
+    num_devices: int,
+    num_replicas: int,
+    remaining_devices: int,
+    max_num_devices: Optional[int] = None,
+) -> Optional[int]:
+    """
+    Find the next device combination.
+    For example, with device options [2, 4, 8, 16, 40], current devices 8, 1 replica, we get 16.
+    """
+    if num_replicas == 0:
+        # means we haven't allocated any replicas yet so start from smallest device option
+        return device_options[0] if device_options[0] <= remaining_devices else None
+
+    for device_option in device_options:
+        # if device_option > num_devices and device_option <= remaining_devices + num_devices:
+        if (
+            device_option > num_devices
+            and (device_option - num_devices) * num_replicas <= remaining_devices
+            and (max_num_devices is None or device_option <= max_num_devices)
+        ):
+            return device_option
+    return None
+
+
+def choose_action(
+    actions: list[Action],
+    objective: Objective,
+    switch_objective: bool = False,
+) -> Optional[Action]:
+    """Schedule requests."""
+    if not actions:
+        return None
+
+    if objective == Objective.TIME_COST:
+        # return min(actions, key=lambda a: a.time)
+        return min(
+            actions,
+            key=lambda a: (
+                a.time_cost(),
+                a.time,
+            ),
+        )
+    if objective == Objective.TIME_COST:
+        return min(
+            actions,
+            key=lambda a: (
+                a.time_cost(),
+                a.time,
+            ),
+        )
+    if objective == Objective.TTFF_COST:
+        return min(
+            actions,
+            key=lambda a: (
+                a.ttff_cost(),
+                a.ttff,
+            ),
+        )
+    if objective == Objective.FIFO:
+        # return min(actions, key=lambda a: a.arrival_time_s)
+        return min(actions, key=lambda a: a.get_order())
+    if objective == Objective.TIME:
+        return min(actions, key=lambda a: a.time)
+    if objective == Objective.TTFF:
+        return min(actions, key=lambda a: a.ttff)
+    if objective == Objective.COST:
+        return min(actions, key=lambda a: a.cost)
+    if objective == Objective.ENERGY:
+        return min(actions, key=lambda a: a.energy)
+    if objective == Objective.TIME_ENERGY:
+        return min(actions, key=lambda a: a.time_energy())
+    if objective == Objective.ENERGY_COST:
+        return min(actions, key=lambda a: a.energy_cost())
+    if objective == Objective.RANDOM:
+        # randomly pick an improvement to simulate naive allocation
+        return random.choice(actions)
+    if objective == Objective.TTFF_THEN_TIME:
+        if switch_objective:
+            return min(actions, key=lambda a: a.time)
+        else:
+            return min(actions, key=lambda a: a.ttff)
+    if objective == Objective.NONE:
+        return None
+    raise ValueError(f"Cannot recognize objective {objective}")
+
+
+def apply_action(
+    action: Action,
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+    """Apply the chosen action to the models and update remaining devices."""
+
+    for gpu_type in action.models.keys():
+        if gpu_type not in models:
+            raise ValueError(f"Cannot find gpu type {gpu_type} in {models.keys()}")
+        for model in action.models[gpu_type].keys():
+            if model not in models[gpu_type]:
+                raise ValueError(f"Cannot find model {model} in {models[gpu_type].keys()}")
+            allocs_to_remove = []
+            for alloc_id in range(len(action.models[gpu_type][model])):
+                # check if devices and replicas are non-negative
+                num_devices = action.models[gpu_type][model][alloc_id].devices
+                if num_devices < 0:
+                    raise ValueError(f"Action devices {num_devices} must be >= 0")
+                if action.models[gpu_type][model][alloc_id].replicas <= 0:
+                    # remove that instance if replicas is 0 or negative
+                    allocs_to_remove.append(alloc_id)
+            for alloc_id in reversed(allocs_to_remove):
+                del action.models[gpu_type][model][alloc_id]
+
+    return action.models
+
+
+def gen_actions(
+    workflow: WorkflowConfig,
+    num_gpus: dict[GPUType, int],
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {},
+    policy: Policy = STREAMWISE_POLICY,
+    allow_removal: bool = False,
+    allow_merging: bool = False,
+    look_ahead_replicas: int = 3,
+) -> list[Action]:
+    actions: list[Action] = []
+
+    # Extract GPU types from models
+    gpu_types = list(models.keys())
+    assert len(gpu_types) == len(num_gpus), \
+        f"Number of GPU types in models {len(gpu_types)} must match num_gpus {len(num_gpus)}"
+
+    remaining_gpus = {}
+    for gpu_type in num_gpus.keys():
+        remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
+
+    # Option 1: Provision more by increasing <devices, replicas> for each model allocation
+    for model in Model:
+        if model not in workflow.models:
+            continue
+        for gpu_type in gpu_types:
+            for alloc_id in range(len(models[gpu_type][model])):
+                actions.extend(_gen_add_device_replica_actions(
+                    models=models,
+                    num_gpus=num_gpus,
+                    remaining_gpus=remaining_gpus[gpu_type],
+                    gpu_type=gpu_type,
+                    model_name=model,
+                    allocation_id=alloc_id,
+                    workflow=workflow,
+                    policy=policy,
+                    latency_data=latency_data,
+                    power_data=power_data,
+                    look_ahead_replicas=look_ahead_replicas,
+                ))
+
+    # Option 2: Add a model instance of <devices, replicas>
+    for model in Model:
+        if model not in workflow.models:
+            continue
+        for gpu_type in gpu_types:
+            actions.extend(_gen_add_instance(
+                models=models,
+                num_gpus=num_gpus,
+                remaining_gpus=remaining_gpus[gpu_type],
+                gpu_type=gpu_type,
+                model_name=model,
+                workflow=workflow,
+                policy=policy,
+                latency_data=latency_data,
+                power_data=power_data,
+                look_ahead_replicas=look_ahead_replicas,
+            ))
+
+    if allow_removal:
+        # Option 3: Remove replicas for each model allocation
+        for model in Model:
+            if model not in workflow.models:
+                continue
+            for gpu_type in gpu_types:
+                model_instances = models[gpu_type][model]
+                for alloc_id in range(len(model_instances)):
+                    action = _gen_remove_replica_action(
+                        models=models,
+                        num_gpus=num_gpus,
+                        gpu_type=gpu_type,
+                        model_name=model,
+                        allocation_id=alloc_id,
+                        workflow=workflow,
+                        policy=policy,
+                        latency_data=latency_data,
+                        power_data=power_data,
+                    )
+                    if action:
+                        actions.append(action)
+
+    if allow_merging:
+        # Option 4: Merge across model allocations
+        for model in Model:
+            if model not in workflow.models:
+                continue
+            for gpu_type in gpu_types:
+                actions.extend(_gen_merge_replicas_actions(
+                    models=models,
+                    num_gpus=num_gpus,
+                    gpu_type=gpu_type,
+                    model_name=model,
+                    workflow=workflow,
+                    policy=policy,
+                    latency_data=latency_data,
+                    power_data=power_data,
+                ))
+
+    return actions
+
+
+def _get_min_device_combinations(
+    num_gpus: int,
+    model: Model,
+) -> list[tuple[int, int]]:
+    """
+    Greedily decompose num_gpus, repeatedly taking the largest DEVICE_OPTIONS[model] size that fits.
+    Returns [(device_count, num_replicas), ...]; raises ValueError if a remainder is left over.
+    For example, 64 with options [..., 16, 20, 24, 32, 40] returns [(40, 1), (24, 1)].
+    """
+    remaining = num_gpus
+    result: list[int] = []  # one entry per chosen instance size
+    for size in sorted(DEVICE_OPTIONS[model], reverse=True):
+        while remaining >= size:
+            result.append(size)
+            remaining -= size
+    if remaining > 0:
+        raise ValueError(f"Cannot exactly decompose {num_gpus} with DEVICE_OPTIONS")
+    counts = Counter(result)  # instance size -> how many instances of that size
+    return sorted(counts.items(), reverse=True)  # Sort by device count descending
+
+
+def _get_large_instance_many_small_combinations(
+    num_gpus: int,
+    model: Model,
+) -> list[tuple[int, int]]:
+    """
+    Take one instance of the largest size that fits, then put all leftover GPUs in 1-GPU replicas.
+    For example, 64 with a largest option of 40 returns [(40, 1), (1, 24)] as (devices, replicas).
+    """
+    assert num_gpus > 0
+    assert model in DEVICE_OPTIONS
+    assert DEVICE_OPTIONS[model][0] == 1  # first listed option must be 1 GPU so leftovers can be placed
+
+    remaining_gpus = num_gpus
+    result: list[tuple[int, int]] = []
+    for size in sorted(DEVICE_OPTIONS[model], reverse=True):
+        if remaining_gpus >= size:
+            result = [(size, 1)]  # single largest instance; stop at the first size that fits
+            remaining_gpus -= size
+            break
+    if remaining_gpus > 0:
+        result.append((1, remaining_gpus))  # leftovers: remaining_gpus replicas of 1 GPU each
+    return result
+
+
+def _gen_add_device_replica_actions(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    num_gpus: dict[GPUType, int],
+    remaining_gpus: int,
+    gpu_type: GPUType,
+    model_name: Model,
+    allocation_id: int,
+    workflow: WorkflowConfig,
+    policy: Policy,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+    look_ahead_replicas: int = 3,
+) -> list[Action]:
+    """
+    Generate ADD_DEVICE_REPLICA actions over the (replicas, devices) provisioning
+    options for a given model allocation, using the remaining GPUs.
+
+    Only options that strictly increase the allocation's GPU total and fit in current + remaining are kept.
+    For example, if currently 2 replicas at parallelism 4 with 4 remaining devices, options include:
+      - 3 replicas, 4 devices  (uses 12 total, 4 more than current 8)
+      - 1 replica, 10 devices  (uses 10 total, 2 more than current 8)
+    Unless single-instance (1 replica only), replicas go up to min(max, current + look_ahead_replicas).
+    """
+    actions: list[Action] = []
+
+    if model_name in SINGLE_DEVICE_MODELS and _is_single_instance(model_name, workflow):
+        return actions  # No scaling possible: one device and one instance only
+
+    alloc = models[gpu_type][model_name][allocation_id]
+    current_total = alloc.devices * max(alloc.replicas, 0)  # GPUs held now (negative replicas count as 0)
+    current_replicas = alloc.replicas
+    total_available = current_total + remaining_gpus  # GPU budget: held now + still free
+
+    max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name)
+    max_replicas = alloc.get_max_replicas(workflow)
+    is_single_instance = _is_single_instance(model_name, workflow)
+    is_single_device = model_name in SINGLE_DEVICE_MODELS
+
+    seen: set[tuple[int, int]] = set()
+    seen.add((max(alloc.replicas, 0), alloc.devices))  # skip current config
+
+    for new_devices in DEVICE_OPTIONS[model_name]:
+        if new_devices > max_num_devices:
+            continue  # Exceeds max parallelism from latency data
+        if is_single_device and new_devices > 1:
+            continue  # Model only supports single device
+        if (model_name, new_devices) not in latency_data[gpu_type]:
+            continue  # No latency data for this device count
+
+        # Determine the range of replicas possible with this device count
+        if is_single_instance:
+            replica_candidates = [1]
+        else:
+            max_r = min(max_replicas, total_available // new_devices) if new_devices > 0 else 0
+            # limit max replicas to current replicas + look_ahead_replicas to avoid too many combinations
+            max_r = min(max_r, current_replicas + look_ahead_replicas)
+            replica_candidates = list(range(1, max_r + 1))
+
+        for new_replicas in replica_candidates:
+            new_total = new_replicas * new_devices
+            if new_total <= current_total:
+                continue  # Must be an increase
+            if new_total > total_available:
+                continue  # Not enough GPUs
+            if (new_replicas, new_devices) in seen:
+                continue
+            seen.add((new_replicas, new_devices))
+
+            try:
+                new_models = deepcopy(models)
+                new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
+                    model=model_name,
+                    gpu_type=gpu_type,
+                    devices=new_devices,
+                    replicas=new_replicas,
+                )
+                action_result = evaluate_model_allocation(
+                    models=new_models,
+                    num_gpus=num_gpus,
+                    workflow=workflow,
+                    latency_data=latency_data,
+                    power_data=power_data,
+                    policy=policy,
+                    include_models=[model_name],
+                )
+                actions.append(Action(
+                    name=ActionName.ADD_DEVICE_REPLICA,
+                    model=model_name,
+                    gpu_type=gpu_type,
+                    models=new_models,
+                    action_result=action_result,
+                    arrival_time_s=alloc.time,
+                ))
+            except Exception:
+                pass  # Best effort: any error while building/evaluating this option drops it
+
+    return actions
+
+
+def _gen_add_device_action(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    num_gpus: dict[GPUType, int],
+    remaining_gpus: int,
+    gpu_type: GPUType,
+    model_name: Model,
+    allocation_id: int,
+    workflow: WorkflowConfig,
+    policy: Policy,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+) -> Optional[Action]:
+    """
+    Action to add devices (increase parallelism) for a specific model allocation; None if not possible.
+    """
+    action: Optional[Action] = None
+
+    if model_name in SINGLE_DEVICE_MODELS:
+        return action  # These models only run on a single GPU, so we don't add more devices
+
+    alloc = models[gpu_type][model_name][allocation_id]
+
+    max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name)
+    next_num_devices = find_next_devices(
+        DEVICE_OPTIONS[model_name],
+        num_devices=alloc.devices,
+        num_replicas=alloc.replicas,
+        remaining_devices=remaining_gpus,
+        max_num_devices=max_num_devices)
+
+    if not next_num_devices:
+        return action  # No valid next device option, skip
+    if (model_name, next_num_devices) not in latency_data[gpu_type]:
+        return action  # No latency data for this device option, skip
+
+    new_models = deepcopy(models)  # never mutate the caller's allocation map
+    new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
+        model=model_name,
+        gpu_type=gpu_type,
+        devices=next_num_devices,
+        replicas=max(1, alloc.replicas),  # the new allocation always has at least one replica
+    )
+    try:
+        action_result = evaluate_model_allocation(
+            models=new_models,
+            num_gpus=num_gpus,
+            workflow=workflow,
+            latency_data=latency_data,
+            power_data=power_data,
+            policy=policy,
+            include_models=[model_name],
+        )
+        action = Action(
+            name=ActionName.ADD_DEVICE,
+            model=model_name,
+            gpu_type=gpu_type,
+            models=new_models,
+            action_result=action_result,
+            arrival_time_s=alloc.time,
+        )
+    except Exception:
+        pass  # Evaluation failed: treat as an invalid action and return None
+
+    return action
+
+
+def _gen_merge_replicas_actions(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    gpu_type: GPUType,
+    model_name: Model,
+    num_gpus: dict[GPUType, int],
+    workflow: WorkflowConfig,
+    policy: Policy,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+) -> list[Action]:  # MERGE actions that re-pack this model's allocations on gpu_type; may be empty
+    actions: list[Action] = []
+
+    if _is_single_instance(model_name, workflow):
+        return actions  # These models only support a single instance, so no need to merge
+
+    model_instances = models[gpu_type][model_name]
+    model_num_gpus = 0  # GPU total over all allocations, as reported by get_num_gpus()
+    for model_instance in model_instances:
+        model_num_gpus += model_instance.get_num_gpus()
+    if model_num_gpus <= 1:
+        return actions  # At most one GPU in use for this model and GPU type: nothing to merge
+
+    for device_combos in [  # two candidate re-packings of the same GPU total
+        _get_min_device_combinations(model_num_gpus, model_name),
+        _get_large_instance_many_small_combinations(model_num_gpus, model_name)
+    ]:
+        new_models = deepcopy(models)
+        new_models[gpu_type][model_name] = []  # drop every existing allocation, then rebuild below
+
+        for new_num_devices, new_num_replicas in device_combos:
+            new_models[gpu_type][model_name].append(get_model_allocation(
+                model=model_name,
+                gpu_type=gpu_type,
+                devices=new_num_devices,
+                replicas=new_num_replicas,
+            ))
+
+        try:
+            action_result = evaluate_model_allocation(
+                models=new_models,
+                num_gpus=num_gpus,
+                workflow=workflow,
+                latency_data=latency_data,
+                power_data=power_data,
+                policy=policy,
+                include_models=[model_name],
+            )
+
+            instance_id = 0  # arrival time is read from the first rebuilt allocation
+            actions.append(Action(
+                name=ActionName.MERGE,
+                model=model_name,
+                gpu_type=gpu_type,
+                models=new_models,
+                action_result=action_result,
+                arrival_time_s=new_models[gpu_type][model_name][instance_id].time,
+            ))
+        except Exception:
+            pass  # Evaluation failed: skip this re-packing
+
+    return actions
+
+
+def _gen_add_instance(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    num_gpus: dict[GPUType, int],
+    remaining_gpus: int,
+    gpu_type: GPUType,
+    model_name: Model,
+    workflow: WorkflowConfig,
+    policy: Policy,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+    look_ahead_replicas: int = 3,
+) -> list[Action]:  # ADD_INSTANCE actions, each appending one new allocation for model_name on gpu_type
+    actions: list[Action] = []
+
+    if _is_single_instance(model_name, workflow):
+        return actions  # These models only support a single instance, so we don't add more
+
+    for new_num_devices in DEVICE_OPTIONS[model_name]:
+        for new_num_replicas in list(range(1, look_ahead_replicas + 1)):  # 1..look_ahead_replicas inclusive
+            new_instance = get_model_allocation(
+                model=model_name,
+                gpu_type=gpu_type,
+                devices=new_num_devices,
+                replicas=new_num_replicas,
+            )
+            if new_instance.get_num_gpus() > remaining_gpus:
+                continue  # Not enough remaining GPUs for this new instance
+
+            new_models = deepcopy(models)  # existing allocations are kept; the new one is added
+            new_models[gpu_type][model_name].append(new_instance)
+
+            try:
+                action_result = evaluate_model_allocation(
+                    models=new_models,
+                    num_gpus=num_gpus,
+                    workflow=workflow,
+                    latency_data=latency_data,
+                    power_data=power_data,
+                    policy=policy,
+                    include_models=[model_name],
+                )
+                action = Action(
+                    name=ActionName.ADD_INSTANCE,
+                    model=model_name,
+                    gpu_type=gpu_type,
+                    models=new_models,
+                    action_result=action_result,
+                    arrival_time_s=new_instance.time,
+                )
+                actions.append(action)
+            except Exception:
+                pass  # Evaluation failed: skip this (devices, replicas) candidate
+
+    return actions
+
+
+def _gen_remove_replica_action(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    num_gpus: dict[GPUType, int],
+    gpu_type: GPUType,
+    model_name: Model,
+    allocation_id: int,
+    workflow: WorkflowConfig,
+    policy: Policy,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+) -> Optional[Action]:  # REMOVE_REPLICA action that drops one replica from one allocation, or None
+    action: Optional[Action] = None
+
+    model = models[gpu_type][model_name][allocation_id]  # the ModelAllocation being shrunk
+
+    if model.replicas == 0:
+        return action  # No replicas to remove for this model and GPU type
+
+    new_models = deepcopy(models)
+    new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
+        model=model_name,
+        gpu_type=gpu_type,
+        devices=model.devices,
+        replicas=model.replicas - 1,
+    )
+
+    if len(num_gpus) == 2:
+        # Dual GPU setting: a single-instance model that loses its only replica is re-created on the other GPU type
+        gpu_types = list(num_gpus.keys())
+        other_gpu_type = gpu_types[0] if gpu_type == gpu_types[1] else gpu_types[1]
+        if _is_single_instance(model_name, workflow):
+            if new_models[gpu_type][model_name][allocation_id].replicas == 0:
+                # If this is a single instance model and we're removing the only replica, add it to the other GPU type
+                new_models[other_gpu_type][model_name].append(get_model_allocation(
+                    model=model_name,
+                    gpu_type=other_gpu_type,
+                    devices=model.devices,
+                    replicas=1,
+                ))
+
+    try:
+        action_result = evaluate_model_allocation(
+            models=new_models,
+            num_gpus=num_gpus,
+            workflow=workflow,
+            latency_data=latency_data,
+            power_data=power_data,
+            policy=policy,
+            include_models=[model_name],
+        )
+        action = Action(
+            name=ActionName.REMOVE_REPLICA,
+            model=model_name,
+            gpu_type=gpu_type,
+            models=new_models,
+            action_result=action_result,
+            arrival_time_s=new_models[gpu_type][model_name][allocation_id].time,
+        )
+    except Exception:
+        pass  # Evaluation failed: the removal is not possible, return None
+    return action
+
+
+def _gen_add_replica_action(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    num_gpus: dict[GPUType, int],
+    remaining_gpus: int,
+    gpu_type: GPUType,
+    model_name: Model,
+    allocation_id: int,
+    workflow: WorkflowConfig,
+    policy: Policy,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData] = None,
+) -> Optional[Action]:
+    """
+    Action to add one replica to a specific model allocation; None when it cannot grow or evaluation fails.
+    """
+    action: Optional[Action] = None
+
+    if _is_single_instance(model_name, workflow):
+        return action  # These models don't support replication, so we skip
+
+    model = models[gpu_type][model_name][allocation_id]  # the ModelAllocation being grown
+
+    if remaining_gpus < model.devices:
+        return action  # Not enough remaining GPUs to add another replica
+
+    max_replicas = model.get_max_replicas(workflow)
+    if model.replicas >= max_replicas:
+        return action  # Already at max replicas, skip
+
+    new_num_replicas = min(
+        model.replicas + 1,
+        max_replicas,  # workflow cap; replicas on the other GPU type are not subtracted here
+        model.replicas + remaining_gpus // model.devices
+    )
+    if new_num_replicas == model.replicas:
+        return action  # No changes, skip (defensive: the guards above should already rule this out)
+
+    new_models = deepcopy(models)
+    new_models[gpu_type][model_name][allocation_id] = get_model_allocation(
+        model=model_name,
+        gpu_type=gpu_type,
+        devices=model.devices,
+        replicas=new_num_replicas,
+    )
+
+    try:
+        action_result = evaluate_model_allocation(
+            models=new_models,
+            num_gpus=num_gpus,
+            workflow=workflow,
+            latency_data=latency_data,
+            power_data=power_data,
+            policy=policy,
+            include_models=[model_name],
+        )
+        action = Action(
+            name=ActionName.ADD_REPLICA,
+            model=model_name,
+            gpu_type=gpu_type,
+            models=new_models,
+            action_result=action_result,
+            arrival_time_s=model.time,
+        )
+    except Exception:
+        pass  # Evaluation failed: treat as an invalid action and return None
+
+    return action
+
+
+def max_time(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    model_name: Model,
+) -> float:  # Largest alloc.time among model_name's allocations across all GPU types
+    values = []
+    for models_gpu in models.values():
+        if model_name in models_gpu:  # GPU types without this model are skipped
+            for alloc in models_gpu[model_name]:
+                values.append(alloc.time)
+    return max(values)  # NOTE: raises ValueError when model_name has no allocation anywhere
diff --git a/simulator/auto_model_allocator.py b/simulator/auto_model_allocator.py
new file mode 100644
index 00000000..3ca86cb7
--- /dev/null
+++ b/simulator/auto_model_allocator.py
@@ -0,0 +1,109 @@
+"""
+Factory helpers for selecting the right model allocator implementation.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from dataclasses import replace
+from typing import Optional
+
+from sim_types import Policy
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
+from sim_types import Model
+from sim_types import PowerData
+from sim_types import QualityLevel
+from sim_types import Solver
+from sim_types import GPUType
+from sim_types import Result
+
+from model_provisioner.policies import STREAMWISE_POLICY
+
+from model_allocator import ModelAllocator
+
+
+class AutoModelAllocator(ModelAllocator):
+    """Allocator wrapper that builds one concrete allocator from policy.solver and delegates to it."""
+
+    policy: Policy
+
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = STREAMWISE_POLICY,
+    ) -> None:
+        super().__init__(
+            workflow=workflow,
+            latency_data=latency_data,
+            power_data=power_data,
+            policy=policy,
+        )
+        self._allocator = self._build_allocator()  # chosen once, at construction time
+
+    def _build_allocator(self) -> ModelAllocator:
+        """Create concrete allocator based on configured solver; raise ValueError for unknown solvers."""
+        if self.policy.solver == Solver.GREEDY:
+            from model_provisioner.greedy import GreedyAllocator  # imported only when this solver is selected
+            return GreedyAllocator(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+            )
+        if self.policy.solver == Solver.NAIVE:
+            from model_provisioner.naive_baseline import NaiveAllocator
+            return NaiveAllocator(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+            )
+        if self.policy.solver in {Solver.GUROBI, Solver.HIGHS}:  # both solvers share the MILP allocator
+            from model_provisioner.milp import MILPAllocator
+            return MILPAllocator(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+            )
+        if self.policy.solver == Solver.HEXGEN:
+            from model_provisioner.hexgen import HexGenAllocator
+            return HexGenAllocator(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+            )
+        if self.policy.solver == Solver.HELIX:
+            from model_provisioner.helix import HelixAllocator
+            return HelixAllocator(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+            )
+        raise ValueError(f"Unsupported solver for allocator selection: {self.policy.solver}")
+
+    def allocate(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+    ) -> Result:  # Delegates to the concrete allocator after the upscaler check below
+        if self.policy.use_upscaler and self.workflow.target_resolution == QualityLevel.LOW:
+            logging.warning(
+                f"Policy {self.policy.name} uses upscaler, but workflow target resolution is LOW. "
+                f"Disabling upscaler for this allocation.")
+            self.policy = replace(self.policy, use_upscaler=False)  # new policy object with the flag off
+            self._allocator.policy = self.policy
+            # Remove upscaler from model work (NOTE(review): pops from self.workflow.model_work in place)
+            self.workflow.model_work.pop(Model.UPSCALER, None)
+            self._allocator.workflow = self.workflow
+
+        return self._allocator.allocate(
+            num_gpus=num_gpus,
+            verbose=verbose,
+        )
diff --git a/simulator/constants.py b/simulator/constants.py
new file mode 100644
index 00000000..bb6f9034
--- /dev/null
+++ b/simulator/constants.py
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+import math
+
+from sim_types import WorkflowConfig
+from sim_types import GPUType
+from sim_types import Model
+
+
+SECONDS_IN_MINUTE = 60.0
+SECONDS_IN_HOUR = 60.0 * 60.0
+
+# Per-frame pixel counts (width * height, 16:10) for each quality level and model
+NUM_PIXELS_ORIGINAL = 1280 * 800
+NUM_PIXELS_ORIGINAL_FLUX = 1280 * 800
+NUM_PIXELS_ORIGINAL_HF = 512 * 320
+NUM_PIXELS_ORIGINAL_FT = 640 * 400
+NUM_PIXELS_ORIGINAL_UPSCALER = 1280 * 800
+
+NUM_PIXELS_MEDIUM = 640 * 400  # MEDIUM: half the ORIGINAL width and height
+NUM_PIXELS_MEDIUM_FLUX = 640 * 400
+NUM_PIXELS_MEDIUM_HF = 256 * 160
+NUM_PIXELS_MEDIUM_FT = 320 * 200
+NUM_PIXELS_MEDIUM_UPSCALER = 640 * 400
+
+NUM_PIXELS_LOW = 320 * 200  # LOW: a quarter of the ORIGINAL width and height
+NUM_PIXELS_LOW_FLUX = 320 * 200
+NUM_PIXELS_LOW_HF = 128 * 80
+NUM_PIXELS_LOW_FT = 160 * 100
+NUM_PIXELS_LOW_UPSCALER = 320 * 200
+
+# StreamCast constants
+TOTAL_INPUT_TOKENS = 20 * 1024  # 20K tokens for instructions, PDFs, etc.
+TOTAL_VIDEO_SECONDS = 10 * 60  # 10 minutes video
+TOTAL_SUBSCENES = 172  # each subscene is 3.5 seconds -> limited by fantasytalking 81 frames at 23 FPS
+TOTAL_SCENES = 43  # each scene is 4 subscenes
+FPS: dict[Model, float] = {  # frames per second, only for the models listed
+    Model.HF: 30,
+    Model.FT: 23,
+}
+NUM_STEPS: dict[Model, int] = {  # used as the default num_steps of DEFAULT_WORKFLOW_CONFIG
+    Model.FLUX: 25,
+    Model.HF: 10,
+    Model.FT: 10,
+}
+FRAMES_OPTIONS: dict[Model, list[int]] = {  # frame counts per generation, ascending
+    Model.HF: [36, 72, 108, 144, 324],
+    Model.FT: [9, 21, 41, 61, 77],
+}
+FRAMES_PER_STEP_IDX = 4  # NOTE(review): presumably an index into the FRAMES_OPTIONS lists (4 = last) - confirm
+
+DEFAULT_WORKFLOW_CONFIG = WorkflowConfig(
+    total_video_seconds=TOTAL_VIDEO_SECONDS,
+    total_scenes=TOTAL_SCENES,
+    total_frames={  # video length in seconds * FPS, rounded up
+        Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF]),
+        Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT]),
+    },
+    total_subscenes=TOTAL_SUBSCENES,
+    per_subscene_frames={  # total frames split evenly over the subscenes, rounded up
+        Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF] / TOTAL_SUBSCENES),  # ceil(18000 / 172) = 105
+        Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT] / TOTAL_SUBSCENES),  # ceil(13800 / 172) = 81
+    },
+    # default per-frame number of denoising steps (copied so the config does not alias NUM_STEPS)
+    num_steps=dict(NUM_STEPS),
+    # supported number of generation frames (these are the FRAMES_OPTIONS list objects, not copies)
+    hf_frames=FRAMES_OPTIONS[Model.HF],
+    ft_frames=FRAMES_OPTIONS[Model.FT],
+    frames_per_step_idx=FRAMES_PER_STEP_IDX,
+    total_input_tokens=TOTAL_INPUT_TOKENS,
+)
+
+# Available device counts per model, listed in ascending order starting at 1
+# Tensor parallelism (TP) or sequence parallelism (SP)
+DEVICE_OPTIONS = {
+    Model.GEMMA: [1, 2, 4, 8],
+    Model.FLUX: [1, 2, 4, 8, 16],
+    Model.OTHERS: [1],  # Single GPU, no parallelism
+    Model.HF: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40],
+    Model.HF_VAE: [1],  # Single GPU, no parallelism
+    Model.FT: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40],
+    Model.FT_VAE: [1],  # Single GPU, no parallelism
+    Model.UPSCALER: [1, 2, 4, 8],  # Up to 8 GPUs (not listed in SINGLE_DEVICE_MODELS)
+}
+
+# Models that only have one instance in the system, so not scaling them across GPU types
+SINGLE_INSTANCE_MODELS = [
+    Model.GEMMA,
+    Model.FLUX,
+    Model.OTHERS,
+]
+
+# Models that can only be run on a single GPU (their DEVICE_OPTIONS entry is [1])
+SINGLE_DEVICE_MODELS = [
+    Model.OTHERS,
+    Model.HF_VAE,
+    Model.FT_VAE,
+]
+
+
+NUM_GPUS_PER_SERVER = {
+    GPUType.A100: 8,
+    GPUType.H100: 8,
+    GPUType.H200: 8,
+    GPUType.GB200: 8,  # This is technically 4 GPUs per server, but nothing fits
+}
+
+
+POWER_GPU_IDLE = {  # idle power per GPU, by GPU type
+    GPUType.A100: 65.0,  # Watts
+    GPUType.H100: 80.0,  # Watts TODO placeholder value
+    GPUType.H200: 80.0,  # Watts TODO placeholder value
+    GPUType.GB200: 170.0,  # Watts
+}
+
+
+POWER_GPU_TDP = {  # thermal design power per GPU, by GPU type
+    GPUType.A100: 400.0,  # Watts
+    GPUType.H100: 700.0,  # Watts
+    GPUType.H200: 700.0,  # Watts
+    GPUType.GB200: 1200.0,  # Watts
+}
+
+
+# Cost per GPU in $ / hour; the trailing comments give the matching whole-server price
+GPU_SPOT_COST = {
+    # $ / hour (Spot prices)
+    GPUType.A100: 1.07,  # $8.56 for 8 GPUs
+    GPUType.H100: 4.03,  # $32.24 for 8 GPUs
+    GPUType.H200: 4.22,  # $33.76 for 8 GPUs
+    GPUType.GB200: 10.76  # $43.04 for 4 GPUs
+}
+
+GPU_RESERVED_COST = {
+    # $ / hour (Reserved prices)
+    GPUType.A100: 3.4,  # $27.2 for 8 GPUs
+    GPUType.H100: 5.39,  # $43.12 for 8 GPUs
+    GPUType.H200: 5.64,  # $45.12 for 8 GPUs
+    GPUType.GB200: 14.42  # $57.68 for 4 GPUs
+}
+
+GPU_COST = GPU_SPOT_COST  # default price table is spot pricing (same dict object, not a copy)
diff --git a/simulator/data_loading.py b/simulator/data_loading.py
new file mode 100644
index 00000000..bea78715
--- /dev/null
+++ b/simulator/data_loading.py
@@ -0,0 +1,300 @@
+"""
+Module for loading latency and power consumption data from CSV files.
+"""
+
+import pandas as pd
+
+from pathlib import Path
+
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import GPUType
+from sim_types import LatencyGPUTypeData
+from sim_types import PowerGPUTypeData
+from sim_types import QualityLevel
+
+from constants import NUM_PIXELS_ORIGINAL_UPSCALER
+from constants import NUM_PIXELS_ORIGINAL_FT
+from constants import NUM_PIXELS_ORIGINAL_HF
+from constants import NUM_PIXELS_ORIGINAL_FLUX
+from constants import NUM_PIXELS_LOW_FT
+from constants import NUM_PIXELS_LOW_HF
+from constants import NUM_PIXELS_LOW_FLUX
+from constants import NUM_PIXELS_LOW_UPSCALER
+from constants import NUM_PIXELS_MEDIUM_FT
+from constants import NUM_PIXELS_MEDIUM_HF
+from constants import NUM_PIXELS_MEDIUM_UPSCALER
+from constants import NUM_PIXELS_MEDIUM_FLUX
+from constants import POWER_GPU_IDLE
+from constants import POWER_GPU_TDP
+
+_DEFAULT_DATA_DIR = str(Path(__file__).resolve().parents[2] / "simulator" / "data")  # NOTE(review): parents[2] is two levels above this file's directory - confirm
+
+
+def load_latency_data(
+    data_dir: str = _DEFAULT_DATA_DIR,
+) -> LatencyData:
+    """
+    Load per-GPU-type latency mappings, keyed by world size (TP degree for Gemma), from CSV files.
+
+    Args:
+        data_dir (str): Directory holding the latency_*_<gpu>.csv files (<gpu> = lowercased GPUType value).
+    Returns:
+        LatencyData: An object with one LatencyGPUTypeData entry per GPUType member.
+    """
+    data_path = Path(data_dir)
+
+    data = LatencyData(gpus={})
+    for gpu_type in GPUType:
+        data.gpus[gpu_type] = LatencyGPUTypeData(gpu_type=gpu_type)
+
+        # Flux time -> per image generation
+        csv_flux_path = data_path / f"latency_flux_mapping_{gpu_type.value.lower()}.csv"
+        df_flux = pd.read_csv(csv_flux_path, comment='#')  # '#' starts a comment in every CSV read here
+        data[gpu_type].flux = dict(zip(
+            df_flux["world_size"],
+            df_flux["avg_steps_time"]))
+
+        # Hunyuan Framepack per step time -> [36, 72, 108, 144, 324] frames generation
+        csv_hf_path = data_path / f"latency_hf_mapping_{gpu_type.value.lower()}.csv"
+        df_hf = pd.read_csv(csv_hf_path, comment='#')
+        data[gpu_type].hf = dict(zip(
+            df_hf["world_size"],
+            df_hf["avg_steps_time"]))
+
+        # Hunyuan Framepack VAE time -> per inference iteration (vae_time column of the HF CSV above)
+        # Derived: steps * avg_step_time * vae_pct(vae_time / total_time)
+        data[gpu_type].hf_vae = dict(zip(
+            df_hf["world_size"],
+            df_hf["vae_time"]))
+
+        # Fantasy Talking per step time -> [9, 21, 41, 61, 77] frames generation
+        csv_ft_path = data_path / f"latency_ft_mapping_{gpu_type.value.lower()}.csv"
+        df_ft = pd.read_csv(csv_ft_path, comment='#')
+        data[gpu_type].ft = dict(zip(
+            df_ft["world_size"],
+            df_ft["avg_steps_time"]))
+
+        # Fantasy Talking VAE time -> per inference iteration (vae_time column of the FT CSV above)
+        # Derived: steps * avg_step_time * vae_pct(vae_time / total_time)
+        data[gpu_type].ft_vae = dict(zip(
+            df_ft["world_size"],
+            df_ft["vae_time"]))
+
+        # Upscaler time -> per image frame
+        csv_upscaler_path = data_path / f"latency_upscaler_{gpu_type.value.lower()}.csv"
+        df_upscaler = pd.read_csv(csv_upscaler_path, comment='#')
+        data[gpu_type].upscaler = dict(zip(
+            df_upscaler['world_size'],
+            df_upscaler['avg_steps_time']))
+
+        # Gemma time -> first scene and per scene (this CSV is keyed by its 'tp' column, not world_size)
+        csv_gemma_path = data_path / f"latency_gemma_{gpu_type.value.lower()}.csv"
+        df_gemma = pd.read_csv(csv_gemma_path, comment='#')
+        data[gpu_type].gemma_first_scene = dict(zip(
+            df_gemma['tp'],
+            df_gemma['first_scene_time']))
+        data[gpu_type].gemma_per_scene = dict(zip(
+            df_gemma['tp'],
+            df_gemma['per_scene_time']))
+
+        # Others time -> kokoro and other overheads -> time per scene
+        csv_others_path = data_path / f"latency_others_{gpu_type.value.lower()}.csv"
+        df_others = pd.read_csv(csv_others_path, comment='#')
+        data[gpu_type].others = dict(zip(
+            df_others['world_size'],
+            df_others['time']))
+
+    return data
+
+
+def load_power_data(
+    data_dir: str = _DEFAULT_DATA_DIR
+) -> PowerData:
+    """
+    Load power consumption data from CSV files.
+
+    Args:
+        data_dir (str): The directory where the CSV files are stored.
+    Returns:
+        PowerData: An object containing all loaded power consumption data.
+    """
+    data_path = Path(data_dir)
+
+    data = PowerData(gpus={})
+    for gpu_type in GPUType:
+        data.gpus[gpu_type] = PowerGPUTypeData(gpu_type=gpu_type)
+
+        # Flux power profile
+        power_flux_file_name = data_path / f'power_flux_mapping_{gpu_type.value.lower()}.csv'
+        power_flux_df = pd.read_csv(power_flux_file_name, comment='#')
+        data[gpu_type].flux = dict(zip(
+            power_flux_df['world_size'],
+            power_flux_df['power_watts']))
+
+        # Hunyuan Framepack 640x400 power profile
+        power_hf_file_name = data_path / f'power_hf_mapping_{gpu_type.value.lower()}.csv'
+        power_hf_df = pd.read_csv(power_hf_file_name, comment='#')
+        data[gpu_type].hf = dict(zip(
+            power_hf_df['world_size'],
+            power_hf_df['power_watts']))
+
+        # Hunyuan Framepack 1280x800 power profile
+        power_hf_file_name_high = data_path / f'power_hf_mapping_{gpu_type.value.lower()}_high.csv'
+        power_hf_high_df = pd.read_csv(power_hf_file_name_high, comment='#')
+        data[gpu_type].hf_high = dict(zip(
+            power_hf_high_df['world_size'],
+            power_hf_high_df['power_watts']))
+
+        # Hunyuan Framepack VAE power profile
+        power_hf_vae_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}.csv'
+        power_hf_vae_df = pd.read_csv(power_hf_vae_file_name, comment='#')
+        data[gpu_type].hf_vae = dict(zip(
+            power_hf_vae_df['world_size'],
+            power_hf_vae_df['power_watts']))
+
+        # Hunyuan Framepack VAE high power profile
+        power_hf_vae_high_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}_high.csv'
+        power_hf_vae_high_df = pd.read_csv(power_hf_vae_high_file_name, comment='#')
+        data[gpu_type].hf_vae_high = dict(zip(
+            power_hf_vae_high_df['world_size'],
+            power_hf_vae_high_df['power_watts']))
+
+        # Fantasy Talking 640x400 power profile
+        power_ft_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}.csv'
+        power_ft_df = pd.read_csv(power_ft_file_name, comment='#')
+        data[gpu_type].ft = dict(zip(
+            power_ft_df['world_size'],
+            power_ft_df['power_watts']))
+
+        # Fantasy Talking 1280x800 power profile
+        power_ft_high_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}_high.csv'
+        power_ft_high_df = pd.read_csv(power_ft_high_file_name, comment='#')
+        data[gpu_type].ft_high = dict(zip(
+            power_ft_high_df['world_size'],
+            power_ft_high_df['power_watts']))
+
+        # Fantasy Talking VAE mapping
+        power_ft_vae_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}.csv'
+        power_ft_vae_df = pd.read_csv(power_ft_vae_file_name, comment='#')
+        data[gpu_type].ft_vae = dict(zip(
+            power_ft_vae_df['world_size'],
+            power_ft_vae_df['power_watts']))
+
+        # Fantasy Talking VAE high mapping
+        power_ft_vae_high_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}_high.csv'
+        power_ft_vae_high_df = pd.read_csv(power_ft_vae_high_file_name, comment='#')
+        data[gpu_type].ft_vae_high = dict(zip(
+            power_ft_vae_high_df['world_size'],
+            power_ft_vae_high_df['power_watts']))
+
+        # Upscaler power profile
+        power_upscaler_file_name = data_path / f'power_upscaler_{gpu_type.value.lower()}.csv'
+        power_upscaler_df = pd.read_csv(power_upscaler_file_name, comment='#')
+        data[gpu_type].upscaler = dict(zip(
+            power_upscaler_df['world_size'],
+            power_upscaler_df['power_watts']))
+
+        # Gemma power profile
+        power_gemma_first_scene_file_name = data_path / f'power_gemma_first_scene_{gpu_type.value.lower()}.csv'
+        power_gemma_per_scene_file_name = data_path / f'power_gemma_per_scene_{gpu_type.value.lower()}.csv'
+        power_gemma_first_scene_df = pd.read_csv(power_gemma_first_scene_file_name, comment='#')
+        power_gemma_per_scene_df = pd.read_csv(power_gemma_per_scene_file_name, comment='#')
+        data[gpu_type].gemma_first_scene = dict(zip(
+            power_gemma_first_scene_df['world_size'],
+            power_gemma_first_scene_df['power_watts']
+        ))
+        data[gpu_type].gemma_per_scene = dict(zip(
+            power_gemma_per_scene_df['world_size'],
+            power_gemma_per_scene_df['power_watts']
+        ))
+
+    # Idle and TDP power profiles
+    for gpu_type in GPUType:
+        data[gpu_type].idle = POWER_GPU_IDLE[gpu_type]
+        data[gpu_type].tdp = POWER_GPU_TDP[gpu_type]
+
+    return data
+
+
+def load_adaptive_quality_data(
+    data_dir: str,
+    level: QualityLevel,
+) -> LatencyData:
+    """
+    Load the latency data and scale it for an adaptive quality level.
+
+    The base latencies come from load_latency_data(). For MEDIUM and LOW, the
+    flux, hf, hf_vae, ft, ft_vae, and upscaler latencies of every GPU type are
+    multiplied by the ratio between the number of pixels at that level and
+    the number of pixels at the original level. The VAE profiles use the
+    pixel counts of their parent model (HF or FT). Every other profile is
+    left as loaded.
+
+    The scaling is linear: a level with half the pixels of the original
+    gets half the latency.
+
+    Args:
+        data_dir: Directory forwarded to load_latency_data().
+        level: Quality level to scale the latencies for.
+
+    Returns:
+        The loaded LatencyData. It is returned unscaled for ORIGINAL, HIGH,
+        and any level other than MEDIUM or LOW.
+
+    Raises:
+        AssertionError: If "level" is not a QualityLevel.
+    """
+    assert isinstance(level, QualityLevel)
+
+    latency_data = load_latency_data(data_dir=data_dir)
+
+    # Pixel counts at the requested level, keyed by the latency profile they scale
+    if level == QualityLevel.MEDIUM:
+        level_pixels = {
+            'flux': NUM_PIXELS_MEDIUM_FLUX,
+            'hf': NUM_PIXELS_MEDIUM_HF,
+            'hf_vae': NUM_PIXELS_MEDIUM_HF,
+            'ft': NUM_PIXELS_MEDIUM_FT,
+            'ft_vae': NUM_PIXELS_MEDIUM_FT,
+            'upscaler': NUM_PIXELS_MEDIUM_UPSCALER,
+        }
+    elif level == QualityLevel.LOW:
+        level_pixels = {
+            'flux': NUM_PIXELS_LOW_FLUX,
+            'hf': NUM_PIXELS_LOW_HF,
+            'hf_vae': NUM_PIXELS_LOW_HF,
+            'ft': NUM_PIXELS_LOW_FT,
+            'ft_vae': NUM_PIXELS_LOW_FT,
+            'upscaler': NUM_PIXELS_LOW_UPSCALER,
+        }
+    else:
+        # ORIGINAL, HIGH, and any other level use the latencies as loaded
+        return latency_data
+
+    # Pixel counts at the original level, keyed like level_pixels
+    original_pixels = {
+        'flux': NUM_PIXELS_ORIGINAL_FLUX,
+        'hf': NUM_PIXELS_ORIGINAL_HF,
+        'hf_vae': NUM_PIXELS_ORIGINAL_HF,
+        'ft': NUM_PIXELS_ORIGINAL_FT,
+        'ft_vae': NUM_PIXELS_ORIGINAL_FT,
+        'upscaler': NUM_PIXELS_ORIGINAL_UPSCALER,
+    }
+
+    # Compute each ratio once: it does not depend on the GPU type
+    ratios = {
+        profile: level_pixels[profile] / original_pixels[profile]
+        for profile in level_pixels
+    }
+
+    # Replace each scaled profile with a new dict that keeps the original keys
+    for gpu_type in GPUType:
+        for profile, ratio in ratios.items():
+            scaled = {
+                k: v * ratio
+                for k, v in getattr(latency_data[gpu_type], profile).items()
+            }
+            setattr(latency_data[gpu_type], profile, scaled)
+
+    return latency_data
diff --git a/simulator/evaluator.py b/simulator/evaluator.py
new file mode 100644
index 00000000..a9730bb2
--- /dev/null
+++ b/simulator/evaluator.py
@@ -0,0 +1,414 @@
+"""
+Evaluate the performance of a given model allocation in terms of time, energy, and cost.
+It includes some assertions (e.g., only one instance of Gemma and Flux).
+"""
+from __future__ import annotations
+
+import math
+import logging
+
+from typing import Optional
+
+from constants import NUM_GPUS_PER_SERVER
+from constants import TOTAL_INPUT_TOKENS
+from constants import SECONDS_IN_HOUR
+
+from sim_types import Result
+from sim_types import GPUType
+from sim_types import WorkflowConfig
+from sim_types import PowerData
+from sim_types import LatencyData
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import Policy
+
+from sim_types_json import models_to_json
+from sim_types_json import workflow_to_json
+from sim_types_json import policy_to_json
+
+
+def _count_instances(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    model: Model,
+) -> int:
+    """Count the allocations of "model", over all GPU types, that hold at least one GPU."""
+    return sum(
+        1
+        for allocations_by_model in models.values()
+        for allocation in allocations_by_model.get(model, [])
+        if allocation.get_num_gpus() > 0
+    )
+
+
+def _assert_single_instance(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    model: Model,
+) -> None:
+    num_instances = _count_instances(models, model)  # Allocations without GPUs are not counted
+    assert num_instances == 1, f"Expected exactly one instance of {model}, but found {num_instances}"
+
+
+def _assert_at_least_one_instance(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    model: Model,
+) -> None:
+    num_instances = _count_instances(models, model)  # Allocations without GPUs are not counted
+    assert num_instances > 0, f"Expected at least one instance of {model}, but found {num_instances}"
+
+
+def _assert_no_instances(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    model: Model,
+) -> None:
+    num_instances = _count_instances(models, model)  # Allocations without GPUs are not counted
+    assert num_instances == 0, f"Expected no instances of {model}, but found {num_instances}"
+
+
+def evaluate_times(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    latency_data: LatencyData,
+    workflow: WorkflowConfig,
+    policy: Policy,
+    include_models: Optional[list[Model]] = None,
+) -> None:
+    """
+    Call calculate_time() and calculate_time_first() on the allocations of each workflow model.
+    It only evaluates the models specified in "include_models" if provided.
+    """
+    gpu_types = list(models.keys())
+
+    upscaler_gpus = sum(
+        model_alloc.get_num_gpus()
+        for gpu_type in gpu_types
+        for model_alloc in models.get(gpu_type, {}).get(Model.UPSCALER, [])
+    )
+    if not policy.use_upscaler:
+        assert upscaler_gpus == 0  # The upscaler must hold no GPUs when the policy does not use it
+
+    for model_name in workflow.models:
+        if include_models is not None and model_name not in include_models:
+            continue
+
+        # Special conditions: models that require a policy flag
+        if model_name == Model.HF_VAE and not policy.is_disaggregated(Model.HF):
+            _assert_no_instances(models, Model.HF_VAE)
+            continue
+        if model_name == Model.FT_VAE and not policy.is_disaggregated(Model.FT):
+            _assert_no_instances(models, Model.FT_VAE)
+            continue
+        if model_name == Model.UPSCALER and not policy.use_upscaler:
+            _assert_no_instances(models, Model.UPSCALER)
+            continue
+
+        _assert_at_least_one_instance(models, model_name)
+
+        if not workflow.is_parallelizable(model_name):
+            # Single-instance: no work splitting
+            for gpu_type in gpu_types:
+                if model_name in models[gpu_type]:
+                    for model_alloc in models[gpu_type][model_name]:
+                        model_alloc.calculate_time(
+                            policy, workflow, latency_data)
+                        model_alloc.calculate_time_first(
+                            policy, workflow, latency_data)
+            continue
+
+        # Parallel: capacity-based work splitting (throughput-weighted)
+        capacities: dict[GPUType, list[float]] = {}
+        for gpu_type in gpu_types:
+            capacities[gpu_type] = []
+            if model_name not in models[gpu_type]:
+                continue
+            for model_alloc in models[gpu_type][model_name]:
+                if model_alloc.get_num_gpus() > 0:
+                    latency = latency_data[gpu_type][model_name, model_alloc.devices]
+                    # When not disaggregated, include VAE overhead in capacity
+                    if model_name == Model.FT and not policy.is_disaggregated(Model.FT):
+                        latency += latency_data[gpu_type][Model.FT_VAE, 1] / workflow.num_steps[Model.FT]
+                    if model_name == Model.HF and not policy.is_disaggregated(Model.HF):
+                        latency += latency_data[gpu_type][Model.HF_VAE, 1] / workflow.num_steps[Model.HF]
+                    if model_name in (Model.HF, Model.HF_VAE, Model.FT, Model.FT_VAE):
+                        latency *= workflow.get_resolution_scale(policy.use_upscaler)
+                    if model_name == Model.GEMMA:
+                        latency *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
+                    if latency == 0:
+                        capacities[gpu_type].append(0.0)  # Avoid dividing by a zero latency
+                    else:
+                        capacities[gpu_type].append(model_alloc.replicas / latency)
+
+        total_capacity = sum(sum(c) for c in capacities.values())
+        for gpu_type in gpu_types:
+            if model_name not in models[gpu_type]:
+                continue
+            cap_idx = 0  # capacities[gpu_type] only has entries for the allocations that hold GPUs
+            for model_alloc in models[gpu_type][model_name]:
+                if model_alloc.get_num_gpus() > 0:
+                    work_pct = capacities[gpu_type][cap_idx] / total_capacity if total_capacity > 0 else 0.0
+                    model_alloc.calculate_time(
+                        policy, workflow, latency_data,
+                        work_pct=work_pct)
+                    model_alloc.calculate_time_first(
+                        policy, workflow, latency_data)
+                    cap_idx += 1
+
+
+def evaluate_energy(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    power_data: PowerData,
+    workflow: WorkflowConfig,
+    total_time_s: float = 0.0,
+) -> None:
+    """
+    Call calculate_energy() on every allocation with the workflow, power data, and total time.
+    Need to run after evaluate_times since energy calculation depends on time.
+    """
+    for gpu_type_allocs in models.values():
+        for model_allocation_list in gpu_type_allocs.values():
+            for model_allocation in model_allocation_list:
+                model_allocation.calculate_energy(
+                    workflow,
+                    power_data,
+                    total_time_s)
+
+
+def evaluate_cost(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    total_time_s: float,
+    policy: Policy,
+) -> None:
+    """
+    Call calculate_cost() on every allocation with the policy and the total time.
+    It needs to run after evaluate_times since the cost calculation depends on time.
+    """
+    for allocations_by_model in models.values():
+        for allocations in allocations_by_model.values():
+            for allocation in allocations:
+                allocation.calculate_cost(policy, total_time_s)
+
+
+_EVALUATOR_CACHE: dict[str, Result] = {}
+
+
+def evaluate_model_allocation(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    num_gpus: dict[GPUType, int],
+    workflow: WorkflowConfig,
+    latency_data: LatencyData,
+    power_data: Optional[PowerData],
+    policy: Policy,
+    include_models: Optional[list[Model]] = None,
+    cache_results: bool = False,
+    round_up_cost_to_server: bool = False,
+) -> Result:
+    """
+    Evaluate the metrics (time, TTFF, TBF, energy, and cost) for a given allocation of models to GPUs.
+    It only evaluates the models in "include_models" if specified.
+    """
+    cache_key = None
+    if cache_results:
+        cache_key = models_to_json(models) + \
+            workflow_to_json(workflow) + \
+            str(latency_data) + \
+            str(power_data) + \
+            policy_to_json(policy) + \
+            str(include_models)
+        if cache_key in _EVALUATOR_CACHE:
+            return _EVALUATOR_CACHE[cache_key]  # Reuse the Result stored under the same key
+
+    # Check if setup is possible
+    gpus_used = {}
+    for gpu_type, model_gpu in models.items():
+        gpus_used[gpu_type] = calc_used_gpus({gpu_type: model_gpu})
+        assert num_gpus[gpu_type] % NUM_GPUS_PER_SERVER[gpu_type] == 0, \
+            f"{gpu_type.value}: {num_gpus[gpu_type]} % {NUM_GPUS_PER_SERVER[gpu_type]}"
+        assert gpus_used[gpu_type] <= num_gpus[gpu_type], \
+            f"{gpu_type.value}: {gpus_used[gpu_type]} > {num_gpus[gpu_type]}"
+
+    # Assert input models are built correctly
+    for gpu_type, allocations_by_model in models.items():
+        for model_name, allocations in allocations_by_model.items():
+            for allocation in allocations:
+                assert allocation.model == model_name
+                assert allocation.gpu_type == gpu_type
+
+    # Actual evaluation
+    evaluate_times(
+        models, latency_data, workflow, policy,
+        include_models=include_models,
+    )
+    time_s = calc_total_time(models)
+
+    first_chunk_time = calc_ttff(models)
+    ttff_s = max(
+        first_chunk_time,
+        time_s - workflow.total_video_seconds
+    )
+
+    num_frames = (workflow.total_frames[Model.FT] - workflow.per_subscene_frames[Model.FT])
+    tbf_s = (time_s - first_chunk_time) / num_frames if num_frames > 0 else 0.0  # No frames after the first chunk
+    if tbf_s < 0:
+        logging.debug(
+            "Negative TBF: %.2f = (%.2f - %.2f) / %s",
+            tbf_s, time_s, first_chunk_time, num_frames)
+        tbf_s = 0.0
+
+    # Calculate total energy (power * time * replicas for each model)
+    energy = 0.0
+    if power_data is not None:
+        evaluate_energy(models, power_data, workflow, time_s)
+        energy = calc_energy(models=models)
+
+    evaluate_cost(models, time_s, policy)
+    cost = calc_cost(
+        models, time_s, policy,
+        round_up_to_server=round_up_cost_to_server)
+
+    ret = Result(
+        models=models,
+        gpus_used=gpus_used,
+        gpus_total=num_gpus,
+        total_time_s=time_s,
+        first_chunk_time=first_chunk_time,
+        ttff_s=ttff_s,
+        tbf_s=tbf_s,
+        total_energy=energy if power_data else 0.0,
+        cost=cost,
+    )
+
+    if cache_key is not None:
+        _EVALUATOR_CACHE[cache_key] = ret
+
+    return ret
+
+
+def calc_energy(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> float:
+    """
+    Sum the "energy" attribute of every allocation in "models".
+    Energy in Watt x seconds (Joules).
+    This assumes that evaluate_energy() has been called already.
+    """
+    energy = 0.0  # Total energy in Watt-seconds (Joules = Watt x second)
+    for model_dict in models.values():
+        for model_allocations in model_dict.values():
+            for model_allocation in model_allocations:
+                energy += model_allocation.energy
+    return energy
+
+
+def calc_model_cost(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> float:
+    """
+    Sum the "cost" attribute of every allocation in "models".
+    This assumes that evaluate_cost() has been called already.
+    """
+    costs = {}
+    for gpu_type, model_dict in models.items():
+        costs[gpu_type] = 0.0  # Subtotal for this GPU type
+        for model_allocations in model_dict.values():
+            for model_allocation in model_allocations:
+                costs[gpu_type] += model_allocation.cost
+    return sum(costs.values())
+
+
+def calc_cost(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    time_s: float,
+    policy: Policy,
+    round_up_to_server: bool = True,
+) -> float:
+    """
+    Cost of the GPUs used by "models" for "time_s" seconds, optionally rounded up to whole servers.
+    """
+    gpus_per_type = calc_used_gpus_per_type(models)
+
+    # We pay for whole servers, so round each type up to a multiple of its server size
+    if round_up_to_server:
+        for gpu_type, used in gpus_per_type.items():
+            server_size = NUM_GPUS_PER_SERVER[gpu_type]
+            gpus_per_type[gpu_type] = math.ceil(used / server_size) * server_size
+
+    return calc_cost_total(gpus_per_type, time_s, policy)
+
+
+def calc_cost_total(
+    num_gpus: dict[GPUType, int],
+    time_s: float,
+    policy: Policy,
+) -> float:
+    """
+    Cost of running every GPU in "num_gpus" for "time_s" seconds at the per-hour rates in policy.gpu_cost.
+    Every GPU in "num_gpus" is charged, so it includes the idle GPUs not assigned to a model.
+    """
+    cost = 0.0
+    for gpu_type, num in num_gpus.items():
+        cost += num * (time_s / SECONDS_IN_HOUR) * policy.gpu_cost[gpu_type]
+    return cost
+
+
+def calc_used_gpus_per_type(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> dict[GPUType, int]:
+    """
+    Return, for each GPU type in "models", the number of GPUs held by all of its allocations.
+    """
+    return {
+        gpu_type: sum(
+            allocation.get_num_gpus()
+            for allocations in allocations_by_model.values()
+            for allocation in allocations)
+        for gpu_type, allocations_by_model in models.items()
+    }
+
+
+def calc_used_gpus(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> int:
+    """
+    Return the number of GPUs held by every allocation in "models",
+    added up over all GPU types.
+    """
+    return sum(calc_used_gpus_per_type(models).values())
+
+
+def calc_total_time(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> float:
+    """
+    Add up, over all models, the longest "time" among the allocations of each model.
+    This assumes that evaluate_times() has been called already.
+    """
+    total_time_secs = 0.0
+    for model_name in Model:
+        model_alloc_times = [
+            model_alloc.time
+            for gpu_type in GPUType
+            if gpu_type in models and model_name in models[gpu_type]
+            for model_alloc in models[gpu_type][model_name]
+        ]
+        model_time = max(model_alloc_times) if model_alloc_times else 0.0  # Models without allocations add nothing
+        total_time_secs += model_time
+    return total_time_secs
+
+
+def calc_ttff(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> float:
+    """
+    Calculate time to first frame (chunk).
+    It adds up, over all models, the smallest "time_first" among the allocations that hold GPUs.
+    This assumes that evaluate_times() has been called already.
+    """
+    fastest_per_model = []
+    for model_name in Model:
+        times_first = [
+            allocation.time_first
+            for allocations_by_model in models.values()
+            for allocation in allocations_by_model.get(model_name, [])
+            if allocation.get_num_gpus() > 0
+        ]
+        if times_first:
+            fastest_per_model.append(min(times_first))  # The fastest model determines TTFF
+    return sum(fastest_per_model)
diff --git a/simulator/model_allocator.py b/simulator/model_allocator.py
new file mode 100644
index 00000000..0f773a51
--- /dev/null
+++ b/simulator/model_allocator.py
@@ -0,0 +1,282 @@
+"""
+Defines the ModelAllocator abstract base class and its interface for model allocation strategies.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from abc import ABC
+from abc import abstractmethod
+
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import Policy
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import Result
+
+from models import FluxModelAllocation
+from models import GemmaModelAllocation
+from models import HFModelAllocation
+from models import HFVAEModelAllocation
+from models import FTModelAllocation
+from models import FTVAEModelAllocation
+from models import UpscalerModelAllocation
+from models import OthersModelAllocation
+
+from model_provisioner.policies import NAIVE_POLICY
+
+
+class ModelAllocator(ABC):
+    """
+    Abstract base class for model allocators; subclasses implement allocate().
+    """
+
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = NAIVE_POLICY,
+    ) -> None:
+        self.workflow = workflow
+        self.latency_data = latency_data
+        self.power_data = power_data
+        self.policy = policy
+
+    @abstractmethod
+    def allocate(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+    ) -> Result:
+        """Allocate models to GPUs and return the provisioning result."""
+        ...
+
+    def _init_single_server_models(
+        self,
+        gpu_type: GPUType,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """
+        Initialize model allocations for a single server (8 GPUs or fewer).
+        Each model gets a single allocation entry.
+        """
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
+            gpu_type: {
+                Model.GEMMA: [
+                    GemmaModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1)
+                ],
+                Model.FLUX: [
+                    FluxModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1)
+                ],
+                Model.HF: [
+                    HFModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=2)  # One replica moves to UPSCALER below if the policy uses it
+                ],
+                Model.HF_VAE: [
+                    HFVAEModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1)
+                ],
+                Model.FT: [
+                    FTModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1)
+                ],
+                Model.FT_VAE: [
+                    FTVAEModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1)
+                ],
+                Model.UPSCALER: [
+                    UpscalerModelAllocation(
+                        gpu_type=gpu_type)  # Gains a replica below only if the policy uses the upscaler
+                ],
+                Model.OTHERS: [
+                    OthersModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1)  # + 1 for Kokoro/YOLO
+                ],
+            },
+        }
+
+        if self.policy.use_upscaler:
+            # HF -> UPSCALER
+            models[gpu_type][Model.HF][0].replicas -= 1
+            models[gpu_type][Model.UPSCALER][0].replicas += 1
+
+        if not self.policy.is_disaggregated(Model.HF):
+            # HF_VAE -> HF
+            models[gpu_type][Model.HF_VAE][0].replicas -= 1
+            models[gpu_type][Model.HF][0].replicas += 1
+        if not self.policy.is_disaggregated(Model.FT):
+            # FT_VAE -> FT
+            models[gpu_type][Model.FT_VAE][0].replicas -= 1
+            models[gpu_type][Model.FT][0].replicas += 1
+
+        self._zero_out_unused_models(models)
+        return models
+
+    def _init_single_device_models(
+        self,
+        gpu_type: GPUType,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """
+        Initialize model allocations for a single GPU type with >8 GPUs.
+        Each model gets two allocation entries (active and inactive).
+        """
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
+            gpu_type: {
+                Model.GEMMA: [
+                    GemmaModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1),
+                    GemmaModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.FLUX: [
+                    FluxModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1),
+                    FluxModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.HF: [
+                    HFModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1),
+                    HFModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.HF_VAE: [
+                    HFVAEModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1),
+                    HFVAEModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.FT: [
+                    FTModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=2, replicas=1),
+                    FTModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.FT_VAE: [
+                    FTVAEModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1),
+                    FTVAEModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.UPSCALER: [
+                    UpscalerModelAllocation(
+                        gpu_type=gpu_type),
+                    UpscalerModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+                Model.OTHERS: [
+                    OthersModelAllocation(
+                        gpu_type=gpu_type,
+                        devices=1, replicas=1),
+                    OthersModelAllocation(
+                        gpu_type=gpu_type),
+                ],
+            },
+        }
+
+        if self.policy.use_upscaler:
+            models[gpu_type][Model.UPSCALER][0].replicas = 1
+
+        if not self.policy.is_disaggregated(Model.HF):
+            # HF_VAE -> HF
+            models[gpu_type][Model.HF_VAE][0].replicas -= 1
+            models[gpu_type][Model.HF][0].replicas += 1
+        if not self.policy.is_disaggregated(Model.FT):
+            # FT_VAE -> FT
+            models[gpu_type][Model.FT_VAE][0].replicas -= 1
+            models[gpu_type][Model.FT][0].replicas += 1
+
+        self._zero_out_unused_models(models)
+        return models
+
+    def _init_both_devices_models(
+        self,
+        gpu_type1: GPUType,
+        gpu_type2: GPUType,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """
+        Initialize model allocations for two GPU types.
+        gpu_type1 gets GEMMA, FLUX, OTHERS; gpu_type2 gets HF, VAE, FT, UPSCALER.
+        """
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
+            gpu_type1: {
+                Model.GEMMA: [GemmaModelAllocation(
+                    gpu_type=gpu_type1,
+                    devices=1, replicas=1)],
+                Model.FLUX: [FluxModelAllocation(
+                    gpu_type=gpu_type1,
+                    devices=1, replicas=1)],
+                Model.HF: [],
+                Model.HF_VAE: [],
+                Model.FT: [],
+                Model.FT_VAE: [],
+                Model.UPSCALER: [],
+                Model.OTHERS: [OthersModelAllocation(
+                    gpu_type=gpu_type1,
+                    devices=1, replicas=1)],  # + 1 for Kokoro/YOLO
+            },
+            gpu_type2: {
+                Model.GEMMA: [],
+                Model.FLUX: [],
+                Model.HF: [HFModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=1, replicas=1)],
+                Model.HF_VAE: [HFVAEModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=1, replicas=1)],
+                Model.FT: [FTModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=2, replicas=1)],
+                Model.FT_VAE: [FTVAEModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=1, replicas=1)],
+                Model.UPSCALER: [UpscalerModelAllocation(
+                    gpu_type=gpu_type2)],
+                Model.OTHERS: [],
+            },
+        }
+
+        if not self.policy.is_disaggregated(Model.HF):
+            # HF_VAE -> HF
+            models[gpu_type2][Model.HF_VAE][0].replicas -= 1
+            models[gpu_type2][Model.HF][0].replicas += 1
+        if not self.policy.is_disaggregated(Model.FT):
+            # FT_VAE -> FT
+            models[gpu_type2][Model.FT_VAE][0].replicas -= 1
+            models[gpu_type2][Model.FT][0].replicas += 1
+
+        if self.policy.use_upscaler:
+            models[gpu_type2][Model.UPSCALER][0].replicas = 1
+
+        self._zero_out_unused_models(models)
+        return models
+
+    def _zero_out_unused_models(
+        self,
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+    ) -> None:
+        """Set replicas to 0, in place, for every allocation of a model that is not in the workflow."""
+        for allocs_by_model in models.values():
+            for model, allocs in allocs_by_model.items():  # Only the entries present: no KeyError
+                if model not in self.workflow.models:
+                    for alloc in allocs:
+                        alloc.replicas = 0
diff --git a/simulator/models.py b/simulator/models.py
new file mode 100644
index 00000000..9a56ab79
--- /dev/null
+++ b/simulator/models.py
@@ -0,0 +1,811 @@
+"""
+Contains the definition for each model.
+It includes the calculations for time, energy, and cost.
+"""
+from __future__ import annotations
+
+import math
+
+from typing import override
+from typing import Callable
+from typing import Optional
+from typing import Type
+from typing import ClassVar
+
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import ModelAllocation
+from sim_types import Model
+from sim_types import Policy
+from sim_types import QualityLevel
+from sim_types import WorkflowConfig
+from sim_types import GPUType
+
+from constants import TOTAL_INPUT_TOKENS
+
+
+# ModelAllocation Factory
+ModelAllocationCls = Type[ModelAllocation]
+
+_MODEL_ALLOCATION_REGISTRY: dict[Model, ModelAllocationCls] = {}
+
+
+def register_model(
+    model: Model
+) -> Callable[[ModelAllocationCls], ModelAllocationCls]:
+    """Return a class decorator that stores the class in the registry under ``model``."""
+    def _record(allocation_cls: ModelAllocationCls) -> ModelAllocationCls:
+        _MODEL_ALLOCATION_REGISTRY[model] = allocation_cls
+        return allocation_cls
+    return _record
+
+
+def get_model_allocation(
+    *,
+    model: Model,
+    gpu_type: GPUType,
+    devices: int = 1,
+    replicas: int = 0,
+) -> ModelAllocation:
+    """Instantiate the registered allocation class for ``model``.
+
+    All arguments are keyword-only. Raises ValueError when no class has
+    been registered for ``model``.
+    """
+    allocation_cls = _MODEL_ALLOCATION_REGISTRY.get(model)
+    if allocation_cls is None:
+        raise ValueError(f"No ModelAllocation for model {model}")
+    return allocation_cls(gpu_type=gpu_type, devices=devices, replicas=replicas)
+
+
+def _calculate_total_time(
+    total_work: float,
+    num_replicas: int,
+    time_per_work: float,
+) -> float:
+    """Return the time to finish ``total_work`` units split evenly over replicas."""
+    if num_replicas <= 0:
+        return 0.0
+    # Work is divided across replicas, but a single unit cannot be split, so
+    # the result is floored at the duration of one work unit.
+    spread_time = (total_work / num_replicas) * time_per_work
+    return max(spread_time, time_per_work)
+
+
+def assert_pixel_config(
+    workflow: WorkflowConfig
+) -> None:
+    """Assert that MEDIUM resolution is positive and below HIGH; ``workflow`` is not read."""
+    from sim_types import RESOLUTION_PIXELS  # imported locally, only needed for this check
+    assert 0 < RESOLUTION_PIXELS[QualityLevel.MEDIUM] < RESOLUTION_PIXELS[QualityLevel.HIGH]
+
+
+@register_model(Model.GEMMA)
+class GemmaModelAllocation(ModelAllocation):
+    """Gemma model allocation."""
+    model: ClassVar[Model] = Model.GEMMA
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.GEMMA, 1)
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if self.get_num_gpus() == 0:  # nothing allocated: store and return zero time
+            self.time = 0.0
+            return self.time
+        latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices]
+        latency_per_scene = latency_data[self.gpu_type].gemma_per_scene[self.devices]
+        latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS  # scale by input size
+        latency_per_scene *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
+        total_work = workflow.model_work.get(Model.GEMMA, 1)  # defaults to one work unit
+        if total_work > 1:
+            num_scenes = math.ceil(work_pct * total_work)  # NOTE(review): 0 if work_pct == 0; divides below
+            total_time_per_scene = latency_first + latency_per_scene * (num_scenes - 1)  # all scenes, serial
+            self.time = _calculate_total_time(
+                num_scenes,
+                self.replicas,
+                total_time_per_scene / num_scenes)  # average time per scene
+        else:  # single work unit: work_pct and replicas are not applied on this path
+            self.time = latency_first + latency_per_scene * (workflow.total_scenes - 1)
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+        latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices]
+        latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
+        self.time_first = latency_first
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:  # no GPUs or no power profile
+            self.energy = 0.0
+            return self.energy
+        # Active energy, derived from the stored self.time and self.time_first values
+        latency_first = self.time_first
+        latency_per_scene = max(0.0, self.time - latency_first)  # NOTE(review): whole remainder, not per scene
+        power_first = power_data[self.gpu_type].gemma_first_scene[self.devices]
+        power_per_scene = power_data[self.gpu_type].gemma_per_scene[self.devices]
+        self.energy = \
+            power_first * latency_first + \
+            power_per_scene * latency_per_scene * (workflow.total_scenes - 1)
+        # Idle energy for the part of total_time_s beyond this model's own time
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:  # NOTE(review): remainder above is scaled by (total_scenes - 1) again -- confirm
+            self.energy += power_idle * time_idle
+        return self.energy
+
+
+@register_model(Model.FLUX)
+class FluxModelAllocation(ModelAllocation):
+    """Flux model allocation."""
+    model: ClassVar[Model] = Model.FLUX
+
+    def _calc_time_per_scene(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        return (
+            latency_data[self.gpu_type][self.model, self.devices]
+            * workflow.num_steps[Model.FLUX]
+        )
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.FLUX, 1)
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time = 0.0
+            return self.time
+        time_per_scene = self._calc_time_per_scene(
+            policy,
+            workflow,
+            latency_data,
+        )
+        total_work = workflow.model_work.get(Model.FLUX, 1)
+        if total_work > 1:
+            num_scenes = math.ceil(work_pct * total_work)
+            self.time = _calculate_total_time(
+                num_scenes,
+                self.replicas,
+                time_per_scene)
+        else:
+            self.time = time_per_scene
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+        time_per_scene = self._calc_time_per_scene(
+            policy,
+            workflow,
+            latency_data,
+        )
+        self.time_first = time_per_scene
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        power_flux = power_data[self.gpu_type][Model.FLUX, self.devices]
+        self.energy = power_flux * self.time * self.replicas
+        # Idle energy
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:
+            self.energy += power_idle * time_idle
+        return self.energy
+
+
+@register_model(Model.HF)
+class HFModelAllocation(ModelAllocation):
+    """HunyuanFramePack model allocation."""
+    model: ClassVar[Model] = Model.HF
+
+    def _calc_time_per_frame(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        return (
+            latency_data[self.gpu_type][self.model, self.devices]
+            * workflow.get_resolution_scale(policy.use_upscaler)
+            * workflow.num_steps[Model.HF]
+        )
+
+    def _calc_time_per_subscene(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        return (
+            workflow.per_subscene_frames[Model.HF]
+            / workflow.hf_frames[workflow.frames_per_step_idx]
+            * latency_data[self.gpu_type][self.model, self.devices]
+            * workflow.get_resolution_scale(policy.use_upscaler)  # latency_ratio
+            * workflow.num_steps[Model.HF]
+        )
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time = 0.0
+            return self.time
+
+        hf_time_per_subscene = self._calc_time_per_subscene(
+            policy,
+            workflow,
+            latency_data,
+        )
+        self.time = _calculate_total_time(
+            math.ceil(work_pct * workflow.total_subscenes),
+            self.replicas,
+            hf_time_per_subscene)
+
+        if not policy.is_disaggregated(Model.HF):
+            # Include VAE time in the same GPU when disaggregation is disabled
+            hf_vae_time_per_frame = (
+                latency_data[self.gpu_type][Model.HF_VAE, 1]  # VAE is single-device only in current policy
+                * workflow.get_resolution_scale(policy.use_upscaler)
+                / workflow.hf_frames[workflow.frames_per_step_idx]
+            )
+            self.time += _calculate_total_time(
+                math.ceil(work_pct * workflow.total_frames[Model.HF]),
+                self.replicas,
+                hf_vae_time_per_frame)
+
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+
+        if policy.is_disaggregated(Model.HF):
+            # HF for the first chunk
+            self.time_first = min(
+                # Option 1: the first few frames until the first chunk is done
+                workflow.hf_frames[0]
+                / workflow.hf_frames[workflow.frames_per_step_idx]
+                * self._calc_time_per_frame(
+                    policy,
+                    workflow,
+                    latency_data
+                ),
+                # Option 2: the full subscene
+                self._calc_time_per_subscene(
+                    policy,
+                    workflow,
+                    latency_data
+                ),
+            )
+        else:
+            # HF + VAE for the full subscene
+            hf_time_per_subscene = self._calc_time_per_subscene(
+                policy,
+                workflow,
+                latency_data)
+            hf_vae_time_per_subscene = (
+                workflow.per_subscene_frames[Model.HF]
+                / workflow.hf_frames[workflow.frames_per_step_idx]
+                * latency_data[self.gpu_type][Model.HF_VAE, 1]  # VAE is single-device only in current policy
+                * workflow.get_resolution_scale(policy.use_upscaler)
+            )
+            self.time_first = hf_time_per_subscene + hf_vae_time_per_subscene
+
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        power_hf = power_data[self.gpu_type][Model.HF, self.devices]
+        self.energy = power_hf * self.time * self.replicas
+        # Idle energy
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:
+            self.energy += power_idle * time_idle
+        return self.energy
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.HF, 1)
+
+
+@register_model(Model.HF_VAE)
+class HFVAEModelAllocation(ModelAllocation):
+    """HunyuanFramePack VAE model allocation."""
+    model: ClassVar[Model] = Model.HF_VAE
+
+    def _calc_time_per_frame(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        return (
+            latency_data[self.gpu_type][Model.HF_VAE, self.devices]
+            * workflow.get_resolution_scale(policy.use_upscaler)
+            / workflow.hf_frames[workflow.frames_per_step_idx]
+        )
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if not policy.is_disaggregated(Model.HF):
+            assert self.get_num_gpus() == 0
+            self.time = 0.0
+            return self.time
+        if self.get_num_gpus() == 0:
+            self.time = 0.0
+            return self.time
+
+        vae_time_per_frame = self._calc_time_per_frame(
+            policy,
+            workflow,
+            latency_data
+        )
+        self.time = _calculate_total_time(
+            math.ceil(workflow.total_frames[Model.HF] * work_pct),
+            self.replicas,
+            vae_time_per_frame)
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if not policy.is_disaggregated(Model.HF):
+            assert self.get_num_gpus() == 0
+            self.time_first = 0.0
+            return self.time_first
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+
+        vae_time_per_frame = self._calc_time_per_frame(
+            policy,
+            workflow,
+            latency_data,
+        )
+        num_frames = workflow.per_subscene_frames[Model.HF]
+        self.time_first = num_frames * vae_time_per_frame
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        self.energy = power_data[self.gpu_type][Model.HF_VAE, self.devices] * self.time * self.replicas
+        # Idle energy
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:
+            self.energy += power_idle * time_idle
+        return self.energy
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.HF_VAE, 1)
+
+
+@register_model(Model.FT)
+class FTModelAllocation(ModelAllocation):
+    """FantasyTalking model allocation."""
+    model: ClassVar[Model] = Model.FT
+
+    def _calc_time_per_subscene(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        return (
+            workflow.per_subscene_frames[Model.FT]
+            / workflow.ft_frames[workflow.frames_per_step_idx]
+            * latency_data[self.gpu_type][Model.FT, self.devices]
+            * workflow.get_resolution_scale(policy.use_upscaler)
+            * workflow.num_steps[Model.FT]
+        )
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time = 0.0
+            return self.time
+
+        ft_time_per_subscene = self._calc_time_per_subscene(
+            policy,
+            workflow,
+            latency_data,
+        )
+        self.time = _calculate_total_time(
+            math.ceil(work_pct * workflow.total_subscenes),
+            self.replicas,
+            ft_time_per_subscene)
+
+        if not policy.is_disaggregated(Model.FT):
+            # Include VAE time in the same GPU when disaggregation is disabled
+            # Note: VAE latency uses devices=1 as VAE processing is not parallelized
+            # across multiple devices in the same way as the main FT diffusion
+            ft_vae_time_per_frame = (
+                latency_data[self.gpu_type][Model.FT_VAE, 1]
+                * workflow.get_resolution_scale(policy.use_upscaler)
+                / workflow.ft_frames[workflow.frames_per_step_idx]
+            )
+            self.time += _calculate_total_time(
+                math.ceil(work_pct * workflow.total_frames[Model.FT]),
+                self.replicas,
+                ft_vae_time_per_frame)
+
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+
+        ft_time_per_subscene = self._calc_time_per_subscene(
+            policy,
+            workflow,
+            latency_data,
+        )
+        self.time_first = ft_time_per_subscene
+
+        if not policy.is_disaggregated(Model.FT):
+            # Include VAE time_first when FT-VAE is not disaggregated
+            # Note: VAE latency uses devices=1 (see note in calculate_time)
+            ft_vae_time_per_subscene = (
+                workflow.per_subscene_frames[Model.FT]
+                / workflow.ft_frames[workflow.frames_per_step_idx]
+                * latency_data[self.gpu_type][Model.FT_VAE, 1]
+                * workflow.get_resolution_scale(policy.use_upscaler)
+            )
+            self.time_first += ft_vae_time_per_subscene
+
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        power_ft = power_data[self.gpu_type][Model.FT, self.devices]
+        self.energy = power_ft * self.time * self.replicas
+        # Idle energy
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:
+            self.energy += power_idle * time_idle
+        return self.energy
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.FT, 1)
+
+
+@register_model(Model.FT_VAE)
+class FTVAEModelAllocation(ModelAllocation):
+    """FantasyTalking VAE model allocation."""
+    model: ClassVar[Model] = Model.FT_VAE
+
+    def _calc_time_per_frame(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        return (
+            latency_data[self.gpu_type][Model.FT_VAE, self.devices]
+            * workflow.get_resolution_scale(policy.use_upscaler)
+            / workflow.ft_frames[workflow.frames_per_step_idx]
+        )
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if not policy.is_disaggregated(Model.FT):
+            assert self.get_num_gpus() == 0
+            self.time = 0.0
+            return self.time
+        if self.get_num_gpus() == 0:
+            self.time = 0.0
+            return self.time
+
+        vae_time_per_frame = self._calc_time_per_frame(
+            policy,
+            workflow,
+            latency_data,
+        )
+        self.time = _calculate_total_time(
+            math.ceil(workflow.total_frames[Model.FT] * work_pct),
+            self.replicas,
+            vae_time_per_frame)
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if not policy.is_disaggregated(Model.FT):
+            assert self.get_num_gpus() == 0
+            self.time_first = 0.0
+            return self.time_first
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+
+        vae_time_per_frame = self._calc_time_per_frame(
+            policy,
+            workflow,
+            latency_data,
+        )
+        num_frames = workflow.per_subscene_frames[Model.FT]
+        self.time_first = num_frames * vae_time_per_frame
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        self.energy = power_data[self.gpu_type][Model.FT_VAE, self.devices] * self.time * self.replicas
+        # Idle energy
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:
+            self.energy += power_idle * time_idle
+        return self.energy
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.FT_VAE, 1)
+
+
+@register_model(Model.UPSCALER)
+class UpscalerModelAllocation(ModelAllocation):
+    """Upscaler model allocation."""
+    model: ClassVar[Model] = Model.UPSCALER
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if self.get_num_gpus() == 0:  # no use_upscaler assertion here, unlike calculate_time_first
+            self.time = 0.0
+            return self.time
+        self.time = _calculate_total_time(
+            math.ceil(work_pct * workflow.total_frames[Model.FT]),  # work = FT frames only (TODO confirm)
+            self.replicas,
+            latency_data[self.gpu_type][self.model, self.devices])  # time per work unit
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if not policy.use_upscaler:
+            assert self.get_num_gpus() == 0
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+
+        self.time_first = (
+            workflow.per_subscene_frames[Model.FT]
+            * latency_data[self.gpu_type][self.model, self.devices]
+        )
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        # Assumes a single device and multiple replicas
+        self.energy = power_data[self.gpu_type][self.model, self.devices] * self.time * self.replicas
+        # Idle energy
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        time_idle = total_time_s - self.time
+        if time_idle > 0:
+            self.energy += power_idle * time_idle
+        return self.energy
+
+    @override
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        return workflow.model_work.get(Model.UPSCALER, 1)
+
+
+@register_model(Model.OTHERS)
+class OthersModelAllocation(ModelAllocation):
+    """Others: Kokoro + YOLO."""
+    model: ClassVar[Model] = Model.OTHERS
+
+    @override
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time = 0.0
+            return self.time
+
+        self.time = (
+            workflow.total_scenes
+            * latency_data[self.gpu_type][self.model, self.devices]
+        )
+        return self.time
+
+    @override
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        if self.get_num_gpus() == 0:
+            self.time_first = 0.0
+            return self.time_first
+
+        self.time_first = latency_data[self.gpu_type][self.model, self.devices]
+        return self.time_first
+
+    @override
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        if self.get_num_gpus() == 0 or power_data is None:
+            self.energy = 0.0
+            return self.energy
+        # Idle energy; not much GPU usage
+        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
+        self.energy = power_idle * self.time
+        return self.energy
diff --git a/simulator/multirequests.py b/simulator/multirequests.py
index 82957c8f..a8d87a8b 100644
--- a/simulator/multirequests.py
+++ b/simulator/multirequests.py
@@ -4,23 +4,23 @@
 import os
 from dataclasses import replace
 
-from model_provisioner.sim_types import GPUType
-from model_provisioner.sim_types import Model
-from model_provisioner.sim_types import QualityLevel
-from model_provisioner.sim_types import RESOLUTION_PIXELS
-from model_provisioner.sim_types import Result
-from model_provisioner.sim_types import WorkflowConfig
-from model_provisioner.sim_types import LatencyData
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import QualityLevel
+from sim_types import RESOLUTION_PIXELS
+from sim_types import Result
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
 
-from model_provisioner.data_loading import load_latency_data
-from model_provisioner.data_loading import load_power_data
-from model_provisioner.data_loading import load_adaptive_quality_data
+from data_loading import load_latency_data
+from data_loading import load_power_data
+from data_loading import load_adaptive_quality_data
 
-from model_provisioner.workflows import PODCAST_WORKFLOW
+from workflows import PODCAST_WORKFLOW
 
 from model_provisioner.policies import STREAMWISE_POLICY
 
-from model_provisioner.auto_model_allocator import AutoModelAllocator
+from auto_model_allocator import AutoModelAllocator
 
 
 # Queries per minute
diff --git a/simulator/plot_utils.py b/simulator/plot_utils.py
index 2ec13de9..4b0d5849 100644
--- a/simulator/plot_utils.py
+++ b/simulator/plot_utils.py
@@ -10,12 +10,12 @@
 
 from typing import Optional
 
-from model_provisioner.utils import get_pareto_frontier
+from utils import get_pareto_frontier
 
-from model_provisioner.sim_types import ProvisioningResult
-from model_provisioner.sim_types import GPUType
-from model_provisioner.sim_types import Model
-from model_provisioner.sim_types import QualityLevel
+from sim_types import ProvisioningResult
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import QualityLevel
 
 
 FIG_SIZE = (7, 5)
diff --git a/simulator/provisioning.py b/simulator/provisioning.py
index 26e9c8a9..51e1ab11 100644
--- a/simulator/provisioning.py
+++ b/simulator/provisioning.py
@@ -15,6 +15,15 @@
     if _p not in sys.path:
         sys.path.insert(0, _p)
 
+# Propagate paths to child processes spawned by ProcessPoolExecutor (Windows
+# uses 'spawn' which starts a fresh interpreter that reads PYTHONPATH).
+_EXTRA_PATHS = os.pathsep.join((_REPO_ROOT, _STREAMWISE_DIR, _SIMULATOR_DIR))
+_EXISTING = os.environ.get("PYTHONPATH", "")
+if _SIMULATOR_DIR not in _EXISTING.split(os.pathsep):  # whole-entry match, not substring
+    os.environ["PYTHONPATH"] = (
+        _EXTRA_PATHS + os.pathsep + _EXISTING if _EXISTING else _EXTRA_PATHS
+    )
+
 from tqdm.auto import tqdm
 
 import logging
@@ -30,24 +39,24 @@
 from concurrent.futures import TimeoutError
 from concurrent.futures import as_completed
 
-from model_provisioner.sim_types import WorkflowConfig
-from model_provisioner.sim_types import GPUType
-from model_provisioner.sim_types import LatencyData
-from model_provisioner.sim_types import Provision
-from model_provisioner.sim_types import ProvisioningResult
-from model_provisioner.sim_types import Model
-from model_provisioner.sim_types import ModelAllocation
-from model_provisioner.sim_types import PowerData
-from model_provisioner.sim_types import QualityLevel
-from model_provisioner.sim_types import Policy
-from model_provisioner.sim_types import Result
-from model_provisioner.sim_types import num_gpus_to_str
-
-from model_provisioner.auto_model_allocator import AutoModelAllocator
+from sim_types import WorkflowConfig
+from sim_types import GPUType
+from sim_types import LatencyData
+from sim_types import Provision
+from sim_types import ProvisioningResult
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import PowerData
+from sim_types import QualityLevel
+from sim_types import Policy
+from sim_types import Result
+from sim_types import num_gpus_to_str
+
+from auto_model_allocator import AutoModelAllocator
 
 from model_provisioner.policies import STREAMWISE_POLICY
 
-from model_provisioner.constants import SECONDS_IN_HOUR
+from constants import SECONDS_IN_HOUR
 
 
 GPU_PROVISIONS: list[int] = [
diff --git a/simulator/sim_types.py b/simulator/sim_types.py
new file mode 100644
index 00000000..a83cec22
--- /dev/null
+++ b/simulator/sim_types.py
@@ -0,0 +1,796 @@
+from __future__ import annotations
+
+import pandas as pd
+import numpy as np
+
+from typing import Optional
+from typing import ClassVar
+
+from abc import ABC
+from abc import abstractmethod
+
+from dataclasses import dataclass
+from dataclasses import field
+
+from enum import Enum
+
+
+class GPUType(Enum):
+    A100 = "A100"
+    H100 = "H100"
+    H200 = "H200"
+    GB200 = "GB200"
+
+    def __lt__(self, other: object) -> bool:
+        if not isinstance(other, GPUType):
+            return NotImplemented
+        ranking = list(GPUType)  # declaration order: A100 < H100 < H200 < GB200
+        return ranking.index(self) < ranking.index(other)
+
+
+class QualityLevel(Enum):
+    ORIGINAL = "original"
+    HIGH = "high"
+    MEDIUM = "medium"
+    LOW = "low"
+
+
+# Pixel counts per quality level (16:10 aspect ratio).
+# Latency data is profiled at MEDIUM resolution.
+RESOLUTION_PIXELS: dict[QualityLevel, int] = {
+    QualityLevel.HIGH: 1280 * 800,
+    QualityLevel.MEDIUM: 640 * 400,
+    QualityLevel.LOW: 320 * 200,
+}
+
+
+class Model(Enum):
+    GEMMA = "gemma"
+    FLUX = "flux"
+    HF = "hf"  # HunyuanFramePack
+    HF_VAE = "hf_vae"  # HunyuanFramePack VAE
+    FT = "ft"  # FantasyTalking
+    FT_VAE = "ft_vae"  # FantasyTalking VAE
+    UPSCALER = "upscaler"
+    OTHERS = "others"  # YOLO + Kokoro
+
+
+# Used for FIFO
+MODEL_ORDER: dict[Model, int] = {
+    Model.GEMMA: 0,
+    Model.FLUX: 1,
+    Model.OTHERS: 2,
+    Model.HF: 3,
+    Model.HF_VAE: 4,
+    Model.FT: 5,
+    Model.FT_VAE: 6,
+    Model.UPSCALER: 7,
+}
+
+
+@dataclass
+class ModelAllocation(ABC):
+    model: ClassVar[Model]
+
+    # policy TODO
+    # workflow TODO
+    gpu_type: GPUType
+    devices: int = 1
+    replicas: int = 0  # No replicas by default
+    work: int = 0
+    time: float = 0.0
+    time_first: float = 0.0
+    energy: float = 0.0
+    cost: float = 0.0
+
+    def __str__(self) -> str:
+        if self.replicas <= 0:
+            assert self.time == 0.0, f"time must be 0 when no replicas, got {self.time:.2f}"
+            assert self.energy == 0.0, f"energy must be 0 when no replicas, got {self.energy:.2f}"
+            return "--"
+        return \
+            f"devices={self.devices:2d}, " \
+            f"replicas={self.replicas}, " \
+            f"work={self.work}, " \
+            f"time={self.time:.2f} secs, " \
+            f"time_first={self.time_first:.2f} secs, " \
+            f"energy={self.energy / 60.0 / 60.0:.2f} Wh, " \
+            f"cost=${self.cost:.2f}"
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    def __post_init__(self) -> None:
+        if self.replicas > 0:
+            return
+        if self.time != 0.0 or self.energy != 0.0:
+            raise ValueError(
+                f"time and energy must be 0.0 when no replicas, got time={self.time:.2f}, energy={self.energy:.2f}")
+
+    def get_num_gpus(self) -> int:
+        """Return devices * replicas, or 0 when there are no replicas."""
+        has_replicas = self.replicas > 0
+        return self.devices * self.replicas if has_replicas else 0
+
+    def disable(self) -> None:
+        self.devices = 0
+        self.replicas = 0
+        self.time = 0.0
+        self.time_first = 0.0
+        self.energy = 0.0
+
+    @abstractmethod
+    def calculate_time(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        work_pct: float = 1.0,
+    ) -> float:
+        ...
+
+    @abstractmethod
+    def calculate_time_first(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+    ) -> float:
+        ...
+
+    @abstractmethod
+    def calculate_energy(
+        self,
+        workflow: WorkflowConfig,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+    ) -> float:
+        ...
+
+    def calculate_cost(
+        self,
+        policy: Policy,
+        total_time_s: float = 0.0,
+    ) -> float:
+        """Store and return the dollar cost of holding this allocation's GPUs for total_time_s."""
+        hourly_rate = self.get_num_gpus() * policy.gpu_cost[self.gpu_type]
+        # The product is divided by 3600, i.e. gpu_cost is applied as a per-GPU hourly price.
+        self.cost = total_time_s * hourly_rate / (60 * 60)
+        return self.cost
+
+    def calculate(
+        self,
+        policy: Policy,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        total_time_s: float = 0.0,
+        work_pct: float = 1.0,
+    ) -> None:
+        """Calculate all the values for this model allocation."""
+        self.calculate_time(policy, workflow, latency_data, work_pct)
+        self.calculate_time_first(policy, workflow, latency_data)
+        self.calculate_cost(policy, total_time_s)
+        self.calculate_energy(workflow, power_data, total_time_s)
+
+    def get_max_replicas(
+        self,
+        workflow: WorkflowConfig,
+    ) -> int:
+        """Get the maximum number of replicas that can leverage parallelism."""
+        return 1
+
+
+class Objective(Enum):
+    FIFO = "fifo"
+    TIME = "time"
+    TTFF = "ttff"
+    COST = "cost"
+    ENERGY = "energy"
+    TIME_COST = "time_cost"
+    TTFF_COST = "ttff_cost"
+    ENERGY_COST = "energy_cost"
+    TIME_ENERGY = "time_energy"
+    RANDOM = "random"
+    NONE = "none"
+
+    TTFF_THEN_TIME = "ttff_then_time"  # first minimize ttff, then minimize time
+
+    def is_monotonic(self) -> bool:
+        return self not in {Objective.RANDOM, Objective.FIFO}
+
+
+@dataclass
+class WorkflowConfig:
+    total_video_seconds: int
+    total_scenes: int
+    total_frames: dict[Model, int]
+    total_subscenes: int
+    per_subscene_frames: dict[Model, int]
+    # default per-frame number of denoising steps
+    num_steps: dict[Model, int]
+    # supported number of generation frames
+    hf_frames: list[int]
+    ft_frames: list[int]
+    frames_per_step_idx: int
+    # target output resolution (default: HIGH)
+    target_resolution: QualityLevel = QualityLevel.HIGH
+
+    # total input tokens
+    total_input_tokens: int = 0
+
+    # work per model (determines parallelism; work > 1 means parallelizable across replicas)
+    # models included in the workflow are derived from the keys of this dict
+    model_work: dict[Model, int] = field(default_factory=dict)
+
+    @property
+    def models(self) -> list[Model]:
+        """Models included in the workflow (derived from model_work keys)."""
+        return list(self.model_work.keys())
+
+    @property
+    def work(self) -> dict[Model, int]:
+        """Units of work per model (0 for models not in the workflow)."""
+        return {
+            model_name: self.model_work.get(model_name, 0)
+            for model_name in Model
+        }
+
+    def get_model_order(self) -> list[Model]:
+        """Get ordered list of models in the workflow, sorted by MODEL_ORDER."""
+        return sorted(
+            [m for m in self.models if m in MODEL_ORDER],
+            key=lambda m: MODEL_ORDER[m],
+        )
+
+    def get_resolution_scale(self, use_upscaler: bool) -> float:
+        """Compute latency scaling factor based on target resolution.
+
+        Latency data is profiled at MEDIUM resolution.  The scale factor
+        adjusts for the actual generation resolution:
+
+        1. Upscaler used, HIGH   → 1.0 (models generate at MEDIUM)
+        2. Upscaler used, MEDIUM → LOW / MEDIUM (models generate at LOW)
+        3. No upscaler, HIGH     → HIGH / MEDIUM  (scale up)
+        4. No upscaler, MEDIUM   → 1.0
+        5. No upscaler, LOW      → LOW / MEDIUM   (scale down)
+        """
+        if use_upscaler:
+            assert self.target_resolution in (QualityLevel.HIGH, QualityLevel.MEDIUM), \
+                "Upscaler can only be used when target resolution is HIGH or MEDIUM"
+            if self.target_resolution == QualityLevel.HIGH:
+                return 1.0
+            # MEDIUM target with upscaler: generate at LOW, upscale to MEDIUM
+            return RESOLUTION_PIXELS[QualityLevel.LOW] / RESOLUTION_PIXELS[QualityLevel.MEDIUM]
+        if self.target_resolution == QualityLevel.MEDIUM:
+            return 1.0
+        return RESOLUTION_PIXELS[self.target_resolution] / RESOLUTION_PIXELS[QualityLevel.MEDIUM]
+
+    def is_parallelizable(self, model: Model) -> bool:
+        """Whether the given model can be parallelized across multiple replicas."""
+        return self.model_work.get(model, 0) > 1
+
+    def filter_parallelizable_models(
+        self,
+        models: list[Model],
+        disaggregation: dict[Model, bool],
+    ) -> list[Model]:
+        """Keep parallelizable models, dropping VAEs whose parent is not disaggregated."""
+        # A VAE stage is only eligible when its parent model's disaggregation is enabled.
+        excluded: list[Model] = []
+        if not disaggregation.get(Model.HF, False):
+            excluded.append(Model.HF_VAE)
+        if not disaggregation.get(Model.FT, False):
+            excluded.append(Model.FT_VAE)
+        return [
+            candidate for candidate in models
+            if self.is_parallelizable(candidate) and candidate not in excluded
+        ]
+
+    def __post_init__(self) -> None:
+        assert self.total_frames[Model.HF] > self.per_subscene_frames[Model.HF]
+        assert self.total_frames[Model.FT] > self.per_subscene_frames[Model.FT]
+
+        # If no models specified, populate defaults for all models
+        if not self.model_work:
+            defaults: dict[Model, int] = {
+                Model.GEMMA: 1,
+                Model.FLUX: 1,
+                Model.HF: self.total_subscenes,
+                Model.HF_VAE: self.total_frames[Model.HF],
+                Model.FT: self.total_subscenes,
+                Model.FT_VAE: self.total_frames[Model.FT],
+                Model.UPSCALER: self.total_frames[Model.FT],
+                Model.OTHERS: 1,
+            }
+            for model, work in defaults.items():
+                self.model_work[model] = work
+        if self.target_resolution != QualityLevel.HIGH:
+            if Model.UPSCALER in self.model_work:
+                del self.model_work[Model.UPSCALER]
+
+    @property
+    def num_frames(self) -> int:
+        """Number of frames generated by the workflow.
+
+        Taken from total_frames[Model.FT]; 0 when that key is absent."""
+        return self.total_frames.get(Model.FT, 0)
+
+
+class ActionName(Enum):
+    MERGE = "merge"
+    ADD_DEVICE = "add device"
+    ADD_REPLICA = "add replica"
+    ADD_DEVICE_REPLICA = "add device replica"
+    ADD_INSTANCE = "add instance"
+    REMOVE_DEVICE = "remove device"
+    REMOVE_REPLICA = "remove replica"
+
+
+@dataclass
+class Action:
+    """
+    Optimization action to take.
+    """
+    name: ActionName
+    model: Model
+    gpu_type: GPUType
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
+
+    action_result: Result = field(repr=False)
+
+    arrival_time_s: float = 0.0  # For FIFO scheduling
+
+    # Derived fields from action_result (not passed by caller)
+    time: float = field(init=False)  # Total execution time
+    ttff: float = field(init=False)  # Time to first frame
+    cost: float = field(init=False)  # Cost in $
+    energy: float = field(init=False)  # Energy in W*s
+
+    def __post_init__(self) -> None:
+        """Validate field types and derive metrics from action_result.
+
+        Copies total_time_s, ttff_s, cost and total_energy from action_result
+        into time, ttff, cost and energy.
+
+        Raises ValueError if model, name, models or gpu_type has the wrong
+        type, or if the derived cost is negative.
+        """
+        # ---- type checks ----
+        if not isinstance(self.model, Model):
+            raise ValueError(f"Model {self.model} [{type(self.model)}] not supported")
+        if not isinstance(self.name, ActionName):
+            raise ValueError(f"Action name {self.name} [{type(self.name)}] not supported")
+        if not isinstance(self.models, dict):
+            raise ValueError(f"models must be a dict, got {type(self.models)}")
+        if not isinstance(self.gpu_type, GPUType):
+            raise ValueError(f"Device type {self.gpu_type} [{type(self.gpu_type)}] not supported")
+        # ---- derive values ----
+        self.time = self.action_result.total_time_s
+        self.ttff = self.action_result.ttff_s
+        self.cost = self.action_result.cost
+        self.energy = self.action_result.total_energy
+        if self.cost < 0.0:
+            raise ValueError("cost must be >= 0")
+
+    def __str__(self) -> str:
+        return (
+            f"Action("
+            f"{self.name.value}, "
+            f"model={self.model.value}, "
+            f"gpu={self.gpu_type.value}, "
+            f"time={self.time:.2f} s, "
+            f"ttff={self.ttff:.2f} s, "
+            f"cost=${self.cost:.2f}, "
+            f"time*cost={self.time_cost():.2f}, "
+            f"ttff*cost={self.ttff_cost():.2f}, "
+            f"energy*cost={self.energy_cost():.2f}, "
+            f"time*energy={self.time_energy():.2f}, "
+            f"energy={self.energy:.2f} Ws, "
+            f"models={self.models}"
+            f")"
+        )
+
+    def time_cost(self) -> float:
+        """We use improvement in time * $."""
+        if self.time <= 0:
+            return self.cost
+        if self.cost <= 0:
+            return self.time
+        return self.time * self.cost
+
+    def ttff_cost(self) -> float:
+        """We use improvement in TTFF * $."""
+        if self.ttff <= 0:
+            return self.cost
+        if self.cost <= 0:
+            return self.ttff
+        return self.ttff * self.cost
+
+    def energy_cost(self) -> float:
+        """Energy * $ metric (energy used as stored); if cost or energy is <= 0, return the other."""
+        if self.cost <= 0:
+            return self.energy
+        if self.energy <= 0:
+            return self.cost
+        return self.energy * self.cost
+
+    def time_energy(self) -> float:
+        """Total time * energy metric (not TTFF); if energy or time is <= 0, return the other."""
+        if self.energy <= 0:
+            return self.time
+        if self.time <= 0:
+            return self.energy
+        return self.time * self.energy
+
+    def get_order(self) -> int:
+        """Return this action's model position in MODEL_ORDER (for FIFO scheduling)."""
+        return MODEL_ORDER[self.model]
+
+    def get_metric(
+        self,
+        obj: Objective,
+        switch_objective: bool = False,
+    ) -> float:
+        if obj == Objective.RANDOM:
+            return 0.0
+        if obj == Objective.TIME:
+            return self.time
+        if obj == Objective.TTFF:
+            return self.ttff
+        if obj == Objective.COST:
+            return self.cost
+        if obj == Objective.ENERGY:
+            return self.energy
+        if obj == Objective.TIME_COST:
+            return self.time_cost()
+        if obj == Objective.TTFF_COST:
+            return self.ttff_cost()
+        if obj == Objective.ENERGY_COST:
+            return self.energy_cost()
+        if obj == Objective.TIME_ENERGY:
+            return self.time_energy()
+        if obj == Objective.FIFO:
+            # return self.get_order()
+            return 0  # TODO
+        if obj == Objective.TTFF_THEN_TIME:
+            if switch_objective:
+                return self.time
+            else:
+                return self.ttff
+        raise ValueError(f"Unknown objective {obj}")
+
+
+@dataclass
+class Result:
+    total_time_s: float = 0.0
+    first_chunk_time: float = 0.0  # Time to first chunk
+    ttff_s: float = 0.0  # Time to first frame (accounts for total time and workflow length)
+    tbf_s: float = 0.0  # Time between frames
+    total_energy: float = 0.0  # Watts x second
+    cost: float = 0.0  # Total $ cost
+    gpus_used: dict[GPUType, int] = field(default_factory=dict)
+    gpus_total: dict[GPUType, int] = field(default_factory=dict)
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """Reject negative metrics or GPU counts; an empty gpus_used (the default) is allowed."""
+        assert self.total_time_s >= 0.0, f"total_time_s={self.total_time_s} must be >= 0.0"
+        assert self.first_chunk_time >= 0.0, f"first_chunk_time={self.first_chunk_time} must be >= 0.0"
+        assert self.ttff_s >= 0.0, f"ttff_s={self.ttff_s} must be >= 0.0"
+        assert self.tbf_s >= 0.0, f"tbf_s={self.tbf_s} must be >= 0.0"
+        assert self.total_energy >= 0.0, f"total_energy={self.total_energy} must be >= 0.0"
+        assert self.cost >= 0.0, f"cost={self.cost} must be >= 0.0"
+        for gpu_used in self.gpus_used.values():
+            assert gpu_used >= 0, f"all gpus_used value {self.gpus_used} must be >= 0"
+
+    def to_csv(self) -> str:
+        num_a100 = self.gpus_used.get(GPUType.A100, 0)
+        num_h100 = self.gpus_used.get(GPUType.H100, 0)
+        num_h200 = self.gpus_used.get(GPUType.H200, 0)
+        num_gb200 = self.gpus_used.get(GPUType.GB200, 0)
+        return (
+            f"{num_a100},{num_h100},{num_h200},{num_gb200},"
+            f"{self.ttff_s:.2f},{self.tbf_s:.2f},{self.cost:.2f},"
+            f"{self.total_time_s:.2f},{self.total_energy:.2f}"
+        )
+
+    def __str__(self) -> str:
+        SECONDS_IN_HOUR = 60 * 60
+        return (
+            f"Time:{self.total_time_s:.2f} s TTFF:{self.ttff_s:.2f} s "
+            f"Cost:${self.cost:.2f} TTFF*Cost:{self.ttff_s * self.cost:.2f} "
+            f"Energy:{self.total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh "
+            f"GPUS: {num_gpus_to_str(self.gpus_used)}"
+        )
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+@dataclass
+class LatencyGPUTypeData:
+    gpu_type: GPUType
+    # TP -> latency mappings
+    flux: dict[int, float] = field(default_factory=dict)
+    hf: dict[int, float] = field(default_factory=dict)
+    hf_high: dict[int, float] = field(default_factory=dict)
+    hf_vae: dict[int, float] = field(default_factory=dict)
+    hf_vae_high: dict[int, float] = field(default_factory=dict)
+    ft: dict[int, float] = field(default_factory=dict)
+    ft_high: dict[int, float] = field(default_factory=dict)
+    ft_vae: dict[int, float] = field(default_factory=dict)
+    ft_vae_high: dict[int, float] = field(default_factory=dict)
+    upscaler: dict[int, float] = field(default_factory=dict)
+    gemma_first_scene: dict[int, float] = field(default_factory=dict)
+    gemma_per_scene: dict[int, float] = field(default_factory=dict)
+    others: dict[int, float] = field(default_factory=dict)
+
+    def __getitem__(
+        self,
+        key: Model | tuple[Model, int]
+    ) -> float:
+        """Latency for a (model, num_devices) key; Gemma reads gemma_first_scene.
+
+        Raises KeyError for a non-tuple key or a device count with no entry."""
+        if not isinstance(key, tuple):
+            raise KeyError(f"Latency for model {key} not found")
+        assert isinstance(key[0], Model)
+        assert isinstance(key[1], int)
+        model, num_devices = key
+        by_model: dict[Model, dict[int, float]] = {
+            Model.FLUX: self.flux,
+            Model.HF: self.hf,
+            Model.HF_VAE: self.hf_vae,
+            Model.FT: self.ft,
+            Model.FT_VAE: self.ft_vae,
+            Model.GEMMA: self.gemma_first_scene,
+            Model.UPSCALER: self.upscaler,
+            Model.OTHERS: self.others,
+        }
+        if model not in by_model:
+            raise KeyError(f"Latency for model {key} not found")
+        return by_model[model][num_devices]
+
+    def __contains__(self, key: Model | tuple[Model, int]) -> bool:
+        """Report whether a latency entry exists for a (model, num_devices) key.
+
+        Only tuple keys can match; a bare Model always yields False.  Gemma is
+        looked up in gemma_first_scene.
+        """
+        if not isinstance(key, tuple):
+            return False
+        assert isinstance(key[0], Model)
+        assert isinstance(key[1], int)
+        model, num_devices = key
+        tables: dict[Model, dict[int, float]] = {
+            Model.GEMMA: self.gemma_first_scene,
+            Model.FLUX: self.flux,
+            Model.HF: self.hf,
+            Model.HF_VAE: self.hf_vae,
+            Model.FT: self.ft,
+            Model.FT_VAE: self.ft_vae,
+            Model.UPSCALER: self.upscaler,
+            Model.OTHERS: self.others,
+        }
+        # One table per Model member; anything else falls through to False.
+        table = tables.get(model)
+        return table is not None and num_devices in table
+
+    def get_max_parallelism(self, model: Model) -> int:
+        """Max number of devices supported for the given model."""
+        if model == Model.FLUX:
+            return max(self.flux.keys())
+        if model == Model.HF:
+            return max(self.hf.keys())
+        if model == Model.FT:
+            return max(self.ft.keys())
+        if model == Model.FT_VAE:
+            return max(self.ft_vae.keys())
+        if model == Model.GEMMA:
+            return max(self.gemma_first_scene.keys())
+        if model == Model.UPSCALER:
+            return max(self.upscaler.keys())
+        if model == Model.HF_VAE:
+            return max(self.hf_vae.keys())
+        if model == Model.OTHERS:
+            return max(self.others.keys())
+        raise KeyError(f"Model {model} not found in latency data")
+
+
+@dataclass
+class PowerGPUTypeData:
+    gpu_type: GPUType
+    # TP -> power mappings
+    flux: dict[int, float] = field(default_factory=dict)
+    hf: dict[int, float] = field(default_factory=dict)
+    hf_high: dict[int, float] = field(default_factory=dict)
+    hf_vae: dict[int, float] = field(default_factory=dict)
+    hf_vae_high: dict[int, float] = field(default_factory=dict)
+    ft: dict[int, float] = field(default_factory=dict)
+    ft_high: dict[int, float] = field(default_factory=dict)
+    ft_vae: dict[int, float] = field(default_factory=dict)
+    ft_vae_high: dict[int, float] = field(default_factory=dict)
+    upscaler: dict[int, float] = field(default_factory=dict)
+    gemma_first_scene: dict[int, float] = field(default_factory=dict)
+    gemma_per_scene: dict[int, float] = field(default_factory=dict)
+    # Other values
+    idle: float = 0.0  # Idle power in Watts
+    tdp: float = 0.0  # TDP power in Watts
+
+    def __getitem__(
+        self,
+        key: Model | tuple[Model, int] | str
+    ) -> float:
+        if isinstance(key, tuple):
+            assert isinstance(key[0], Model)
+            assert isinstance(key[1], int)
+            model, devices = key
+            if model == Model.FLUX:
+                return self.flux[devices]
+            if model == Model.HF:
+                return self.hf[devices]
+            if model == Model.HF_VAE:
+                return self.hf_vae[devices]
+            if model == Model.FT:
+                return self.ft[devices]
+            if model == Model.FT_VAE:
+                return self.ft_vae[devices]
+            if model == Model.UPSCALER:
+                return self.upscaler[devices]
+        if isinstance(key, str):
+            if key == "idle":
+                return self.idle
+            if key == "tdp":
+                return self.tdp
+        raise KeyError(f"Power for {key} not found")
+
+
+@dataclass
+class LatencyData:
+    gpus: dict[GPUType, LatencyGPUTypeData]
+
+    def __getitem__(self, gpu_type: GPUType) -> LatencyGPUTypeData:
+        return self.gpus[gpu_type]
+
+    def __setitem__(
+        self,
+        gpu_type: GPUType,
+        latency_data: LatencyGPUTypeData
+    ) -> None:
+        self.gpus[gpu_type] = latency_data
+
+
+@dataclass
+class PowerData:
+    gpus: dict[GPUType, PowerGPUTypeData]
+
+    def __getitem__(self, gpu_type: GPUType) -> PowerGPUTypeData:
+        return self.gpus[gpu_type]
+
+    def __setitem__(
+        self,
+        gpu_type: GPUType,
+        power_data: PowerGPUTypeData
+    ) -> None:
+        self.gpus[gpu_type] = power_data
+
+
+def num_gpus_to_str(
+    provision: dict[GPUType, int]
+) -> str:
+    """Render positive GPU counts as "<count>x<NAME>" terms joined with "+"."""
+    positive = ((kind, count) for kind, count in provision.items() if count > 0)
+    return "+".join(
+        f"{count}x{kind.name}" for kind, count in positive
+    )
+
+
+@dataclass
+class Provision:
+    num_gpus: dict[GPUType, int] = field(default_factory=dict)
+
+    def __getitem__(self, gpu_type: GPUType) -> int:
+        return self.num_gpus[gpu_type]
+
+    def __str__(self) -> str:
+        return num_gpus_to_str(self.num_gpus)
+
+
+@dataclass
+class ProvisioningResult:
+    latencies: list[float]
+    costs: list[float]
+    ttffs: list[float]
+    tbfs: list[float]
+    actual_provision: list[dict[GPUType, int]]
+    config_provision: list[dict[GPUType, int]]
+    model_provision: list[dict[GPUType, dict[Model, list[ModelAllocation]]]]
+    qualities: list[float] = field(default_factory=list)
+    energies: list[float] = field(default_factory=list)
+
+    def save(
+        self,
+        policy_name: str,
+        results_dir: str,
+    ) -> None:
+        """Save the provisioning results to a CSV file.
+
+        Writes provisioning_<policy>.csv inside results_dir, one row per entry
+        of actual_provision, with the metric columns rounded to 2 decimals.
+        """
+        import os  # local import: this module has no file-level os import
+
+        metric_cols = ['ttff_s', 'tbf_s', 'cost', 'total_time', 'energy']
+        # GPU types absent from a provision are recorded as 0.
+        df_latency = pd.DataFrame({
+            'num_a100': [p.get(GPUType.A100, 0) for p in self.actual_provision],
+            'num_h100': [p.get(GPUType.H100, 0) for p in self.actual_provision],
+            'num_h200': [p.get(GPUType.H200, 0) for p in self.actual_provision],
+            'num_gb200': [p.get(GPUType.GB200, 0) for p in self.actual_provision],
+            'ttff_s': self.ttffs,
+            'tbf_s': self.tbfs,
+            'cost': self.costs,
+            'total_time': self.latencies,
+            'energy': self.energies,
+        })
+        df_latency[metric_cols] = df_latency[metric_cols].round(2)
+
+        policy_name_clean = policy_name.replace(" ", "_").replace("*", "x").replace("/", "_").lower()
+        # os.path.join matches plain concatenation when results_dir ends with a
+        # separator, and still lands inside the directory when it does not.
+        file_name = os.path.join(results_dir, f"provisioning_{policy_name_clean}.csv")
+        df_latency.to_csv(file_name, index=False)
+
+    def get_pareto_frontier(
+        self,
+        max_x: Optional[float] = None,
+        max_y: Optional[float] = None,
+    ) -> np.ndarray:
+        from utils import get_pareto_frontier  # TODO this is a lazy fix, we need to reset
+        # points = np.array(list(zip(self.ttffs, self.costs)))
+        return get_pareto_frontier(
+            self.ttffs,
+            self.costs,
+            max_x=max_x,
+            max_y=max_y,
+        )
+
+
+class Solver(Enum):
+    GUROBI = "gurobi"
+    HIGHS = "highs"
+    GREEDY = "greedy"
+    NAIVE = "naive"
+    HEXGEN = "hexgen"
+    HELIX = "helix"
+
+
+@dataclass
+class Policy:
+    name: str
+    gpu_cost: dict[GPUType, float]
+    objective: Objective
+    disaggregation: dict[Model, bool]
+    use_upscaler: bool
+    hardware: list[GPUType] = field(default_factory=lambda: [GPUType.A100, GPUType.H100, GPUType.H200, GPUType.GB200])
+    solver: Solver = Solver.GREEDY
+
+    def is_disaggregated(self, model: Model) -> bool:
+        """Check if a model has disaggregation enabled."""
+        return self.disaggregation.get(model, False)
+
+    def __str__(self) -> str:
+        disag_str = {
+            model.value: disaggregated
+            for model, disaggregated in self.disaggregation.items()
+            if disaggregated
+        }
+        return (
+            f"Policy({self.name}, "
+            f"objective={self.objective}, "
+            f"disag={disag_str}, "
+            f"upscaler={self.use_upscaler}, "
+            f"cost={self.gpu_cost}, "
+            f"solver={self.solver})"
+        )
diff --git a/simulator/sim_types_json.py b/simulator/sim_types_json.py
new file mode 100644
index 00000000..9f5451ea
--- /dev/null
+++ b/simulator/sim_types_json.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+import json
+
+from dataclasses import asdict
+
+from sim_types import Model
+from sim_types import Policy
+from sim_types import GPUType
+from sim_types import ModelAllocation
+from sim_types import WorkflowConfig
+
+
+def models_to_json(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
+) -> str:
+    result = {}
+    for gpu_type, model_dict in models.items():
+        inner_result = {}
+        for model, allocation_list in model_dict.items():
+            for allocation in allocation_list:
+                alloc_dict = {
+                    'devices': allocation.devices,
+                    'replicas': allocation.replicas,
+                }
+                inner_result[model.value] = alloc_dict  # last allocation in the list wins
+        result[gpu_type.name] = inner_result
+    return str(result).replace("}}, '", "}},'")  # str() of a dict: Python repr, not json.dumps
+
+
+def workflow_to_json(workflow: WorkflowConfig) -> str:
+    d = asdict(workflow)
+    # Convert Model enum keys in dict fields to string values
+    for dict_field in ('total_frames', 'per_subscene_frames', 'num_steps', 'model_work'):
+        if dict_field in d:
+            d[dict_field] = {
+                (k.value if hasattr(k, 'value') else k): v
+                for k, v in d[dict_field].items()
+            }
+    # Convert QualityLevel enum to string value
+    if 'target_resolution' in d and hasattr(d['target_resolution'], 'value'):
+        d['target_resolution'] = d['target_resolution'].value
+    return json.dumps(d)
+
+
+def policy_to_json(policy: Policy) -> str:
+    result = {
+        'name': policy.name,
+        'objective': str(policy.objective),
+        'disaggregation': {model.value: enabled for model, enabled in policy.disaggregation.items()},
+        'use_upscaler': policy.use_upscaler,
+        'hardware': [gpu.name for gpu in policy.hardware],
+    }
+    return json.dumps(result)
+
+
+def model_list_to_json(models: list[Model]) -> str:
+    return json.dumps(models, default=lambda o: o.value)
diff --git a/simulator/utils.py b/simulator/utils.py
new file mode 100644
index 00000000..29ffe7ab
--- /dev/null
+++ b/simulator/utils.py
@@ -0,0 +1,297 @@
+"""
+Utilities for the simulator.
+"""
+
+from __future__ import annotations
+
+from copy import deepcopy
+
+import pandas as pd
+import numpy as np
+
+from scipy.interpolate import interp1d
+
+from sim_types import ProvisioningResult
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import ModelAllocation
+
+from typing import Optional
+
+
+def to_models_df(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
+) -> pd.DataFrame:
+    """
+    Flatten the allocations into a (GPU, Model)-indexed DataFrame that ends with a ("TOTAL", "") summary row.
+    """
+    records = []
+    for gpu_type, model_allocations in models.items():
+        for model, allocations in model_allocations.items():
+            for allocation in allocations:
+                if allocation is None or allocation.get_num_gpus() == 0:
+                    continue  # Missing or zero-GPU allocations contribute no row
+                record = {
+                    "GPU": gpu_type.value,
+                    "Model": model.value,
+                    "Devices": allocation.devices,
+                    "Replicas": allocation.replicas,
+                    "Work": allocation.work,
+                    "#GPUs": allocation.get_num_gpus(),
+                    "Time (s)": allocation.time,
+                    "TTFF (s)": allocation.time_first,
+                    "Energy (kWh)": allocation.energy / (60 * 60) / 1000.0,  # To kWh (assumes joules — TODO confirm)
+                    "Cost ($)": allocation.cost,
+                }
+                records.append(record)
+    df = pd.DataFrame(records)
+    df = df.set_index(["GPU", "Model"])
+    df = df.round(2)
+
+    total = df.sum(numeric_only=True)
+    total["Time (s)"] = df["Time (s)"].groupby(level="Model").max().sum()  # sum of per-model maxima
+    total["TTFF (s)"] = df["TTFF (s)"].groupby(level="Model").min().sum()  # sum of per-model minima
+    total.name = ("TOTAL", "")
+    df = pd.concat([df, total.to_frame().T])
+
+    df[["Devices", "Replicas", "#GPUs", "Work"]] = df[["Devices", "Replicas", "#GPUs", "Work"]].astype(int)
+
+    return df
+
+
+def coalesce_models(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
+) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+    """Fold allocations that share ``devices`` and ``work`` into a single entry.
+
+    Inputs are not mutated; each duplicate adds one replica plus its energy and cost.
+    """
+    coalesced: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}
+    for gpu_type, per_model in models.items():
+        coalesced[gpu_type] = {}
+        for model, allocations in per_model.items():
+            groups: list[ModelAllocation] = []
+            for candidate in allocations:
+                same_shape = (
+                    kept for kept in groups
+                    if kept.devices == candidate.devices and kept.work == candidate.work
+                )
+                twin = next(same_shape, None)
+                if not twin:
+                    groups.append(deepcopy(candidate))
+                    continue
+                twin.replicas += 1
+                twin.energy += candidate.energy
+                twin.cost += candidate.cost
+            coalesced[gpu_type][model] = groups
+    return coalesced
+
+
+def simplify_model_allocations(
+    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+    """
+    Collapse each model's allocations to one entry per device count, summing replicas.
+    Zero-GPU allocations are dropped; fewer entries shrink the optimizer's search space.
+    """
+    simplified = deepcopy(models)
+    for per_model in simplified.values():
+        for model, instances in per_model.items():
+            by_devices: dict[int, ModelAllocation] = {}
+            for instance in instances:
+                if instance.get_num_gpus() == 0:
+                    continue
+                kept = by_devices.get(instance.devices)
+                if kept is None:
+                    by_devices[instance.devices] = deepcopy(instance)
+                else:
+                    kept.replicas += instance.replicas
+            per_model[model] = list(by_devices.values())
+    return simplified
+
+
+def find_fastest_provisioning(
+    provisioning: ProvisioningResult,
+) -> int:
+    """Return the index of the option with the lowest latency (first on ties)."""
+    latencies = provisioning.latencies
+    best_latency = min(latencies)
+    return latencies.index(best_latency)
+
+
+def find_fastest_ttff_provisioning(
+    provisioning: ProvisioningResult,
+) -> int:
+    """Return the index of the option with the lowest TTFF (first on ties)."""
+    ttffs = provisioning.ttffs
+    best_ttff = min(ttffs)
+    return ttffs.index(best_ttff)
+
+
+def find_cheapest_provisioning(
+    provisioning: ProvisioningResult,
+) -> int:
+    """Return the index of the option with the lowest cost (first on ties)."""
+    costs = provisioning.costs
+    lowest_cost = min(costs)
+    return costs.index(lowest_cost)
+
+
+def find_most_cost_effective_provisioning(
+    provisioning: ProvisioningResult,
+) -> int:
+    """Pick the option with the best cost/latency trade-off.
+
+    Returns the index that is both cheapest and fastest when one exists,
+    otherwise the index minimising ``cost * latency`` (first on ties).
+    """
+    costs = provisioning.costs
+    latencies = provisioning.latencies
+    cheapest_idx = costs.index(min(costs))
+    fastest_idx = latencies.index(min(latencies))
+    # A single option winning on both axes needs no trade-off.
+    if cheapest_idx == fastest_idx:
+        return cheapest_idx
+
+    # Otherwise rank every option by the product of its cost and latency.
+    products = [cost * latency for cost, latency in zip(costs, latencies)]
+    return products.index(min(products))
+
+
+def find_most_energy_efficient_provisioning(
+    provisioning: ProvisioningResult,
+) -> int:
+    """Pick the option with the best energy/latency trade-off.
+
+    Returns the index that is both lowest-energy and fastest when one exists,
+    otherwise the index minimising ``energy * latency`` (first on ties).
+    """
+    energies = provisioning.energies
+    latencies = provisioning.latencies
+    leanest_idx = energies.index(min(energies))
+    fastest_idx = latencies.index(min(latencies))
+    # A single option winning on both axes needs no trade-off.
+    if leanest_idx == fastest_idx:
+        return leanest_idx
+
+    # Otherwise rank every option by the product of its energy and latency.
+    products = [energy * latency for energy, latency in zip(energies, latencies)]
+    return products.index(min(products))
+
+
+def find_pareto_frontier(
+    latency_list: list[float],
+    energy_list: list[float],
+    provision: list[float]
+) -> tuple[list[float], list[float], list[float]]:
+    """Return the (provision, latency, energy) points that no other point dominates."""
+    def _dominates(j: int, i: int) -> bool:
+        # j must be no worse on both axes and strictly better on at least one.
+        if not (latency_list[j] <= latency_list[i] and energy_list[j] <= energy_list[i]):
+            return False
+        return latency_list[j] < latency_list[i] or energy_list[j] < energy_list[i]
+
+    indices = range(len(latency_list))
+    survivors = [
+        i for i in indices if not any(_dominates(j, i) for j in indices if j != i)
+    ]
+    return (
+        [provision[i] for i in survivors],
+        [latency_list[i] for i in survivors],
+        [energy_list[i] for i in survivors],
+    )
+
+
+def get_pareto_frontier_paper(
+    points: np.ndarray,
+    max_y: Optional[float] = None,
+    max_x: Optional[float] = None,
+) -> np.ndarray:
+    """
+    Minimising Pareto frontier of (x, y) rows, padded with extreme points and sorted by x, then descending y.
+    """
+    if points.size == 0:
+        return points.copy()
+
+    # Order by x, breaking ties by y, so each x is first seen with its lowest y.
+    points = points[np.lexsort((points[:, 1], points[:, 0]))]
+
+    pareto_front = [points[0]]
+    for point in points[1:]:
+        if point[1] < pareto_front[-1][1]:
+            pareto_front.append(point)
+
+    # Close the frontier with (first x, highest y) and (highest x, last y)
+    extreme_point_0 = [pareto_front[0][0], max(points[:, 1])]
+    extreme_point_1 = [max(points[:, 0]), pareto_front[-1][1]]
+    pareto_front.append(extreme_point_0)
+    pareto_front.append(extreme_point_1)
+
+    if max_x is not None:
+        candidate = np.array([max_x, min(points[:, 1])])
+        if candidate[0] > pareto_front[-1][0] and candidate[1] <= pareto_front[-1][1]:
+            pareto_front.append(candidate)
+    if max_y is not None:
+        candidate = np.array([min(points[:, 0]), max_y])
+        if candidate[1] > pareto_front[0][1] and candidate[0] <= pareto_front[0][0]:
+            pareto_front.append(candidate)
+
+    pareto_front_np = np.array(pareto_front)
+    pareto_front_np = pareto_front_np[np.lexsort((
+        -pareto_front_np[:, 1],
+        pareto_front_np[:, 0]))]
+
+    # Drop repeated rows while keeping the sorted order
+    _, idx = np.unique(pareto_front_np, axis=0, return_index=True)
+    pareto_front_np = pareto_front_np[np.sort(idx)]
+
+    return pareto_front_np
+
+
+def get_pareto_frontier(
+    ttff_list: list[float],
+    costs: list[float],
+    max_y: Optional[float] = None,
+    max_x: Optional[float] = None,
+) -> np.ndarray:
+    """Return the Pareto frontier of (TTFF, cost) pairs with optional axis caps."""
+    points = np.array(list(zip(ttff_list, costs)))
+    # Keywords keep max_y/max_x from being swapped (helper order is points, max_y, max_x).
+    return get_pareto_frontier_paper(
+        points, max_y=max_y, max_x=max_x,
+    )
+
+
+def clean_frontier(
+    frontier: np.ndarray
+) -> np.ndarray:
+    """Sort frontier rows by x and keep only the lowest y for each distinct x."""
+    ordered = frontier[np.argsort(frontier[:, 0])]
+    kept_x, kept_y = [], []
+    pos = 0
+    while pos < len(ordered):
+        # Rows sharing this x are contiguous after sorting; step past all of them at once.
+        same_x = ordered[:, 0] == ordered[pos, 0]
+        kept_x.append(ordered[pos, 0])
+        kept_y.append(ordered[same_x, 1].min())
+        pos += int(same_x.sum())
+    return np.column_stack([kept_x, kept_y])
+
+
+def area_between_frontiers(
+    A: np.ndarray,
+    B: np.ndarray,
+    n: int = 5000
+) -> np.ndarray:
+    """
+    Pointwise percentage gap 100 * (B - A) / B at n points of the frontiers' shared x-range.
+    """
+    front_a = clean_frontier(A)
+    front_b = clean_frontier(B)
+    x_lo = max(front_a[:, 0].min(), front_b[:, 0].min())
+    x_hi = min(front_a[:, 0].max(), front_b[:, 0].max())
+    grid = np.linspace(x_lo, x_hi, n)
+    interp_a = interp1d(front_a[:, 0], front_a[:, 1], kind="linear")
+    interp_b = interp1d(front_b[:, 0], front_b[:, 1], kind="linear")
+    y_a, y_b = interp_a(grid), interp_b(grid)
+    return 100.0 * (y_b - y_a) / y_b
diff --git a/simulator/workflows.py b/simulator/workflows.py
new file mode 100644
index 00000000..ba0caa46
--- /dev/null
+++ b/simulator/workflows.py
@@ -0,0 +1,253 @@
+from __future__ import annotations
+
+import math
+
+from typing import Optional
+
+from sim_types import WorkflowConfig
+from sim_types import Model
+from sim_types import QualityLevel
+
+from constants import FPS
+from constants import FRAMES_OPTIONS
+from constants import FRAMES_PER_STEP_IDX
+from constants import NUM_STEPS
+from constants import SECONDS_IN_HOUR, SECONDS_IN_MINUTE
+from constants import TOTAL_INPUT_TOKENS
+
+
+# Shared sizing constants used to derive subscene, scene and token counts
+MAX_FT_FRAMES: int = 1 + 80  # FT frames per subscene (81)
+SUBSCENE_SECONDS: float = MAX_FT_FRAMES / FPS[Model.FT]  # 81 frames @ 23 FPS → ~3.52 s
+SUBSCENES_PER_SCENE: int = 4  # default number of subscenes grouped into one scene
+TOKENS_PER_FRAME = 500  # 1 frame generates around 500 tokens
+
+
+def _get_num_subscenes(total_video_seconds: int) -> int:
+    """Return how many SUBSCENE_SECONDS-long subscenes cover the duration (rounded up)."""
+    return math.ceil(total_video_seconds / SUBSCENE_SECONDS)
+
+
+def _get_num_scenes(total_video_seconds: int) -> int:
+    """Return the subscene count grouped into scenes of SUBSCENES_PER_SCENE (rounded up)."""
+    return math.ceil(_get_num_subscenes(total_video_seconds) / SUBSCENES_PER_SCENE)
+
+
+def _get_num_frames(total_video_seconds: int, model: Model) -> int:
+    """Return the duration times the model's FPS entry, rounded up to whole frames."""
+    return math.ceil(total_video_seconds * FPS[model])
+
+
+def _video_gen_work(
+    total_video_seconds: int,
+    num_scenes: int,
+    num_subscenes: int,
+    model_work_overrides: Optional[dict[Model, int | str | None]] = None,
+) -> dict[Model, int]:
+    """Default model work for video-generation workflows (Podcast, Movie, etc.), plus optional overrides."""
+    ret = {
+        Model.GEMMA: 1,
+        Model.FLUX: 1,
+        Model.HF: num_subscenes,
+        Model.HF_VAE: _get_num_frames(total_video_seconds, Model.HF),
+        Model.FT: num_subscenes,
+        Model.FT_VAE: _get_num_frames(total_video_seconds, Model.FT),
+        Model.UPSCALER: _get_num_frames(total_video_seconds, Model.FT),
+        Model.OTHERS: 1,
+    }
+    if model_work_overrides:
+        for model, value in model_work_overrides.items():
+            if value == "num_scenes":  # per-scene work
+                ret[model] = num_scenes
+            elif value == "num_subscenes":  # per-subscene work
+                ret[model] = num_subscenes
+            elif isinstance(value, str):
+                raise ValueError(f"Invalid model_work override value: {value}")
+            elif value == 0 or value is None:
+                ret.pop(model, None)  # drop the model; no error if it has no default entry
+            else:
+                ret[model] = value
+    return ret
+
+
+class WorkOverrideType:  # Plain holder for a single override value (int, str or None)
+    def __init__(self, value: int | str | None = None):
+        self.value = value
+
+
+def build_workflow_config(
+    total_video_seconds: int,
+    input_tokens: int,
+    model_work: dict[Model, int] | None = None,
+    *,
+    model_work_overrides: dict[Model, int | str | None] | None = None,
+    num_scenes_override: int | None = None,
+    num_steps_override: dict[Model, int] | None = None,
+    target_resolution: QualityLevel = QualityLevel.HIGH,
+) -> WorkflowConfig:
+    """Build a ``WorkflowConfig`` from base parameters, computing all derived values.
+
+    Parameters
+    ----------
+    model_work:
+        Explicit model-work dictionary.  When ``None`` (default), standard
+        video-generation work is auto-generated from the other parameters.
+    model_work_overrides:
+        Overrides for auto-generated ``model_work`` (ignored when ``model_work`` is given).
+        "num_scenes"/"num_subscenes" become those counts; ``None`` or 0 removes the model.
+    num_scenes_override / num_steps_override:
+        Replace the computed scene count / update entries of the default ``NUM_STEPS``.
+    target_resolution:
+        The target output resolution for the workflow (default HIGH).
+        It is stored as-is; this function does not drop UPSCALER for lower resolutions.
+    """
+    num_subscenes = _get_num_subscenes(total_video_seconds)
+
+    num_scenes = _get_num_scenes(total_video_seconds)
+    if num_scenes_override is not None:
+        num_scenes = num_scenes_override
+
+    num_steps = dict(NUM_STEPS)
+    if num_steps_override:
+        num_steps.update(num_steps_override)
+
+    if model_work is None:
+        model_work = _video_gen_work(
+            total_video_seconds,
+            num_scenes,
+            num_subscenes,
+            model_work_overrides,
+        )
+
+    return WorkflowConfig(
+        total_video_seconds=total_video_seconds,
+        total_scenes=num_scenes,
+        total_subscenes=num_subscenes,
+        total_frames={
+            Model.HF: _get_num_frames(total_video_seconds, Model.HF),
+            Model.FT: _get_num_frames(total_video_seconds, Model.FT),
+        },
+        per_subscene_frames={
+            Model.HF: math.ceil(_get_num_frames(total_video_seconds, Model.HF) / num_subscenes),
+            Model.FT: math.ceil(_get_num_frames(total_video_seconds, Model.FT) / num_subscenes),
+        },
+        num_steps=num_steps,
+        hf_frames=FRAMES_OPTIONS[Model.HF],
+        ft_frames=FRAMES_OPTIONS[Model.FT],
+        frames_per_step_idx=FRAMES_PER_STEP_IDX,
+        target_resolution=target_resolution,
+        total_input_tokens=input_tokens,
+        model_work=model_work,
+    )
+
+
+WORKFLOW_DURATIONS = {  # in seconds
+    "podcast": int(10 * SECONDS_IN_MINUTE),
+    # TODO: the input is two hours, but the output should be much shorter (about 1-2 minutes).
+    "short": int(2 * SECONDS_IN_HOUR),
+    "movie": int(2 * SECONDS_IN_HOUR),
+    "story": int(10 * SECONDS_IN_MINUTE),
+    "lecture": int(5 * SECONDS_IN_MINUTE),
+    "slide": int(10 * SECONDS_IN_MINUTE),
+    "dubbing": int(10 * SECONDS_IN_MINUTE),
+    "editing": int(10 * SECONDS_IN_MINUTE),
+    "chat": 5,
+}
+
+
+# Podcast: 10-minute video from text/PDF input
+PODCAST_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["podcast"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+)
+
+# Shorts: short clips from a 2-hour input video
+_SHORTS_SECONDS = WORKFLOW_DURATIONS["short"]
+_SHORTS_SCENES = _SHORTS_SECONDS // 10  # 10-second scene segmentation → 720
+SHORTS_WORKFLOW = build_workflow_config(
+    total_video_seconds=_SHORTS_SECONDS,
+    input_tokens=int(_SHORTS_SECONDS * TOKENS_PER_FRAME),  # 1 fps × 500 tokens/frame
+    model_work={
+        Model.GEMMA: _SHORTS_SCENES,
+        Model.OTHERS: 1,  # TODO isn't this 1 by default?
+    },
+    num_scenes_override=_SHORTS_SCENES,
+)
+
+# Movie: 2-hour movie
+MOVIE_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["movie"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+    model_work_overrides={
+        Model.FLUX: "num_scenes",
+    },
+)
+
+# Animated Story: Podcast + 5% more HF denoising steps (LoRA overhead), truncated to whole steps
+OVERHEAD_PCT = 5
+ANIMATED_STORY_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["story"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+    num_steps_override={
+        Model.HF: int(NUM_STEPS[Model.HF] * (1 + OVERHEAD_PCT / 100.0))
+    },
+)
+
+# Lecture: 5-minute video, Flux generates per-scene images
+LECTURE_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["lecture"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+    model_work_overrides={
+        Model.FLUX: "num_scenes",
+    },
+)
+
+# Slide Persona: same as Podcast but at low resolution, no upscaler
+SLIDE_PERSONA_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["slide"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+    target_resolution=QualityLevel.LOW,
+    model_work_overrides={
+        Model.UPSCALER: None,
+    },
+)
+
+# Dubbing: like Podcast but without Flux, and double the audio work
+DUBBING_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["dubbing"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+    model_work_overrides={
+        Model.FLUX: None,
+        Model.OTHERS: 2,  # Double audio work
+    },
+)
+
+# Editing: like Podcast but without GEMMA, FLUX, or OTHERS
+EDITING_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["editing"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+    model_work_overrides={
+        Model.GEMMA: None,
+        Model.FLUX: None,
+        Model.OTHERS: None,
+    }
+)
+
+# Video Chat: like Podcast but only 5 seconds of output video
+VIDEO_CHAT_WORKFLOW = build_workflow_config(
+    total_video_seconds=WORKFLOW_DURATIONS["chat"],
+    input_tokens=TOTAL_INPUT_TOKENS,
+)
+
+
+WORKFLOWS = {
+    "podcast": PODCAST_WORKFLOW,
+    "chat": VIDEO_CHAT_WORKFLOW,
+    "dubbing": DUBBING_WORKFLOW,
+    "editing": EDITING_WORKFLOW,
+    "lecture": LECTURE_WORKFLOW,
+    "movie": MOVIE_WORKFLOW,
+    "short": SHORTS_WORKFLOW,
+    "slide": SLIDE_PERSONA_WORKFLOW,
+    "story": ANIMATED_STORY_WORKFLOW,
+}
diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
new file mode 100644
index 00000000..44dd2512
--- /dev/null
+++ b/streamwise/allocator_bridge.py
@@ -0,0 +1,256 @@
+"""
+Bridge between the model provisioner's allocator output and StreamWise pod deployment.
+
+Translates ModelAllocation results (abstract Model enum + GPU counts) into concrete
+container deployment parameters compatible with pod_manager.add_pod().
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+# Add simulator/ to sys.path so foundation modules are importable.
+_SIMULATOR_DIR = os.path.normpath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "simulator")
+)
+if _SIMULATOR_DIR not in sys.path:
+    sys.path.insert(0, _SIMULATOR_DIR)
+
+from dataclasses import dataclass
+from typing import Optional
+
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import Result
+
+from auto_model_allocator import AutoModelAllocator
+from data_loading import load_latency_data
+from model_provisioner.policies import STREAMWISE_POLICY
+from workflows import WORKFLOWS
+
+
+# Mapping from simulator Model enum to concrete container names used by pod_manager.
+# Some Model entries map to multiple containers (e.g., OTHERS -> kokoro + yolo).
+MODEL_TO_CONTAINERS: dict[Model, list[str]] = {
+    Model.GEMMA: ["gemma"],
+    Model.FLUX: ["flux"],
+    Model.HF: ["hunyuanframepackf1"],
+    Model.HF_VAE: ["hunyuanframepackvae"],
+    Model.FT: ["fantasytalking"],
+    Model.FT_VAE: [],  # FT_VAE is handled within fantasytalking container
+    Model.UPSCALER: ["realesrgan"],
+    Model.OTHERS: ["kokoro", "yolo"],
+}
+
+# Default CPU/memory/storage for each container when deployed via auto-deploy.
+# Format: (cpu_cores, memory_gib, ephemeral_storage_gib)
+CONTAINER_RESOURCES: dict[str, tuple[int, int, int]] = {
+    "gemma": (16, 192, 64),
+    "flux": (12, 128, 64),
+    "hunyuanframepackf1": (24, 128, 64),
+    "hunyuanframepackvae": (4, 32, 16),
+    "fantasytalking": (12, 192, 64),
+    "realesrgan": (4, 32, 16),
+    "kokoro": (2, 8, 16),
+    "yolo": (4, 8, 16),
+}
+
+# GPU type string used by pod_manager (lowercase)
+GPU_TYPE_TO_POD_STR: dict[GPUType, str] = {
+    GPUType.A100: "a100",
+    GPUType.H100: "h100",
+    GPUType.H200: "h200",
+    GPUType.GB200: "gb200",
+}
+
+# MIG containers: these use a MIG slice instead of a full GPU
+MIG_CONTAINERS: dict[str, str] = {
+    "kokoro": "1g.10gb",
+    "yolo": "1g.10gb",
+    "realesrgan": "1g.10gb",
+}
+
+# Mapping from StreamWise app name to simulator workflow key
+APP_TO_WORKFLOW: dict[str, str] = {
+    "streamcast": "podcast",
+    "streampersona": "slide",
+    "streamchat": "chat",
+    "streamshort": "short",
+    "streammovie": "movie",
+    "streamanimate": "story",
+    "streamlecture": "lecture",
+    "streamdub": "dubbing",
+    "streamedit": "editing",
+}
+
+
+@dataclass
+class DeploymentSpec:
+    """A single container deployment specification."""
+    container_name: str
+    cpu: int  # CPU cores
+    memory_gib: int
+    ephemeral_storage_gib: int
+    gpu: int  # devices requested; result_to_deployment_specs uses 1 when a MIG profile is set
+    gpu_type: Optional[str]  # lowercase pod_manager GPU string, e.g. "a100"
+    mig_profile: Optional[str]  # MIG slice name such as "1g.10gb", or None
+
+
+@dataclass
+class DeploymentPlan:
+    """Complete deployment plan produced by the auto-allocator."""
+    specs: list[DeploymentSpec]  # one entry per container replica to deploy
+    result: Result  # allocator output the specs were derived from
+    workflow_name: str  # StreamWise app name as passed to run_allocator, e.g. "streamcast"
+    gpu_budget: dict[str, int]  # requested GPU counts keyed by GPU type string
+
+
+def _get_data_dir() -> str:
+    """Return SIMULATOR_DATA_DIR if set, else ``../simulator/data`` relative to this module."""
+    fallback = os.path.join(os.path.dirname(__file__), "..", "simulator", "data")
+    return os.environ.get("SIMULATOR_DATA_DIR", fallback)
+
+
+def get_available_workflows() -> list[str]:
+    """Return the StreamWise app names (the keys of APP_TO_WORKFLOW) for the UI."""
+    return list(APP_TO_WORKFLOW)
+
+
+def get_available_gpu_types() -> list[str]:
+    """Return the ``value`` of every GPUType member for the UI."""
+    return [member.value for member in GPUType]
+
+
+def run_allocator(
+    gpu_budget: dict[str, int],
+    workflow_name: str,
+) -> DeploymentPlan:
+    """
+    Run the automatic model allocator for a workflow and return a deployment plan.
+
+    Args:
+        gpu_budget: GPU counts keyed by GPU type string (e.g., {"A100": 8, "H100": 0}).
+        workflow_name: StreamWise app name (e.g., "streamcast").
+
+    Returns:
+        DeploymentPlan with concrete container deployment specs.
+
+    Raises:
+        ValueError: If workflow_name or a GPU type is unknown, or fewer than 8 GPUs are budgeted.
+    """
+    # Validate workflow
+    workflow_key = APP_TO_WORKFLOW.get(workflow_name)
+    if workflow_key is None:
+        raise ValueError(
+            f"Unknown workflow '{workflow_name}'. "
+            f"Available: {list(APP_TO_WORKFLOW.keys())}")
+
+    workflow = WORKFLOWS[workflow_key]
+
+    # Parse GPU budget into GPUType enum (entries with a non-positive count are dropped)
+    num_gpus: dict[GPUType, int] = {}
+    for gpu_str, count in gpu_budget.items():
+        try:
+            gpu_type = GPUType(gpu_str)
+        except ValueError as err:
+            raise ValueError(
+                f"Unknown GPU type '{gpu_str}'. "
+                f"Available: {[g.value for g in GPUType]}") from err
+        if count > 0:
+            num_gpus[gpu_type] = count
+
+    if not num_gpus or sum(num_gpus.values()) < 8:
+        raise ValueError("Total GPU budget must be at least 8 GPUs.")
+
+    # Load latency data and run allocator
+    data_dir = _get_data_dir()
+    latency_data = load_latency_data(data_dir=data_dir)
+
+    allocator = AutoModelAllocator(
+        workflow=workflow,
+        latency_data=latency_data,
+        policy=STREAMWISE_POLICY,
+    )
+
+    result = allocator.allocate(num_gpus=num_gpus, verbose=False)
+
+    # Convert result to deployment specs
+    specs = result_to_deployment_specs(result)
+
+    return DeploymentPlan(
+        specs=specs,
+        result=result,
+        workflow_name=workflow_name,
+        gpu_budget=gpu_budget,
+    )
+
+
+def result_to_deployment_specs(result: Result) -> list[DeploymentSpec]:
+    """
+    Expand an allocator Result into one DeploymentSpec per container replica.
+
+    Models without containers and allocations with replicas <= 0 produce no specs.
+    """
+    fallback_resources = (4, 16, 16)  # used for containers missing from CONTAINER_RESOURCES
+    plan: list[DeploymentSpec] = []
+
+    for gpu_type, per_model in result.models.items():
+        pod_gpu_type = GPU_TYPE_TO_POD_STR[gpu_type]
+
+        for model, allocations in per_model.items():
+            container_names = MODEL_TO_CONTAINERS.get(model, [])
+            if not container_names:
+                continue
+
+            for allocation in allocations:
+                if allocation.replicas <= 0:
+                    continue
+                for name in container_names:
+                    cpu, memory_gib, storage_gib = CONTAINER_RESOURCES.get(name, fallback_resources)
+                    mig_profile = MIG_CONTAINERS.get(name)
+                    # A MIG-backed container requests a single device regardless of the allocation.
+                    gpu_count = 1 if mig_profile else allocation.devices
+                    plan.extend(
+                        DeploymentSpec(
+                            container_name=name,
+                            cpu=cpu,
+                            memory_gib=memory_gib,
+                            ephemeral_storage_gib=storage_gib,
+                            gpu=gpu_count,
+                            gpu_type=pod_gpu_type,
+                            mig_profile=mig_profile,
+                        )
+                        for _ in range(allocation.replicas)
+                    )
+
+    return plan
+
+
+def deployment_plan_to_json(plan: DeploymentPlan) -> dict:
+    """Convert a DeploymentPlan into plain dicts/lists suitable for ``json.dumps``."""
+    outcome = plan.result
+    metrics = {
+        "total_time_s": round(outcome.total_time_s, 2),
+        "ttff_s": round(outcome.ttff_s, 2),
+        "cost": round(outcome.cost, 4),
+        "gpus_used": {
+            gpu_type.value: count
+            for gpu_type, count in outcome.gpus_used.items()
+        },
+    }
+    # The tuple order fixes the key order of each serialized spec.
+    spec_fields = (
+        "container_name", "cpu", "memory_gib", "ephemeral_storage_gib",
+        "gpu", "gpu_type", "mig_profile",
+    )
+    specs = [
+        {field: getattr(spec, field) for field in spec_fields}
+        for spec in plan.specs
+    ]
+    return {
+        "workflow_name": plan.workflow_name,
+        "gpu_budget": plan.gpu_budget,
+        "metrics": metrics,
+        "specs": specs,
+    }
diff --git a/streamwise/model_provisioner/__init__.py b/streamwise/model_provisioner/__init__.py
new file mode 100644
index 00000000..c79b0cde
--- /dev/null
+++ b/streamwise/model_provisioner/__init__.py
@@ -0,0 +1,15 @@
+"""
+Model Provisioner — allocation policy implementations for GPU resource distribution.
+
+Contains greedy, naive, MILP, HexGen, and Helix allocation strategies.
+The foundation types (sim_types, constants, models, etc.) live in simulator/.
+"""
+import os
+import sys
+
+# Add simulator/ to sys.path so policy files can import foundation modules.
+_SIMULATOR_DIR = os.path.normpath(
+    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "simulator")
+)
+if _SIMULATOR_DIR not in sys.path:
+    sys.path.insert(0, _SIMULATOR_DIR)
diff --git a/streamwise/model_provisioner/greedy.py b/streamwise/model_provisioner/greedy.py
new file mode 100644
index 00000000..8c1a1dd0
--- /dev/null
+++ b/streamwise/model_provisioner/greedy.py
@@ -0,0 +1,573 @@
+"""
+Greedy algorithm for the StreamWise workflow allocation problem.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from tabulate import tabulate
+
+from typing import Optional
+
+from operator import itemgetter
+
+from constants import NUM_GPUS_PER_SERVER
+from constants import SECONDS_IN_MINUTE
+from constants import SECONDS_IN_HOUR
+
+from sim_types import Result
+from sim_types import GPUType
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import Policy
+from sim_types import Solver
+
+from utils import simplify_model_allocations
+
+from evaluator import calc_used_gpus
+from evaluator import evaluate_model_allocation
+
+from model_allocator import ModelAllocator
+
+from .policies import STREAMWISE_POLICY
+from .policies import MAX_ITERATIONS
+from .policies import USE_ALL_GPUS
+
+from actions import gen_actions
+from actions import choose_action
+from actions import apply_action
+
+
+class GreedyAllocator(ModelAllocator):
+    """
+    Greedy allocator that iteratively applies the best action.
+    """
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = STREAMWISE_POLICY,
+    ) -> None:
+        super().__init__(
+            workflow,
+            latency_data,
+            power_data,
+            policy,
+        )
+        assert self.policy.solver in {Solver.GREEDY, Solver.HEXGEN}
+
+    def allocate(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+        # Greedy policy parameters
+        allow_removal: bool = False,
+        allow_merging: bool = False,
+        look_ahead_replicas: int = 3,
+    ) -> Result:
+        """Dispatch to the single-server, single-type or two-type greedy search.
+
+        GPU types with a zero count are ignored, so a mixed setup may carry
+        extra zero-count entries.  Asserts at least 8 GPUs in total and one
+        or two GPU types with a non-zero count.
+        """
+        total_gpus = sum(num_gpus.values())
+        assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})"
+
+        active_gpus = {gpu_type: count for gpu_type, count in num_gpus.items() if count > 0}
+        gpu_types = list(active_gpus)
+        assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})"
+        gpu_type1 = gpu_types[0]
+
+        if len(gpu_types) == 1 and active_gpus[gpu_type1] == 8:
+            # 8 x GPUs (NOTE(review): hard-coded; NUM_GPUS_PER_SERVER is per type - confirm)
+            return self._pick_from_single_server(gpu_type=gpu_type1, verbose=verbose)
+
+        if len(gpu_types) == 1:
+            # More than 8 x GPUs
+            return self._pick_from_single_device_mapping(
+                active_gpus[gpu_type1],
+                gpu_type=gpu_type1,
+                verbose=verbose,
+                allow_removal=allow_removal,
+                allow_merging=allow_merging,
+                look_ahead_replicas=look_ahead_replicas,
+            )
+
+        # Mixed setup (e.g., A100 and H100): pass only the two active GPU types
+        return self._pick_from_both_devices_mapping(
+            active_gpus,
+            verbose=verbose,
+            allow_removal=allow_removal,
+            allow_merging=allow_merging,
+            look_ahead_replicas=look_ahead_replicas,
+        )
+
+    def _pick_from_both_devices_mapping(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+        allow_removal: bool = False,
+        allow_merging: bool = False,
+        look_ahead_replicas: int = 3,
+    ) -> Result:
+        """
+        Calculate based on two GPU types.
+        """
+        gpu_types = list(num_gpus.keys())
+        assert len(gpu_types) == 2
+        assert len(num_gpus) == 2
+        gpu_type1 = gpu_types[0]
+        gpu_type2 = gpu_types[1]
+        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
+        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]
+
+        # Initialize allocations with minimal setup
+        models = self._init_both_devices_models(gpu_type1, gpu_type2)
+
+        remaining_gpus = {}
+        for gpu_type in num_gpus.keys():
+            remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
+
+        # Optimization loop
+        if verbose:
+            evaluate_model_allocation(
+                models=models,
+                num_gpus=num_gpus,
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+                round_up_cost_to_server=True,
+            )
+            self._print_iteration(0, models, num_gpus)
+
+        it = 1
+        prev_metric = None
+        switch_objective = False
+        while sum(remaining_gpus.values()) > 0:
+            # Calculate current iteration times
+            evaluate_model_allocation(
+                models=models,
+                num_gpus=num_gpus,
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+                round_up_cost_to_server=False,
+            )
+
+            # Calculate potential actions for each optimization option
+            actions = gen_actions(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                num_gpus=num_gpus,
+                models=models,
+                policy=self.policy,
+                allow_removal=allow_removal,
+                allow_merging=allow_merging,
+                look_ahead_replicas=look_ahead_replicas,
+            )
+
+            if not actions:
+                logging.debug(f"No more actions possible after {it} iterations for {self.policy}.")
+                break
+
+            best_action = choose_action(actions, self.policy.objective, switch_objective=switch_objective)
+
+            if not best_action:
+                logging.debug("No actions selected.")
+                break
+
+            new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective)
+
+            if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
+                msg = f"No improvement after {it} iterations for {self.policy}."
+                msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
+                if verbose:
+                    print(msg)
+                logging.debug(msg)
+                if not USE_ALL_GPUS:
+                    logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.")
+                    break
+                switch_objective = True
+
+            prev_metric = new_metric
+
+            models = apply_action(best_action, models=models)
+
+            models = simplify_model_allocations(models)
+
+            remaining_gpus.clear()
+            for gpu_type in num_gpus.keys():
+                remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
+
+            if verbose:
+                self._print_iteration(it, models, num_gpus)
+                print(f"{len(actions)} actions:")
+                for action in actions:
+                    if action == best_action:
+                        print(f"* {action} (best)")
+                    else:
+                        print(f"  {action}")
+                print(f"Metric: {new_metric:.2f}")
+                print("Remaining devices:")
+                for gpu_type in remaining_gpus.keys():
+                    print(f"  {remaining_gpus[gpu_type]} x {gpu_type.value}")
+
+            it += 1
+            if it > MAX_ITERATIONS:
+                logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). Stopping optimization loop.")
+                break
+
+        # Adjust for no disaggregation
+        if not self.policy.is_disaggregated(Model.HF):
+            for models_gpu in models.values():
+                for instance_id in range(len(models_gpu[Model.HF_VAE])):
+                    assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \
+                        "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
+        if not self.policy.is_disaggregated(Model.FT):
+            for models_gpu in models.values():
+                for instance_id in range(len(models_gpu[Model.FT_VAE])):
+                    assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \
+                        "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
+
+        # Final calculations
+        result = evaluate_model_allocation(
+            models=models,
+            num_gpus=num_gpus,
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            round_up_cost_to_server=True,
+        )
+
+        if verbose:
+            self._print_final_allocation(
+                models=models,
+                used_devices=result.gpus_used,
+                total_devices={
+                    gpu_type1: num_gpus.get(gpu_type1, 0),
+                    gpu_type2: num_gpus.get(gpu_type2, 0),
+                },
+                power_data=self.power_data,
+                total_time_s=result.total_time_s,
+                ttff_s=result.ttff_s,
+                first_chunk_time=result.first_chunk_time,
+                tbf_s=result.tbf_s,
+                total_energy=result.total_energy if self.power_data else 0.0,
+                cost=result.cost,
+            )
+
+        assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \
+            f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}"
+        assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \
+            f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}"
+
+        return Result(
+            total_time_s=result.total_time_s,
+            models=models,
+            gpus_used=result.gpus_used,
+            ttff_s=result.ttff_s,
+            tbf_s=result.tbf_s,
+            total_energy=result.total_energy if self.power_data else 0.0,
+            cost=result.cost,
+        )
+
+    def _pick_from_single_server(
+        self,
+        gpu_type: GPUType,
+        verbose: bool = False,
+    ) -> Result:
+        """
+        Evaluate the minimal setup: a single server (8 GPUs, or 4 for GB200).
+        No parallelism across scenes/subscenes.
+        """
+
+        # GPUs in one server of this type
+        num_gpus = NUM_GPUS_PER_SERVER[gpu_type]
+        models = self._init_single_server_models(gpu_type)
+
+        result = evaluate_model_allocation(
+            models=models,
+            num_gpus={gpu_type: num_gpus},
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            round_up_cost_to_server=True,
+        )
+
+        if verbose:
+            model_device = models[gpu_type]
+            print_data = [
+                [Model.GEMMA.value, round(model_device[Model.GEMMA][0].time, 2)],
+                [Model.FLUX.value, round(model_device[Model.FLUX][0].time, 2)],
+                [Model.HF.value, round(model_device[Model.HF][0].time, 2)],
+                [Model.HF_VAE.value, round(model_device[Model.HF_VAE][0].time, 2)],
+                [Model.FT.value, round(model_device[Model.FT][0].time, 2)],
+                [Model.FT_VAE.value, round(model_device[Model.FT_VAE][0].time, 2)],
+            ]
+            if self.policy.use_upscaler:
+                print_data.append([Model.UPSCALER.value, round(model_device[Model.UPSCALER][0].time, 2)])
+            print(f"Total time: {result.total_time_s:.2f} seconds")
+            print(tabulate(
+                print_data,
+                headers=["Model", "Time (seconds)"],
+                tablefmt="pretty",
+                colalign=["left", "right"]
+            ))
+            self._print_final_allocation(
+                models=models,
+                used_devices={gpu_type: num_gpus},
+                total_devices={gpu_type: num_gpus},
+                power_data=self.power_data,
+                total_time_s=result.total_time_s,
+                ttff_s=result.ttff_s,
+                first_chunk_time=result.first_chunk_time,
+                tbf_s=result.tbf_s,
+                total_energy=result.total_energy if self.power_data else 0.0,
+                cost=result.cost,
+            )
+
+        return Result(
+            total_time_s=result.total_time_s,
+            models=models,
+            gpus_used={gpu_type: num_gpus},
+            ttff_s=result.ttff_s,
+            tbf_s=result.tbf_s,
+            total_energy=result.total_energy if self.power_data else 0.0,
+            cost=result.cost,
+        )
+
+    def _pick_from_single_device_mapping(
+        self,
+        num_gpus: int,
+        gpu_type: GPUType,
+        verbose: bool = False,
+        allow_removal: bool = False,
+        allow_merging: bool = False,
+        look_ahead_replicas: int = 3,
+    ) -> Result:
+        """
+        Calculate time and energy based on a single GPU type.
+        """
+        assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type]
+        latency_gpu_data = self.latency_data[gpu_type]
+        assert gpu_type == latency_gpu_data.gpu_type
+
+        if self.power_data is not None:
+            power_gpu_data = self.power_data[gpu_type]
+            assert gpu_type == power_gpu_data.gpu_type
+
+        # Initialize allocations
+        models = self._init_single_device_models(gpu_type)
+
+        remaining_gpus = num_gpus - calc_used_gpus(models)
+
+        assert 0 <= remaining_gpus <= num_gpus
+
+        # Optimization loop
+        it = 0
+        prev_metric = None
+        switch_objective = False
+        while remaining_gpus > 0:
+            # Calculate current iteration times
+            evaluate_model_allocation(
+                models=models,
+                num_gpus={gpu_type: num_gpus},
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+                round_up_cost_to_server=False,
+            )
+
+            # Calculate potential actions for each optimization option
+            actions = gen_actions(
+                num_gpus={gpu_type: num_gpus},
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                workflow=self.workflow,
+                models=models,
+                policy=self.policy,
+                allow_removal=allow_removal,
+                allow_merging=allow_merging,
+                look_ahead_replicas=look_ahead_replicas,
+            )
+
+            if not actions:
+                logging.debug(f"No more actions possible after {it} iterations for {self.policy}")
+                break
+
+            best_action = choose_action(
+                actions,
+                self.policy.objective,
+                switch_objective=switch_objective)
+
+            if not best_action:
+                logging.debug("No action selected.")
+                break
+
+            new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective)
+            if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
+                msg = f"No improvement from actions after {it} iterations for {self.policy}."
+                msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
+                if verbose:
+                    print(msg)
+                logging.debug(msg)
+                if not USE_ALL_GPUS:
+                    logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.")
+                    break
+                switch_objective = True
+
+            models = apply_action(best_action, models)
+
+            models = simplify_model_allocations(models)
+
+            remaining_gpus = num_gpus - calc_used_gpus(models)
+            prev_metric = new_metric
+
+            if verbose:
+                self._print_iteration(it, models, {gpu_type: num_gpus})
+                print(f"Metric: {new_metric:.2f}")
+                print(f"{len(actions)} actions:")
+                for action in actions:
+                    if action == best_action:
+                        print(f"  * {action} (best)")
+                    else:
+                        print(f"    {action}")
+                print(f"Applied: {best_action}")
+                print(f"Remaining devices: {remaining_gpus}x{gpu_type}")
+
+            it += 1
+            if it > MAX_ITERATIONS:
+                logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). Stopping optimization loop.")
+                break
+
+        result = evaluate_model_allocation(
+            models=models,
+            num_gpus={gpu_type: num_gpus},
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            round_up_cost_to_server=True,
+        )
+
+        if verbose:
+            self._print_final_allocation(
+                models=models,
+                used_devices=result.gpus_used,
+                total_devices={gpu_type: num_gpus},
+                power_data=self.power_data,
+                total_time_s=result.total_time_s,
+                ttff_s=result.ttff_s,
+                first_chunk_time=result.first_chunk_time,
+                tbf_s=result.tbf_s,
+                total_energy=result.total_energy if self.power_data else 0.0,
+                cost=result.cost,
+            )
+
+        if not self.policy.is_disaggregated(Model.HF):
+            if models[gpu_type][Model.HF_VAE]:
+                assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \
+                    "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
+        if not self.policy.is_disaggregated(Model.FT):
+            if models[gpu_type][Model.FT_VAE]:
+                assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \
+                    "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
+        num_gpus_used = result.gpus_used[gpu_type]
+        assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}"
+
+        return Result(
+            total_time_s=result.total_time_s,
+            models=models,
+            gpus_used={gpu_type: num_gpus_used},
+            gpus_total={gpu_type: num_gpus},
+            ttff_s=result.ttff_s,
+            tbf_s=result.tbf_s,
+            total_energy=result.total_energy if self.power_data else 0.0,
+            cost=result.cost,
+        )
+
+    def _print_iteration(
+        self,
+        it: int,
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+        num_gpus: dict[GPUType, int],
+    ) -> None:
+        print(f"--- Iteration {it} ---")
+
+        for gpu_type in models.keys():
+            total_gpus = calc_used_gpus({gpu_type: models[gpu_type]})
+            print(f"Current {gpu_type.value} allocation: {total_gpus}/{num_gpus[gpu_type]} GPUs")
+            for model in Model:
+                for model_instance in models[gpu_type][model]:
+                    if model_instance.get_num_gpus() > 0:
+                        print(f"  {model.value:10s}:\t{model_instance}")
+
+        # Find the bottleneck stage
+        stage_times: dict[Model, float] = {}
+        ttff_times: dict[Model, float] = {}
+        for model_name in Model:
+            times = []
+            times_first = []
+            for gpu_type in models.keys():
+                for model_alloc in models[gpu_type][model_name]:
+                    times.append(model_alloc.time)
+                    times_first.append(model_alloc.time_first)
+            stage_times[model_name] = max(times) if times else 0.0
+            ttff_times[model_name] = max(times_first) if times_first else 0.0
+
+        bottleneck_stage, bottleneck_time = max(
+            stage_times.items(),
+            key=itemgetter(1)
+        )
+        bottleneck_ttff_stage, bottleneck_ttff_time = max(
+            ttff_times.items(),
+            key=itemgetter(1)
+        )
+        print(f"Bottleneck: {bottleneck_stage} ({bottleneck_time:.2f}s)")
+        print(f"Bottleneck TTFF: {bottleneck_ttff_stage} ({bottleneck_ttff_time:.2f}s)")
+        # bottleneck stage is not necessarily the stage with the
+        # highest potential gain from scaling up/out
+
+    def _print_final_allocation(
+        self,
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+        used_devices: dict[GPUType, int],
+        total_devices: dict[GPUType, int],
+        power_data: Optional[PowerData],
+        total_time_s: float,
+        ttff_s: float,
+        first_chunk_time: float,
+        tbf_s: float,
+        total_energy: float,
+        cost: float,
+    ) -> None:
+        print("=== FINAL ALLOCATION ===")
+        print("Total devices used/available:")
+        for gpu_type, total_device in total_devices.items():
+            used_device = used_devices[gpu_type]
+            print(f"  {gpu_type.value}: {used_device}/{total_device}")
+        print("Model allocations:")
+        for gpu_type in models.keys():
+            print(f"  {gpu_type.value} ({used_devices[gpu_type]} used):")
+            for model in Model:
+                for model_alloc in models[gpu_type][model]:
+                    print(f"    {model.value:10s}:\t{model_alloc}")
+        print(f"Total time: {total_time_s:.2f} seconds ({total_time_s / SECONDS_IN_MINUTE:.2f} minutes)")
+        print(f"TTFF: {ttff_s:.2f} seconds")
+        print(f"First chunk time: {first_chunk_time:.2f} seconds")
+        print(f"TBF: {tbf_s:.2f} seconds")
+        print(f"Total cost: ${cost:.2f}")
+        if power_data is not None:
+            print(f"Total energy: {total_energy:.2f} Ws ({total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh)")
diff --git a/streamwise/model_provisioner/helix.py b/streamwise/model_provisioner/helix.py
new file mode 100644
index 00000000..e8fededf
--- /dev/null
+++ b/streamwise/model_provisioner/helix.py
@@ -0,0 +1,403 @@
+"""
+Helix algorithm for the StreamWise workflow allocation problem.
+
+Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
+
+Helix optimizes models one-by-one following MODEL_ORDER, using MILP
+for each model's resource allocation.  After each model reaches convergence
+(solver optimality or per-model time limit), its allocation is fixed and the
+remaining GPU budget is passed to the next model.
+
+Design rationale:
+    HelixAllocator does NOT inherit from MILPAllocator because the parent's
+    allocate() builds a single joint MILP for all models simultaneously.
+    Instead, HelixAllocator extends ModelAllocator and *composes*
+    MILPAllocator instances — one per model in the workflow.
+
+    For each model, a per-model WorkflowConfig is created whose model_work
+    contains only the target model (every other model is left out rather
+    than set to 0).  The inner MILP therefore only builds variables and
+    constraints for that one model, so no changes to milp.py are
+    required.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from dataclasses import replace
+from typing import Optional
+
+from sim_types import Result
+from sim_types import GPUType
+from sim_types import WorkflowConfig
+from sim_types import PowerData
+from sim_types import LatencyData
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import Policy
+from sim_types import Solver
+from sim_types import MODEL_ORDER
+
+from model_allocator import ModelAllocator
+
+from evaluator import evaluate_model_allocation
+
+from .milp import MILPAllocator
+
+from .policies import HELIX_POLICY
+from .policies import MAX_DEVICES
+
+from constants import DEVICE_OPTIONS
+
+
+# Default per-model MILP solver time limit in seconds.
+# Each model gets this long to converge before the solver moves on.
+DEFAULT_PER_MODEL_TIME_LIMIT = 30
+
+
+def _compute_per_model_gpu_budget(
+    model_order: list[Model],
+    num_gpus: dict[GPUType, int],
+    workflow: WorkflowConfig,
+) -> dict[Model, dict[GPUType, int]]:
+    """Compute a per-model GPU budget so every model gets a fair share.
+
+    Budget is proportional to each model's ``MAX_DEVICES`` weight; models
+    with zero work are skipped.  Models not in ``MAX_DEVICES`` (e.g. OTHERS,
+    UPSCALER) are weighted by ``min(DEVICE_OPTIONS[m])`` (1 if unlisted).
+    For each GPU type the budgets never add up to more than its GPU count.
+
+    Shares are floored, raised to at least 1 and clamped to the GPUs still
+    unassigned; any remainder is then distributed round-robin from the start.
+
+    Returns:
+        Mapping ``model -> {gpu_type -> max_gpus}`` that the model may use.
+    """
+    # Effective weight per model (max useful devices)
+    weights: dict[Model, int] = {}
+    for m in model_order:
+        if workflow.model_work.get(m, 0) == 0:
+            continue
+        if m in MAX_DEVICES:
+            weights[m] = MAX_DEVICES[m]
+        else:
+            # Models not in MAX_DEVICES (OTHERS, UPSCALER) get min allocation
+            weights[m] = min(DEVICE_OPTIONS.get(m, [1]))
+
+    total_weight = sum(weights.values())
+    if total_weight == 0:
+        # Fallback: equal split
+        total_weight = len(weights) or 1
+        weights = {m: 1 for m in weights}
+
+    budget: dict[Model, dict[GPUType, int]] = {}
+    for gpu_type, total in num_gpus.items():
+        # Floor allocation per model
+        allocated = 0
+        per_model: dict[Model, int] = {}
+        for m in model_order:
+            if m not in weights:
+                continue
+            share = int(total * weights[m] / total_weight)
+            # At least 1 GPU per model, but never more than is still unassigned
+            share = min(max(share, 1), max(total - allocated, 0))
+            per_model[m] = share
+            allocated += share
+
+        # Distribute remainder round-robin
+        remainder = total - allocated
+        idx = 0
+        models_list = [m for m in model_order if m in per_model]
+        while remainder > 0 and models_list:
+            m = models_list[idx % len(models_list)]
+            per_model[m] += 1
+            remainder -= 1
+            idx += 1
+
+        for m in model_order:
+            if m not in per_model:
+                continue
+            if m not in budget:
+                budget[m] = {}
+            budget[m][gpu_type] = per_model[m]
+
+    return budget
+
+
+class HelixAllocator(ModelAllocator):
+    """
+    Helix-style allocator that optimizes models one at a time
+    using MILP, sequentially following MODEL_ORDER.
+
+    Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
+
+    Key approach:
+    1. For each model in MODEL_ORDER, create a per-model MILP sub-problem
+       where only the target model has non-zero work.
+    2. Solve the MILP with the remaining GPU budget and a per-model time limit.
+    3. Fix the allocation for that model and subtract used GPUs.
+    4. Move to the next model with the remaining GPU budget.
+    5. Combine all per-model allocations into the final result.
+
+    The HelixAllocator uses composition (not inheritance) with MILPAllocator,
+    creating a separate MILPAllocator instance for each model's sub-problem.
+    This avoids modifying the joint MILP formulation and allows per-model
+    solver configurations.
+    """
+
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = HELIX_POLICY,
+    ) -> None:
+        super().__init__(
+            workflow,
+            latency_data,
+            power_data,
+            policy,
+        )
+        assert self.policy.solver == Solver.HELIX
+
+    def allocate(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+        per_model_time_limit: int = DEFAULT_PER_MODEL_TIME_LIMIT,
+        milp_solver: Solver = Solver.HIGHS,
+    ) -> Result:
+        """
+        Allocate resources model-by-model following MODEL_ORDER.
+
+        For each model, a MILPAllocator is created with a workflow where
+        only the target model has non-zero work.  The MILP solver optimizes
+        the allocation for that model within the remaining GPU budget.
+
+        Args:
+            num_gpus: Available GPUs per type.
+            verbose: If True, print per-model allocation details.
+            per_model_time_limit: Time limit (seconds) for each per-model MILP solve.
+            milp_solver: MILP solver backend to use (GUROBI or HIGHS).
+
+        Returns:
+            Combined Result across all models.
+        """
+        assert milp_solver in (Solver.GUROBI, Solver.HIGHS), \
+            f"milp_solver must be GUROBI or HIGHS, got {milp_solver}"
+
+        model_order = self.workflow.get_model_order()
+        if not self.policy.use_upscaler and Model.UPSCALER in model_order:
+            # Remove UPSCALER from model_order if not using upscaler to avoid unnecessary MILP solve
+            model_order.remove(Model.UPSCALER)
+        remaining_gpus = dict(num_gpus)
+
+        # ---- GPU budget partitioning ----
+        # Pre-compute a per-model GPU budget proportional to MAX_DEVICES
+        # so that early models cannot starve later ones.  Unused GPUs from
+        # one model roll over to subsequent models.
+        gpu_budget = _compute_per_model_gpu_budget(
+            model_order, num_gpus, self.workflow,
+        )
+
+        if verbose:
+            logging.info("Helix GPU budget per model:")
+            for m in model_order:
+                if m in gpu_budget:
+                    logging.info(f"  {m.value}: {gpu_budget[m]}")
+
+        # Accumulated per-model allocations and metrics
+        all_model_allocations: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}
+        total_makespan = 0.0
+        total_ttff = 0.0
+        total_cost = 0.0
+        total_energy = 0.0
+        total_gpus_used: dict[GPUType, int] = {gt: 0 for gt in num_gpus}
+
+        for model in model_order:
+            work = self.workflow.model_work.get(model, 0)
+            if work == 0:
+                continue
+
+            # Skip VAE models when disaggregation is disabled for the parent.
+            # Their latency is folded into the parent model's time calculation.
+            if model == Model.HF_VAE and not self.policy.is_disaggregated(Model.HF):
+                continue
+            if model == Model.FT_VAE and not self.policy.is_disaggregated(Model.FT):
+                continue
+
+            # Check if any GPUs remain
+            if all(v <= 0 for v in remaining_gpus.values()):
+                logging.warning(
+                    f"Helix: No GPUs remaining for {model.value}. Skipping.")
+                continue
+
+            # Filter out GPU types with 0 remaining.
+            # Cap per-model GPUs to the budget so later models are not starved.
+            model_budget = gpu_budget.get(model, {})
+            active_gpus = {
+                gt: min(count, model_budget.get(gt, count))
+                for gt, count in remaining_gpus.items()
+                if count > 0 and (gt not in model_budget or model_budget[gt] > 0)
+            }
+
+            if verbose:
+                logging.info(
+                    f"--- Helix: Optimizing {model.value} "
+                    f"(work={work}) with remaining GPUs: {active_gpus} ---"
+                )
+
+            # ---- build per-model workflow ----
+            # Only the target model has work; other models are excluded from
+            # model_work so the MILP only builds variables/constraints for it.
+            per_model_work = {model: self.workflow.model_work[model]}
+            per_model_workflow = replace(
+                self.workflow,
+                model_work=per_model_work,
+            )
+
+            # ---- build MILP-compatible policy ----
+            # The inner MILPAllocator requires solver ∈ {GUROBI, HIGHS}.
+            # Force disaggregation / use_upscaler flags so that the inner
+            # MILP's ``model_names`` list includes VAE / UPSCALER when those
+            # are the target model.  Without this, the MILP would construct
+            # an empty model set and produce a trivial (infeasible) problem.
+            disag = {}  # dict(self.policy.disaggregation)
+            if model == Model.HF_VAE and self.policy.is_disaggregated(Model.HF):
+                disag[Model.HF] = True
+            if model == Model.FT_VAE and self.policy.is_disaggregated(Model.FT):
+                disag[Model.FT] = True
+            milp_policy = Policy(
+                name=self.policy.name,
+                gpu_cost=self.policy.gpu_cost,
+                objective=self.policy.objective,
+                # disaggregation=self.policy.disaggregation or model == Model.HF_VAE,
+                disaggregation=disag,
+                use_upscaler=self.policy.use_upscaler or model == Model.UPSCALER,
+                hardware=self.policy.hardware,
+                solver=milp_solver,
+            )
+
+            # ---- solve per-model MILP ----
+            milp_allocator = MILPAllocator(
+                workflow=per_model_workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=milp_policy,
+            )
+
+            result = milp_allocator.allocate(
+                num_gpus=active_gpus,
+                verbose=verbose,
+                time_limit=per_model_time_limit,
+                # Use running_cost=True for linear cost formulation (HiGHS-compatible)
+                running_cost=(milp_solver == Solver.HIGHS),
+                # Skip server constraint: per-model allocations don't need
+                # to be multiples of NUM_GPUS_PER_SERVER.
+                skip_server_constraint=True,
+            )
+
+            if result.total_time_s == 0.0 and not result.models:
+                logging.warning(
+                    f"Helix: MILP failed for {model.value}. Skipping.")
+                continue
+
+            # ---- record allocations & snap devices to DEVICE_OPTIONS ----
+            # The MILP constrains devices to DEVICE_OPTIONS, but floating-point
+            # precision in the solver can occasionally produce off-by-one values
+            # (e.g. 31 instead of 32).  Snap each replica to the nearest valid
+            # option, adjusting the GPU accounting so we don't exceed the total
+            # budget passed to evaluate_model_allocation at the end.
+            for gpu_type, model_dict in result.models.items():
+                if gpu_type not in all_model_allocations:
+                    all_model_allocations[gpu_type] = {}
+                for m_name, allocs in model_dict.items():
+                    for alloc in allocs:
+                        valid_devices = DEVICE_OPTIONS.get(m_name, [1])
+                        if alloc.devices not in valid_devices:
+                            nearest = min(valid_devices, key=lambda d: abs(d - alloc.devices))
+                            diff = nearest - alloc.devices  # positive = round up
+                            gpu_avail = remaining_gpus.get(gpu_type, 0) - result.gpus_used.get(gpu_type, 0)
+                            if diff > 0 and gpu_avail < diff:
+                                # Not enough spare GPUs to round up; round down instead
+                                nearest = max(
+                                    (d for d in valid_devices if d <= alloc.devices),
+                                    default=valid_devices[0],
+                                )
+                                diff = nearest - alloc.devices
+                            logging.info(
+                                f"Helix: snapping {m_name.value} from "
+                                f"{alloc.devices} to {nearest} devices "
+                                f"(solver precision fix, diff={diff:+d})")
+                            # Adjust GPU accounting for this model's result
+                            result.gpus_used[gpu_type] = result.gpus_used.get(gpu_type, 0) + diff
+                            alloc.devices = nearest
+                    all_model_allocations[gpu_type][m_name] = allocs
+
+            # ---- accumulate metrics ----
+            total_makespan += result.total_time_s
+            total_ttff += result.ttff_s
+            total_cost += result.cost
+            total_energy += result.total_energy
+            if verbose:
+                print(f'Model {model.value} - Time: {result.total_time_s:.2f}s,'
+                      f'TTFF: {result.ttff_s:.2f}s, Cost: ${result.cost:.2f}')
+                print(f'Total cost so far: ${total_cost:.2f}, Total time so far: {total_makespan:.2f}s,'
+                      f'Total TTFF so far: {total_ttff:.2f}s')
+                print(f'GPUs allocated for {model.value}: {result.gpus_used}')
+
+            # ---- subtract used GPUs ----
+            for gpu_type, used in result.gpus_used.items():
+                remaining_gpus[gpu_type] = remaining_gpus.get(gpu_type, 0) - used
+                total_gpus_used[gpu_type] = total_gpus_used.get(gpu_type, 0) + used
+
+            # ---- roll over unused budget to next models ----
+            # If this model used fewer GPUs than its budget, the surplus
+            # is distributed evenly among the remaining models.
+            remaining_models = [
+                m for m in model_order
+                if m in gpu_budget and MODEL_ORDER.get(m, 0) > MODEL_ORDER.get(model, 0)
+            ]
+            if remaining_models:
+                for gpu_type in num_gpus:
+                    budget_for_model = model_budget.get(gpu_type, 0)
+                    used_by_model = result.gpus_used.get(gpu_type, 0)
+                    surplus = budget_for_model - used_by_model
+                    if surplus > 0:
+                        per_model_extra = surplus // len(remaining_models)
+                        leftover = surplus % len(remaining_models)
+                        for i, rm in enumerate(remaining_models):
+                            extra = per_model_extra + (1 if i < leftover else 0)
+                            gpu_budget[rm][gpu_type] = gpu_budget[rm].get(gpu_type, 0) + extra
+
+            if verbose:
+                print(
+                    f"Helix: {model.value} allocated.  "
+                    f"Time: {result.total_time_s:.2f}s, "
+                    f"TTFF: {result.ttff_s:.2f}s, "
+                    f"GPUs used: {result.gpus_used}, "
+                    f"Remaining: {remaining_gpus}"
+                )
+
+        result = evaluate_model_allocation(
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            models=all_model_allocations,
+            num_gpus=num_gpus,
+        )
+
+        if verbose:
+            print(
+                f"=== Helix final: "
+                f"Makespan={result.total_time_s:.2f}s, "
+                f"TTFF={result.ttff_s:.2f}s, "
+                f"TBF={result.tbf_s:.4f}s, "
+                f"Cost=${result.cost:.2f}, "
+                f"Energy={result.total_energy:.2f}Ws, "
+                f"GPUs used={result.gpus_used} ==="
+            )
+
+        return result
diff --git a/streamwise/model_provisioner/hexgen.py b/streamwise/model_provisioner/hexgen.py
new file mode 100644
index 00000000..4f37768a
--- /dev/null
+++ b/streamwise/model_provisioner/hexgen.py
@@ -0,0 +1,629 @@
+"""
+HexGen algorithm for the StreamWise workflow allocation problem.
+
+Reference: https://arxiv.org/abs/2311.11514
+
+HexGen treats each model in the workflow as an independent component for optimization.
+It tracks metrics per model and optimizes models sequentially according to MODEL_ORDER.
+When a model's metric converges (stops dropping), it moves to the next model.
+After the last model converges, it cycles back to the first model and allocates
+remaining GPUs until exhausted.
+"""
+
+from __future__ import annotations
+import logging
+from typing import Optional
+
+from sim_types import Result
+from sim_types import GPUType
+from sim_types import WorkflowConfig
+from sim_types import PowerData
+from sim_types import LatencyData
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import Policy
+from sim_types import Solver
+from sim_types import MODEL_ORDER
+
+from utils import simplify_model_allocations
+
+from evaluator import calc_used_gpus
+from evaluator import evaluate_model_allocation
+
+from .greedy import GreedyAllocator
+
+from actions import gen_actions
+from actions import choose_action
+from actions import apply_action
+
+from .policies import HEXGEN_POLICY
+from .policies import MAX_ITERATIONS
+from .policies import USE_ALL_GPUS
+
+
+def _get_model_order(workflow: WorkflowConfig) -> list[Model]:
+    """Return the workflow models that have a MODEL_ORDER rank, sorted by ascending rank."""
+    ranked = [candidate for candidate in workflow.models if candidate in MODEL_ORDER]
+    # list.sort() is stable like sorted(), so equally ranked models keep their workflow order.
+    ranked.sort(key=lambda candidate: MODEL_ORDER[candidate])
+    return ranked
+
+
+class HexGenAllocator(GreedyAllocator):
+    """
+    HexGen-style allocator that optimizes models one at a time,
+    sequentially following MODEL_ORDER.
+
+    Reference: https://arxiv.org/abs/2311.11514
+
+    Key differences from GreedyAllocator:
+    1. Each model is treated as an independent optimization target.
+    2. Per-model metrics are tracked separately.
+    3. Models are optimized in MODEL_ORDER sequence. When a model's metric
+       converges, it moves to the next model. After the last model, and only
+       if USE_ALL_GPUS is set, it cycles through the models again to use leftover GPUs.
+    """
+
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = HEXGEN_POLICY,
+    ) -> None:
+        """Set up the parent allocator, then check that the policy selects the HexGen solver.
+
+        All four arguments are handed unchanged, in order, to the parent constructor.
+        """
+        super().__init__(workflow, latency_data, power_data, policy)
+        # self.policy is read only after the parent constructor has run.
+        assert self.policy.solver == Solver.HEXGEN
+
+    def _pick_from_single_device_mapping(
+        self,
+        num_gpus: int,
+        gpu_type: GPUType,
+        verbose: bool = False,
+        allow_removal: bool = False,  # not read by this method
+        allow_merging: bool = False,  # not read by this method
+        look_ahead_replicas: int = 3,  # not read by this method
+    ) -> Result:
+        """
+        HexGen-style allocation for one GPU type; asserts num_gpus >= NUM_GPUS_PER_SERVER[gpu_type].
+        Makes one pass over the models in MODEL_ORDER, then fills leftover GPUs if USE_ALL_GPUS is set.
+        """
+        from constants import NUM_GPUS_PER_SERVER
+
+        assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type]
+
+        # Initialize allocations (same as GreedyAllocator)
+        models = self._init_single_device_models(gpu_type)
+
+        remaining_gpus = num_gpus - calc_used_gpus(models)
+        assert 0 <= remaining_gpus <= num_gpus
+
+        # --- HexGen per-model sequential optimization ---
+        model_order = _get_model_order(self.workflow)
+        per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order}
+
+        it = 0  # actions applied so far; also handed to the fill phase and capped by MAX_ITERATIONS
+        current_model_idx = 0
+        cycles_without_progress = 0  # completed passes over model_order; never reset
+        total_models = len(model_order)
+
+        while remaining_gpus > 0:
+            if current_model_idx >= total_models:
+                # Completed a full pass: the counter becomes 1, so the check below always breaks
+                current_model_idx = 0
+                cycles_without_progress += 1
+                if cycles_without_progress >= 1:
+                    logging.debug(
+                        f"HexGen: No progress after {cycles_without_progress} full cycles.")
+                    break
+
+            current_model = model_order[current_model_idx]
+
+            if verbose:
+                print(f"--- HexGen: Optimizing {current_model.value} "
+                      f"(model {current_model_idx + 1}/{total_models}) ---")
+
+            # Inner loop: keep optimizing current model until convergence
+            inner_it = 0
+            while remaining_gpus > 0:
+                # Evaluate current state (the return value is discarded)
+                evaluate_model_allocation(
+                    models=models,
+                    num_gpus={gpu_type: num_gpus},
+                    workflow=self.workflow,
+                    latency_data=self.latency_data,
+                    power_data=self.power_data,
+                    policy=self.policy,
+                    round_up_cost_to_server=False,
+                )
+
+                # Generate candidate actions; they are narrowed to the current model below
+                all_actions = gen_actions(
+                    num_gpus={gpu_type: num_gpus},
+                    latency_data=self.latency_data,
+                    power_data=self.power_data,
+                    workflow=self.workflow,
+                    models=models,
+                    policy=self.policy,
+                )
+
+                # Filter to actions targeting the current model only
+                model_actions = [a for a in all_actions if a.model == current_model]
+
+                if not model_actions:
+                    logging.debug(
+                        f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.")
+                    break
+
+                best_action = choose_action(model_actions, self.policy.objective)
+
+                if not best_action:
+                    logging.debug(f"HexGen: No action selected for {current_model.value}.")
+                    break
+
+                new_metric = best_action.get_metric(self.policy.objective)
+                prev_metric = per_model_metrics[current_model]  # None until this model's first action
+
+                if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
+                    msg = (
+                        f"HexGen: {current_model.value} converged after {inner_it} inner iterations. "
+                        f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
+                    )
+                    if verbose:
+                        print(msg)
+                    logging.debug(msg)
+                    break
+
+                per_model_metrics[current_model] = new_metric
+
+                models = apply_action(best_action, models=models)
+                models = simplify_model_allocations(models)
+
+                remaining_gpus = num_gpus - calc_used_gpus(models)
+
+                if verbose:
+                    self._print_iteration(it, models, {gpu_type: num_gpus})
+                    print(f"HexGen: Applied action for {current_model.value}, "
+                          f"metric: {new_metric:.2f}, remaining: {remaining_gpus}")
+
+                it += 1
+                inner_it += 1
+
+                if it > MAX_ITERATIONS:
+                    logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
+                    break
+
+            if it > MAX_ITERATIONS:  # propagate the inner-loop break to the outer loop
+                break
+
+            current_model_idx += 1
+
+        # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER ---
+        remaining_gpus = num_gpus - calc_used_gpus(models)
+        if USE_ALL_GPUS and remaining_gpus > 0:
+            models = self._fill_remaining_gpus_single(
+                models=models,
+                num_gpus=num_gpus,
+                gpu_type=gpu_type,
+                model_order=model_order,
+                it=it,
+                verbose=verbose,
+            )
+
+        # Final evaluation (round_up_cost_to_server=True, unlike the in-loop evaluations)
+        result = evaluate_model_allocation(
+            models=models,
+            num_gpus={gpu_type: num_gpus},
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            round_up_cost_to_server=True,
+        )
+
+        if verbose:
+            self._print_final_allocation(
+                models=models,
+                used_devices=result.gpus_used,
+                total_devices={gpu_type: num_gpus},
+                power_data=self.power_data,
+                total_time_s=result.total_time_s,
+                ttff_s=result.ttff_s,
+                first_chunk_time=result.first_chunk_time,
+                tbf_s=result.tbf_s,
+                total_energy=result.total_energy if self.power_data else 0.0,
+                cost=result.cost,
+            )
+
+        if not self.policy.is_disaggregated(Model.HF):  # sanity check only; nothing is modified
+            if models[gpu_type][Model.HF_VAE]:
+                assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \
+                    "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
+        if not self.policy.is_disaggregated(Model.FT):  # sanity check only; nothing is modified
+            if models[gpu_type][Model.FT_VAE]:
+                assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \
+                    "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
+
+        num_gpus_used = result.gpus_used[gpu_type]
+        assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}"
+
+        return Result(
+            total_time_s=result.total_time_s,
+            models=models,
+            gpus_used={gpu_type: num_gpus_used},
+            gpus_total={gpu_type: num_gpus},
+            ttff_s=result.ttff_s,
+            tbf_s=result.tbf_s,
+            total_energy=result.total_energy if self.power_data else 0.0,
+            cost=result.cost,
+        )
+
+    def _pick_from_both_devices_mapping(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+        allow_removal: bool = False,  # not read by this method
+        allow_merging: bool = False,  # not read by this method
+        look_ahead_replicas: int = 3,  # not read by this method
+    ) -> Result:
+        """
+        HexGen-style allocation; asserts exactly two GPU types, each with >= NUM_GPUS_PER_SERVER GPUs.
+        Makes one pass over the models in MODEL_ORDER, then fills leftover GPUs if USE_ALL_GPUS is set.
+        """
+        from constants import NUM_GPUS_PER_SERVER
+
+        gpu_types = list(num_gpus.keys())
+        assert len(gpu_types) == 2
+        gpu_type1 = gpu_types[0]
+        gpu_type2 = gpu_types[1]
+        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
+        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]
+
+        # Initialize allocations (same as GreedyAllocator)
+        models = self._init_both_devices_models(gpu_type1, gpu_type2)
+
+        remaining_gpus: dict[GPUType, int] = {}
+        for gpu_type in num_gpus.keys():
+            remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
+
+        # --- HexGen per-model sequential optimization ---
+        model_order = _get_model_order(self.workflow)
+        per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order}
+
+        if verbose:
+            evaluate_model_allocation(
+                models=models,
+                num_gpus=num_gpus,
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+                round_up_cost_to_server=True,
+            )
+            self._print_iteration(0, models, num_gpus)
+
+        it = 1  # starts at 1: index 0 is used for the verbose printout of the initial state above
+        current_model_idx = 0
+        cycles_without_progress = 0  # completed passes over model_order; never reset
+        total_models = len(model_order)
+
+        while sum(remaining_gpus.values()) > 0:
+            if current_model_idx >= total_models:
+                current_model_idx = 0
+                cycles_without_progress += 1
+                if cycles_without_progress >= 1:  # always true here, so the loop ends after one pass
+                    logging.debug(
+                        f"HexGen: No progress after {cycles_without_progress} full cycles.")
+                    break
+
+            current_model = model_order[current_model_idx]
+
+            if verbose:
+                print(f"--- HexGen: Optimizing {current_model.value} "
+                      f"(model {current_model_idx + 1}/{total_models}) ---")
+
+            inner_it = 0
+
+            while sum(remaining_gpus.values()) > 0:
+                evaluate_model_allocation(
+                    models=models,
+                    num_gpus=num_gpus,
+                    workflow=self.workflow,
+                    latency_data=self.latency_data,
+                    power_data=self.power_data,
+                    policy=self.policy,
+                    round_up_cost_to_server=False,
+                )
+
+                all_actions = gen_actions(
+                    workflow=self.workflow,
+                    latency_data=self.latency_data,
+                    power_data=self.power_data,
+                    num_gpus=num_gpus,
+                    models=models,
+                    policy=self.policy,
+                )
+
+                # Filter to current model
+                model_actions = [a for a in all_actions if a.model == current_model]
+
+                if not model_actions:
+                    logging.debug(
+                        f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.")
+                    break
+
+                best_action = choose_action(model_actions, self.policy.objective)
+
+                if not best_action:
+                    logging.debug(f"HexGen: No action selected for {current_model.value}.")
+                    break
+
+                new_metric = best_action.get_metric(self.policy.objective)
+                prev_metric = per_model_metrics[current_model]  # None until this model's first action
+
+                if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric:
+                    msg = (
+                        f"HexGen: {current_model.value} converged. "
+                        f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
+                    )
+                    if verbose:
+                        print(msg)
+                    logging.debug(msg)
+                    break
+
+                per_model_metrics[current_model] = new_metric
+
+                models = apply_action(best_action, models=models)
+                models = simplify_model_allocations(models)
+
+                remaining_gpus.clear()
+                for gpu_type in num_gpus.keys():
+                    remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]})
+
+                if verbose:
+                    self._print_iteration(it, models, num_gpus)
+                    print(f"HexGen: Applied action for {current_model.value}, "
+                          f"metric: {new_metric:.2f}")
+                    print("Remaining devices:")
+                    for gt in remaining_gpus:
+                        print(f"  {remaining_gpus[gt]} x {gt.value}")
+
+                it += 1
+                inner_it += 1
+
+                if it > MAX_ITERATIONS:
+                    logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
+                    break
+
+            if it > MAX_ITERATIONS:  # propagate the inner-loop break to the outer loop
+                break
+
+            current_model_idx += 1
+
+        # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER ---
+        remaining_gpus_total = sum(
+            num_gpus[gt] - calc_used_gpus({gt: models[gt]})
+            for gt in num_gpus
+        )
+        if USE_ALL_GPUS and remaining_gpus_total > 0:
+            models = self._fill_remaining_gpus_multi(
+                models=models,
+                num_gpus=num_gpus,
+                model_order=model_order,
+                it=it,
+                verbose=verbose,
+            )
+
+        # Sanity check for no disaggregation: VAE instances must hold 0 GPUs (nothing is adjusted)
+        if not self.policy.is_disaggregated(Model.HF):
+            for models_gpu in models.values():
+                for instance_id in range(len(models_gpu[Model.HF_VAE])):
+                    assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \
+                        "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
+        if not self.policy.is_disaggregated(Model.FT):
+            for models_gpu in models.values():
+                for instance_id in range(len(models_gpu[Model.FT_VAE])):
+                    assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \
+                        "FT_VAE must have 0 GPUs when FT disaggregation is disabled"
+
+        # Final evaluation (round_up_cost_to_server=True, unlike the in-loop evaluations)
+        result = evaluate_model_allocation(
+            models=models,
+            num_gpus=num_gpus,
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            round_up_cost_to_server=True,
+        )
+
+        if verbose:
+            self._print_final_allocation(
+                models=models,
+                used_devices=result.gpus_used,
+                total_devices={
+                    gpu_type1: num_gpus.get(gpu_type1, 0),
+                    gpu_type2: num_gpus.get(gpu_type2, 0),
+                },
+                power_data=self.power_data,
+                total_time_s=result.total_time_s,
+                ttff_s=result.ttff_s,
+                first_chunk_time=result.first_chunk_time,
+                tbf_s=result.tbf_s,
+                total_energy=result.total_energy if self.power_data else 0.0,
+                cost=result.cost,
+            )
+
+        assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \
+            f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}"
+        assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \
+            f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}"
+
+        return Result(  # NOTE(review): gpus_total is not passed here, unlike the single-GPU-type path; confirm
+            total_time_s=result.total_time_s,
+            models=models,
+            gpus_used=result.gpus_used,
+            ttff_s=result.ttff_s,
+            tbf_s=result.tbf_s,
+            total_energy=result.total_energy if self.power_data else 0.0,
+            cost=result.cost,
+        )
+
+    def _fill_remaining_gpus_single(
+        self,
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+        num_gpus: int,
+        gpu_type: GPUType,
+        model_order: list[Model],
+        it: int = 0,  # iteration counter carried over from the caller; checked against MAX_ITERATIONS
+        verbose: bool = False,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """
+        Fill remaining GPUs by cycling through model_order (single GPU type); returns the updated allocations.
+        Applies the action chosen by choose_action for each model in turn, with no convergence check.
+        Stops when all GPUs are used, every model is exhausted, or `it` exceeds MAX_ITERATIONS.
+        """
+        remaining_gpus = num_gpus - calc_used_gpus(models)
+        total_models = len(model_order)
+        model_idx = 0  # ever-increasing; wrapped with % total_models when indexing
+        models_exhausted: set[Model] = set()  # models that yielded no action; never retried
+
+        if verbose:
+            print(f"--- HexGen: USE_ALL_GPUS fill phase, {remaining_gpus} remaining ---")
+
+        while remaining_gpus > 0 and len(models_exhausted) < total_models:
+            current_model = model_order[model_idx % total_models]
+            model_idx += 1
+
+            if current_model in models_exhausted:
+                continue
+
+            evaluate_model_allocation(
+                models=models,
+                num_gpus={gpu_type: num_gpus},
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+                round_up_cost_to_server=False,
+            )
+
+            all_actions = gen_actions(
+                num_gpus={gpu_type: num_gpus},
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                workflow=self.workflow,
+                models=models,
+                policy=self.policy,
+            )
+            model_actions = [a for a in all_actions if a.model == current_model]
+
+            if not model_actions:
+                models_exhausted.add(current_model)
+                logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).")
+                continue
+
+            best_action = choose_action(model_actions, self.policy.objective)
+            if not best_action:
+                models_exhausted.add(current_model)
+                logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).")
+                continue
+
+            models = apply_action(best_action, models=models)
+            models = simplify_model_allocations(models)
+            remaining_gpus = num_gpus - calc_used_gpus(models)
+
+            if verbose:
+                self._print_iteration(it, models, {gpu_type: num_gpus})
+                print(f"HexGen fill: Allocated to {current_model.value}, remaining: {remaining_gpus}")
+
+            it += 1
+            if it > MAX_ITERATIONS:
+                logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
+                break
+
+        return models
+
+    def _fill_remaining_gpus_multi(
+        self,
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
+        num_gpus: dict[GPUType, int],
+        model_order: list[Model],
+        it: int = 0,  # iteration counter carried over from the caller; checked against MAX_ITERATIONS
+        verbose: bool = False,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """
+        Fill remaining GPUs by cycling through model_order (multi GPU type); returns the updated allocations.
+        Applies the action chosen by choose_action for each model in turn, with no convergence check.
+        Stops when all GPUs are used, every model is exhausted, or `it` exceeds MAX_ITERATIONS.
+        """
+        total_remaining = sum(
+            num_gpus[gt] - calc_used_gpus({gt: models[gt]})
+            for gt in num_gpus
+        )
+        total_models = len(model_order)
+        model_idx = 0  # ever-increasing; wrapped with % total_models when indexing
+        models_exhausted: set[Model] = set()  # models that yielded no action; never retried
+
+        if verbose:
+            print(f"--- HexGen: USE_ALL_GPUS fill phase, {total_remaining} remaining ---")
+
+        while total_remaining > 0 and len(models_exhausted) < total_models:
+            current_model = model_order[model_idx % total_models]
+            model_idx += 1
+
+            if current_model in models_exhausted:
+                continue
+
+            evaluate_model_allocation(
+                models=models,
+                num_gpus=num_gpus,
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                policy=self.policy,
+                round_up_cost_to_server=False,
+            )
+
+            all_actions = gen_actions(
+                workflow=self.workflow,
+                latency_data=self.latency_data,
+                power_data=self.power_data,
+                num_gpus=num_gpus,
+                models=models,
+                policy=self.policy,
+            )
+            model_actions = [a for a in all_actions if a.model == current_model]
+
+            if not model_actions:
+                models_exhausted.add(current_model)
+                logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).")
+                continue
+
+            best_action = choose_action(model_actions, self.policy.objective)
+            if not best_action:
+                models_exhausted.add(current_model)
+                logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).")
+                continue
+
+            models = apply_action(best_action, models=models)
+            models = simplify_model_allocations(models)
+            total_remaining = sum(
+                num_gpus[gt] - calc_used_gpus({gt: models[gt]})
+                for gt in num_gpus
+            )
+
+            if verbose:
+                self._print_iteration(it, models, num_gpus)
+                print(f"HexGen fill: Allocated to {current_model.value}, remaining: {total_remaining}")
+
+            it += 1
+            if it > MAX_ITERATIONS:
+                logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
+                break
+
+        return models
diff --git a/streamwise/model_provisioner/milp.py b/streamwise/model_provisioner/milp.py
new file mode 100644
index 00000000..67749258
--- /dev/null
+++ b/streamwise/model_provisioner/milp.py
@@ -0,0 +1,1070 @@
+"""
+MILP formulation for the StreamWise workflow allocation problem.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+
+from typing import Callable
+from typing import Optional
+
+from pyomo.environ import ConcreteModel
+from pyomo.environ import Var
+from pyomo.environ import Set
+from pyomo.environ import Objective as OptObjective
+from pyomo.environ import Binary
+from pyomo.environ import NonNegativeIntegers
+from pyomo.environ import NonNegativeReals
+from pyomo.environ import minimize
+from pyomo.environ import SolverFactory
+from pyomo.environ import ConstraintList
+
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import Result
+from sim_types import Policy
+from sim_types import ModelAllocation
+from sim_types import Objective
+from sim_types import Solver
+
+from models import get_model_allocation
+
+from model_allocator import ModelAllocator
+
+from constants import DEVICE_OPTIONS
+from constants import NUM_GPUS_PER_SERVER
+from constants import SECONDS_IN_HOUR
+
+from .policies import STREAMWISE_MILP_POLICY
+
+
+MAX_INSTANCES = 16  # Instance slots created per (GPU type, model) pair
+
+# Maximum time it can take: 24 hours in seconds
+# Used as the big-M constant in constraints that link TTFF and makespan to instance variables
+MAX_TIME = 24 * SECONDS_IN_HOUR
+
+
+# Objectives that are quadratic (bilinear) - these need Gurobi
+QUADRATIC_OBJECTIVES = [
+    Objective.TTFF_COST,
+    Objective.TIME_ENERGY,
+    Objective.ENERGY_COST,
+]
+
+
+def idx(
+    gpu_type: GPUType,
+    model_name: Model,
+    instance_id: int
+) -> tuple[str, str, int]:
+    """Helper to convert enum to index key for instance variables."""
+    return (gpu_type.value, model_name.value, instance_id)
+
+
+def dev_idx(
+    gpu_type: GPUType,
+    model_name: Model,
+    instance_id: int,
+    num_devices: int
+) -> tuple[str, str, int, int]:
+    """Helper to convert enum to index key for device variables."""
+    return (gpu_type.value, model_name.value, instance_id, num_devices)
+
+
+class MILPAllocator(ModelAllocator):
+    """
+    MILP-based allocator that computes the optimal model allocation.
+    """
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = STREAMWISE_MILP_POLICY,
+    ) -> None:
+        super().__init__(
+            workflow,
+            latency_data,
+            power_data,
+            policy,
+        )
+        assert self.policy.solver in [Solver.GUROBI, Solver.HIGHS]
+
+    def allocate(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+        running_cost: bool = False,  # If True, cost = active time only; False = makespan x GPUs
+        max_cost: Optional[float] = None,  # If set, adds a constraint to limit cost
+        max_ttff: Optional[float] = None,  # If set, adds a constraint to limit TTFF
+        max_makespan: Optional[float] = None,  # If set, adds a constraint to limit makespan
+        time_limit: Optional[int] = None,  # Time limit for the solver in seconds
+        save_solution_path: Optional[str] = None,  # If set, saves the solution to a JSON file
+        warm_start_path: Optional[str] = None,  # If set, loads a warm start solution from a JSON file
+        force_num_gpus: bool = False,  # If True, adds constraints to force the use of all available GPUs
+        skip_server_constraint: bool = False,  # If True, skips the GPU-per-server constraint
+    ) -> Result:
+        """
+        Calculate the optimal model allocation and resulting metrics using MILP formulation.
+        """
+        m = ConcreteModel()
+
+        # Options: "gurobi", "highs"
+        solver_name = self.policy.solver.value
+
+        # Define index sets
+        gpu_types = list(num_gpus.keys())
+
+        model_names = [
+            Model.GEMMA,
+            Model.FLUX,
+            Model.HF,
+            # Model.HF_VAE,
+            Model.FT,
+            # Model.FT_VAE,
+            # Model.UPSCALER,
+            Model.OTHERS,
+        ]
+        if self.policy.use_upscaler:
+            model_names.append(Model.UPSCALER)
+        if self.policy.is_disaggregated(Model.HF):
+            model_names.append(Model.HF_VAE)
+        if self.policy.is_disaggregated(Model.FT):
+            model_names.append(Model.FT_VAE)
+
+        # Remove models not in the workflow
+        model_names = [
+            model_name
+            for model_name in model_names
+            if model_name in self.workflow.models
+        ]
+
+        instance_ids = list(range(MAX_INSTANCES))
+
+        # The units of work that each model has to do
+        work: dict[Model, int] = self.workflow.work
+
+        # Create Pyomo Sets
+        m.GPU_TYPES = Set(initialize=[g.value for g in gpu_types])
+        m.MODEL_NAMES = Set(initialize=[mn.value for mn in model_names])
+        m.INSTANCES = Set(initialize=instance_ids)
+
+        # Create index set for device choices: (gpu_type, model_name, instance_id, device_count)
+        device_index_set = [
+            (gpu_type.value, model_name.value, instance_id, num_devices)
+            for gpu_type in gpu_types
+            for model_name in model_names
+            for instance_id in instance_ids
+            for num_devices in [0] + DEVICE_OPTIONS[model_name]
+        ]
+        m.DEVICE_INDEX = Set(initialize=device_index_set)
+
+        # Create index set for instance variables: (gpu_type, model_name, instance_id)
+        instance_index_set = [
+            (gpu_type.value, model_name.value, instance_id)
+            for gpu_type in gpu_types
+            for model_name in model_names
+            for instance_id in instance_ids
+        ]
+        m.INSTANCE_INDEX = Set(initialize=instance_index_set)
+
+        # Define indexed variables
+        m.device_choice = Var(m.DEVICE_INDEX, domain=Binary)
+        m.work_device = Var(m.DEVICE_INDEX, domain=NonNegativeIntegers)  # Linearization: work per device choice
+        m.gpus = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers)
+        m.is_active = Var(m.INSTANCE_INDEX, domain=Binary)
+        m.is_min = Var(m.INSTANCE_INDEX, domain=Binary)
+        m.work = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers)
+        m.time = Var(m.INSTANCE_INDEX, domain=NonNegativeReals)
+        m.ttff = Var(m.INSTANCE_INDEX, domain=NonNegativeReals)
+
+        # Objective variables
+        m.makespan = Var(domain=NonNegativeReals)
+        m.ttff_user = Var(domain=NonNegativeReals)
+        m.ttff_min = Var(m.MODEL_NAMES, domain=NonNegativeReals)  # Per-model minimum TTFF
+        m.time_max = Var(m.MODEL_NAMES, domain=NonNegativeReals)  # Per-model maximum time
+        m.cost = Var(domain=NonNegativeReals)
+        m.energy = Var(domain=NonNegativeReals)
+
+        # Constraint list for dynamic constraints
+        m.constraints = ConstraintList()
+
+        for gpu_type in gpu_types:
+            for model_name in model_names:
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+
+                    # GPUs used = sum of num_devices * device_choice[num_devices]
+                    m.constraints.add(
+                        m.gpus[key] == sum(
+                            num_devices * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in [0] + DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+                    # Cannot select inactive instance as min
+                    m.constraints.add(m.is_min[key] <= m.is_active[key])
+                    # If active = 0 -> GPUs = 0
+                    m.constraints.add(m.gpus[key] <= num_gpus[gpu_type] * m.is_active[key])
+                    # If active = 1 -> GPUs ≥ 1
+                    m.constraints.add(m.gpus[key] >= m.is_active[key])
+                    # If work = 0 -> active = 0 -> GPUs = 0
+                    m.constraints.add(m.is_active[key] <= m.work[key])
+
+                    # If device = 0 -> work = 0
+                    dev_idx_0 = dev_idx(gpu_type, model_name, instance_id, 0)
+                    m.constraints.add(
+                        m.work[key]
+                        <= work[model_name] * (1 - m.device_choice[dev_idx_0])
+                    )
+
+                    # Linearization: work_device links device_choice and work
+                    # work = sum(work_device[d] for d in devices) - excludes 0 GPUs since they can't do work
+                    m.constraints.add(
+                        m.work[key] == sum(
+                            m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+                    # If any non-zero device is selected, work must be >= 1
+                    m.constraints.add(
+                        m.work[key] >= sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+                    # work_device[d] <= TOTAL_WORK * device_choice[d]
+                    for num_devices in [0] + DEVICE_OPTIONS[model_name]:
+                        didx = dev_idx(gpu_type, model_name, instance_id, num_devices)
+                        m.constraints.add(
+                            m.work_device[didx] <= work[model_name] * m.device_choice[didx]
+                        )
+
+                    # Link instance time to per-model max time
+                    m.constraints.add(m.time[key] <= m.time_max[model_name.value])
+
+                    # Link TTFF to per-model TTFF min
+                    # If selected → ttff_min[model] == ttff_var
+                    m.constraints.add(m.ttff_min[model_name.value] >= m.ttff[key] - MAX_TIME * (1 - m.is_min[key]))
+                    m.constraints.add(m.ttff_min[model_name.value] <= m.ttff[key] + MAX_TIME * (1 - m.is_active[key]))
+
+                # One device per instance
+                for instance_id in instance_ids:
+                    m.constraints.add(
+                        sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in [0] + DEVICE_OPTIONS[model_name]
+                        ) == 1
+                    )
+
+                # Symmetry breaking (fill earlier instances first)
+                for instance_id in range(MAX_INSTANCES - 1):
+                    m.constraints.add(
+                        m.gpus[idx(gpu_type, model_name, instance_id)]
+                        >= m.gpus[idx(gpu_type, model_name, instance_id + 1)]
+                    )
+
+        # Makespan is the sum of max times per model (models run sequentially)
+        m.constraints.add(m.makespan == sum(m.time_max[model_name.value] for model_name in model_names))
+
+        # User TTFF definition: sum of min TTFF per model
+        m.constraints.add(m.ttff_user >= sum(m.ttff_min[model_name.value] for model_name in model_names))
+        m.constraints.add(m.ttff_user >= m.makespan - self.workflow.total_video_seconds)
+
+        # Select exactly 1 instance as the min TTFF instance per model
+        for model_name in model_names:
+            m.constraints.add(
+                sum(
+                    m.is_min[idx(gpu_type, model_name, instance_id)]
+                    for gpu_type in gpu_types
+                    for instance_id in instance_ids
+                ) == 1
+            )
+
+        # Resolution scaling factor for HF/VAE/FT
+        latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler)
+
+        # Time constraints
+        # Each model block is guarded by membership in model_names so that
+        # the MILP can be built for a subset of models (e.g. Helix per-model).
+        for gpu_type in gpu_types:
+            # Gemma
+            if Model.GEMMA in model_names and work[Model.GEMMA] > 0:
+                model_name = Model.GEMMA
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Makespan is the max time across all instances
+                    # Linearized: use work_device instead of device_choice * work
+                    if work[model_name] > 1:
+                        # Parallel: each work unit = 1 scene
+                        # Time for w scenes
+                        # = gemma_first_scene + gemma_per_scene * (w - 1)
+                        # = (gemma_first_scene - gemma_per_scene) * is_active + gemma_per_scene * work
+                        # Using linearized variables:
+                        # = (gemma_first_scene[d] - gemma_per_scene[d]) * \
+                        # device_choice[d] + gemma_per_scene[d] * work_device[d]
+                        m.constraints.add(
+                            m.time[key] == sum(
+                                (
+                                    self.latency_data[gpu_type].gemma_first_scene[num_devices]
+                                    - self.latency_data[gpu_type].gemma_per_scene[num_devices]
+                                )
+                                * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                                + self.latency_data[gpu_type].gemma_per_scene[num_devices]
+                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                                for num_devices in DEVICE_OPTIONS[model_name]
+                            )
+                        )
+                    else:
+                        m.constraints.add(
+                            m.time[key] == sum(
+                                (
+                                    self.latency_data[gpu_type].gemma_first_scene[num_devices]
+                                    + self.latency_data[gpu_type].gemma_per_scene[num_devices]
+                                    * (self.workflow.total_scenes - 1)
+                                )
+                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                                for num_devices in DEVICE_OPTIONS[model_name]
+                            )
+                        )
+                    # TTFF is for 1 work unit
+                    m.constraints.add(
+                        m.ttff[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.latency_data[gpu_type].gemma_first_scene[num_devices]
+                            * 1  # TTFF for tokens in first scene
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+            # Flux
+            if Model.FLUX in model_names and work[Model.FLUX] > 0:
+                model_name = Model.FLUX
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Makespan is the max time across all instances
+                    # Linearized: use work_device instead of device_choice * work
+                    if work[model_name] > 1:
+                        # Parallel: each work unit = 1 scene
+                        # Time for w scenes = latency * num_steps_flux * w
+                        m.constraints.add(
+                            m.time[key] == sum(
+                                self.latency_data[gpu_type][model_name, num_devices]
+                                * self.workflow.num_steps[model_name]
+                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                                for num_devices in DEVICE_OPTIONS[model_name]
+                            )
+                        )
+                    else:
+                        # Non-parallel: single work unit covers all scenes
+                        m.constraints.add(
+                            m.time[key] == sum(
+                                self.latency_data[gpu_type][model_name, num_devices]
+                                * self.workflow.num_steps[model_name]
+                                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                                for num_devices in DEVICE_OPTIONS[model_name]
+                            )
+                        )
+                    # TTFF is for 1 work unit
+                    m.constraints.add(
+                        m.ttff[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.latency_data[gpu_type][model_name, num_devices]
+                            * self.workflow.num_steps[model_name]
+                            * 1  # TTFF for first work unit
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+            # Hunyuan FramePack
+            if Model.HF in model_names and work[Model.HF] > 0:
+                model_name = Model.HF
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+
+                    """
+                    from models import HFModelAllocation
+                    HFModelAllocation(
+                        gpu_type,
+                        num_devices,
+                        replicas=1,
+                    )._calc_time_per_subscene(
+                        self.policy,
+                        self.workflow,
+                        self.latency_data[gpu_type]
+                    )
+                    """
+
+                    # Makespan is the max time across all instances
+                    # Linearized: use work_device instead of device_choice * work
+                    hf_time_expr = sum(
+                        self.workflow.per_subscene_frames[model_name]
+                        / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+                        * self.latency_data[gpu_type][model_name, num_devices]
+                        * latency_ratio
+                        * self.workflow.num_steps[model_name]
+                        * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                        for num_devices in DEVICE_OPTIONS[model_name]
+                    )
+                    # When not disaggregated, VAE runs on the same instance
+                    if not self.policy.is_disaggregated(Model.HF):
+                        hf_vae_time_per_work = (
+                            self.latency_data[gpu_type][Model.HF_VAE, 1]
+                            * latency_ratio
+                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+                        )
+                        hf_time_expr += hf_vae_time_per_work * m.work[key]
+                    m.constraints.add(m.time[key] == hf_time_expr)
+                    # TTFF is for first chunk (can be smaller than subscene when disaggregated)
+                    ttff_frames_hf = min(
+                        self.workflow.hf_frames[0],
+                        self.workflow.per_subscene_frames[model_name])
+                    hf_ttff_expr = sum(
+                        m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                        * ttff_frames_hf
+                        / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+                        * self.latency_data[gpu_type][model_name, num_devices]
+                        * latency_ratio
+                        * self.workflow.num_steps[model_name]
+                        * 1  # TTFF for first chunk
+                        for num_devices in DEVICE_OPTIONS[model_name]
+                    )
+                    # When not disaggregated, add VAE decode time for first chunk
+                    if not self.policy.is_disaggregated(Model.HF):
+                        hf_vae_ttff = (
+                            ttff_frames_hf
+                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+                            * self.latency_data[gpu_type][Model.HF_VAE, 1]
+                            * latency_ratio
+                        )
+                        hf_ttff_expr += hf_vae_ttff * m.is_active[key]
+                    m.constraints.add(m.ttff[key] == hf_ttff_expr)
+
+            # Hunyuan FramePack VAE
+            if Model.HF_VAE in model_names and work[Model.HF_VAE] > 0:
+                model_name = Model.HF_VAE
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Makespan is the max time across all instances
+                    # Linearized: use work_device instead of device_choice * work
+                    m.constraints.add(
+                        m.time[key] == sum(
+                            self.latency_data[gpu_type][model_name, num_devices]
+                            * latency_ratio
+                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+                            * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+                    # TTFF is for 1 subscene
+                    m.constraints.add(
+                        m.ttff[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.workflow.per_subscene_frames[Model.HF]
+                            * self.latency_data[gpu_type][model_name, num_devices]
+                            * latency_ratio
+                            / self.workflow.hf_frames[self.workflow.frames_per_step_idx]  # frames_per_step_hf
+                            * 1  # TTFF for first subscene
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+            # Fantasy Talking
+            if Model.FT in model_names and work[Model.FT] > 0:
+                model_name = Model.FT
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Makespan is the max time across all instances
+                    # Linearized: use work_device instead of device_choice * work
+                    ft_time_expr = sum(
+                        self.workflow.per_subscene_frames[model_name]
+                        / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+                        * self.latency_data[gpu_type][model_name, num_devices]
+                        * latency_ratio
+                        * self.workflow.num_steps[model_name]
+                        * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                        for num_devices in DEVICE_OPTIONS[model_name]
+                    )
+                    # When not disaggregated, VAE runs on the same instance
+                    if not self.policy.is_disaggregated(Model.FT):
+                        ft_vae_time_per_work = (
+                            self.latency_data[gpu_type][Model.FT_VAE, 1]
+                            * latency_ratio
+                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+                        )
+                        ft_time_expr += ft_vae_time_per_work * m.work[key]
+                    m.constraints.add(m.time[key] == ft_time_expr)
+                    # TTFF is for 1 work unit (e.g., subscene)
+                    ft_ttff_expr = sum(
+                        m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                        * self.workflow.per_subscene_frames[model_name]
+                        / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+                        * self.latency_data[gpu_type][model_name, num_devices]
+                        * latency_ratio
+                        * self.workflow.num_steps[model_name]
+                        * 1  # TTFF for first subscene
+                        for num_devices in DEVICE_OPTIONS[model_name]
+                    )
+                    # When not disaggregated, add VAE decode time for first subscene
+                    if not self.policy.is_disaggregated(Model.FT):
+                        ft_vae_ttff = (
+                            self.workflow.per_subscene_frames[Model.FT]
+                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+                            * self.latency_data[gpu_type][Model.FT_VAE, 1]
+                            * latency_ratio
+                        )
+                        ft_ttff_expr += ft_vae_ttff * m.is_active[key]
+                    m.constraints.add(m.ttff[key] == ft_ttff_expr)
+
+            # Fantasy Talking VAE
+            if Model.FT_VAE in model_names and work[Model.FT_VAE] > 0:
+                model_name = Model.FT_VAE
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Makespan is the max time across all instances
+                    # Linearized: use work_device instead of device_choice * work
+                    m.constraints.add(
+                        m.time[key] == sum(
+                            self.latency_data[gpu_type][model_name, num_devices]
+                            * latency_ratio
+                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+                            * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+                    # TTFF is for 1 subscene
+                    m.constraints.add(
+                        m.ttff[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.workflow.per_subscene_frames[Model.FT]
+                            * self.latency_data[gpu_type][model_name, num_devices]
+                            * latency_ratio
+                            / self.workflow.ft_frames[self.workflow.frames_per_step_idx]  # frames_per_step_ft
+                            * 1  # TTFF for first subscene
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+            # Upscaler
+            if Model.UPSCALER in model_names and work[Model.UPSCALER] > 0 and self.policy.use_upscaler:
+                model_name = Model.UPSCALER
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Linearized: use work_device instead of device_choice * work
+                    m.constraints.add(
+                        m.time[key] == sum(
+                            self.latency_data[gpu_type][model_name, num_devices]
+                            * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+                    # TTFF is for 1 work unit (e.g., subscene)
+                    m.constraints.add(
+                        m.ttff[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.latency_data[gpu_type][model_name, num_devices]
+                            * self.workflow.per_subscene_frames[Model.FT]
+                            * 1  # TTFF is for first subscene
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+            # Others
+            if Model.OTHERS in model_names and work[Model.OTHERS] > 0:
+                model_name = Model.OTHERS
+                for instance_id in instance_ids:
+                    key = idx(gpu_type, model_name, instance_id)
+                    # Makespan is the max time across all instances
+                    m.constraints.add(
+                        m.time[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.latency_data[gpu_type][model_name, num_devices]
+                            * self.workflow.total_scenes
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+                    # TTFF is for 1 work unit
+                    m.constraints.add(
+                        m.ttff[key] == sum(
+                            m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                            * self.latency_data[gpu_type][model_name, num_devices]
+                            * 1  # TTFF is for first scene
+                            for num_devices in DEVICE_OPTIONS[model_name]
+                        )
+                    )
+
+        # Total work to do for each model
+        for model_name in model_names:
+            m.constraints.add(
+                sum(
+                    m.work[idx(gpu_type, model_name, instance_id)]
+                    for gpu_type in gpu_types
+                    for instance_id in instance_ids
+                ) == work[model_name]
+            )
+
+        # Number of GPUs per type
+        # Add a variable to represent the number of servers for each GPU type
+        m.num_servers = Var(m.GPU_TYPES, domain=NonNegativeIntegers)
+
+        for gpu_type in gpu_types:
+            total_gpus = sum(
+                m.gpus[idx(gpu_type, model_name, instance_id)]
+                for model_name in model_names
+                for instance_id in instance_ids
+            )
+            if force_num_gpus:
+                m.constraints.add(total_gpus == num_gpus[gpu_type])
+            else:
+                m.constraints.add(total_gpus <= num_gpus[gpu_type])
+
+            # GPUs used must be a multiple of NUM_GPUS_PER_SERVER
+            if not skip_server_constraint:
+                m.constraints.add(total_gpus == m.num_servers[gpu_type.value] * NUM_GPUS_PER_SERVER[gpu_type])
+
+        # Cost calculation
+        # running_cost=True: cost based only on active model running time
+        if running_cost:
+            cost_expr = sum(
+                self._get_latency_per_work(
+                    gpu_type,
+                    model_name,
+                    num_devices,
+                )
+                * num_devices
+                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR
+                for gpu_type in gpu_types
+                for model_name in model_names
+                for instance_id in instance_ids
+                for num_devices in DEVICE_OPTIONS[model_name]
+            )
+        # running_cost=False: cost = makespan × total_GPUs_used (GPUs allocated for full job duration)
+        else:
+            cost_expr = m.makespan * sum(
+                m.gpus[idx(gpu_type, model_name, instance_id)]
+                * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR
+                for gpu_type in gpu_types
+                for model_name in model_names
+                for instance_id in instance_ids
+            )
+        m.constraints.add(m.cost == cost_expr)
+
+        # Energy: model-specific power * active time + idle power * (makespan - active time)
+        if self.power_data is None:
+            energy_expr = 0.0
+        else:
+            # Active energy: Use model-specific power values (not TDP)
+            energy_expr = sum(
+                self._get_latency_per_work(
+                    gpu_type,
+                    model_name,
+                    num_devices,
+                )
+                * num_devices
+                * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)]
+                * (
+                    self._get_power_per_work(
+                        gpu_type,
+                        model_name,
+                        num_devices,
+                    ) - self.power_data[gpu_type]["idle"]
+                )
+                for gpu_type in gpu_types
+                for model_name in model_names
+                for instance_id in instance_ids
+                for num_devices in DEVICE_OPTIONS[model_name]
+            )
+            # Idle energy: idle power * num_gpus * makespan
+            energy_expr += sum(
+                self.power_data[gpu_type]["idle"] * num_gpus[gpu_type] * m.makespan
+                for gpu_type in gpu_types
+            )
+        m.constraints.add(m.energy == energy_expr)
+
+        # Bounds
+        if max_cost is not None:
+            m.constraints.add(m.cost <= max_cost)
+        if max_ttff is not None:
+            m.constraints.add(m.ttff_user <= max_ttff)
+        if max_makespan is not None:
+            m.constraints.add(m.makespan <= max_makespan)
+
+        # Objective functions
+        obj = get_objective(
+            m=m,
+            allocator=self.policy.objective,
+            solver_name=solver_name,
+        )
+        if obj is not None:
+            m.objective = obj
+
+        # Solve
+        solver = SolverFactory(solver_name)
+        if solver_name == "gurobi" and time_limit:
+            solver.options["TimeLimit"] = time_limit
+        if solver_name == "highs" and time_limit:
+            solver.options["time_limit"] = time_limit
+        if self.policy.objective in QUADRATIC_OBJECTIVES and solver_name == "gurobi":
+            solver.options['NonConvex'] = 2  # Option for bilinear objectives
+        if solver_name == "highs":
+            solver.options["time_limit"] = 50  # seconds
+
+        if warm_start_path is not None:
+            _load_warm_start(m, warm_start_path)
+
+        if solver_name == "gurobi":
+            opt_result = solver.solve(
+                m,
+                tee=verbose,
+                warmstart=warm_start_path is not None,
+            )
+        else:
+            opt_result = solver.solve(m, tee=verbose)
+
+        if opt_result.solver.status != "ok":
+            logging.error(f"Solver failed with status: {opt_result.solver.status}")
+
+        if save_solution_path is not None:
+            _save_solution(m, save_solution_path)
+
+        models = milp_to_models_dict(
+            m=m,
+            gpu_types=gpu_types,
+            model_names=model_names,
+            instance_ids=instance_ids,
+            idx=idx,
+            workflow=self.workflow,
+            power_data=self.power_data,
+            policy=self.policy,
+        )
+
+        if not self._is_valid_result(m):
+            return Result()
+
+        tbf_s = 0.0
+        if m.makespan.value and self.workflow.num_frames > 0:
+            tbf_s = m.makespan.value / self.workflow.num_frames
+        return Result(
+            models=models,
+            gpus_used=self._get_num_gpus(m, gpu_types, model_names, instance_ids),
+            total_time_s=m.makespan.value,
+            ttff_s=m.ttff_user.value,
+            tbf_s=tbf_s,
+            cost=m.cost.value,
+            total_energy=m.energy.value,
+        )
+
+    def _is_valid_result(self, m: ConcreteModel) -> bool:
+        """Return True only when every `m.gpus` entry holds a solver value."""
+        return not any(
+            m.gpus[gpu_type, model_name, instance_id].value is None
+            for gpu_type in m.GPU_TYPES
+            for model_name in m.MODEL_NAMES for instance_id in m.INSTANCES
+        )
+
+    def _get_num_gpus(
+        self,
+        m: ConcreteModel,
+        gpu_types: list[GPUType],
+        model_names: list[Model],
+        instance_ids: list[int],
+    ) -> dict[GPUType, int]:
+        """Sum the GPUs assigned to each GPU type; returns {} for an unsolved model."""
+        if not self._is_valid_result(m):
+            return {}
+        totals: dict[GPUType, int] = {}
+        for gpu_type in gpu_types:
+            values = [
+                m.gpus[idx(gpu_type, model_name, instance_id)].value
+                for model_name in model_names for instance_id in instance_ids
+            ]
+            # round() snaps solver float to nearest int (e.g. 1.9999 -> 2)
+            totals[gpu_type] = sum(int(round(value)) for value in values if value is not None)
+        return totals
+
+    def _get_latency_per_work(
+        self,
+        gpu_type: GPUType,
+        model_name: Model,
+        num_devices: int,
+    ) -> float:
+        """
+        Latency of one unit of work for `model_name` on `num_devices` devices of `gpu_type`.
+        The running-cost and active-energy expressions multiply it by num_devices and the
+        assigned work, which linearizes the cost (replacing the bilinear makespan * GPUs).
+        """
+        # Resolution scaling factor applied to the HF, FT and VAE latencies below
+        latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler)
+
+        if model_name == Model.GEMMA:  # first scene plus (total_scenes - 1) later scenes
+            return (
+                self.latency_data[gpu_type].gemma_first_scene[num_devices]
+                + self.latency_data[gpu_type].gemma_per_scene[num_devices] * (self.workflow.total_scenes - 1)
+            )
+
+        if model_name == Model.FLUX:
+            return (
+                self.latency_data[gpu_type][model_name, num_devices]
+                * self.workflow.num_steps[Model.FLUX]
+            )
+
+        if model_name == Model.HF:
+            time_per_work = (
+                self.workflow.per_subscene_frames[Model.HF]
+                / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+                * self.latency_data[gpu_type][model_name, num_devices]
+                * latency_ratio
+                * self.workflow.num_steps[Model.HF]
+            )
+            if not self.policy.is_disaggregated(Model.HF):  # aggregated: add the HF VAE latency
+                time_per_work += self._get_latency_per_work(
+                    gpu_type,
+                    Model.HF_VAE,
+                    1,  # VAE is single-device only in current policy
+                )
+            return time_per_work
+
+        if model_name == Model.HF_VAE:
+            return (
+                self.latency_data[gpu_type][model_name, num_devices]
+                * latency_ratio
+                / self.workflow.hf_frames[self.workflow.frames_per_step_idx]
+            )
+
+        if model_name == Model.FT:
+            time_per_work = (
+                self.workflow.per_subscene_frames[Model.FT]
+                / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+                * self.latency_data[gpu_type][model_name, num_devices]
+                * latency_ratio
+                * self.workflow.num_steps[Model.FT]
+            )
+            if not self.policy.is_disaggregated(Model.FT):  # aggregated: add the FT VAE latency
+                time_per_work += self._get_latency_per_work(
+                    gpu_type,
+                    Model.FT_VAE,
+                    1,  # VAE is single-device only in current policy
+                )
+            return time_per_work
+
+        if model_name == Model.FT_VAE:
+            return (
+                self.latency_data[gpu_type][model_name, num_devices]
+                * latency_ratio
+                / self.workflow.ft_frames[self.workflow.frames_per_step_idx]
+            )
+
+        if model_name == Model.UPSCALER:
+            return self.latency_data[gpu_type][model_name, num_devices]
+
+        if model_name == Model.OTHERS:
+            return self.latency_data[gpu_type][model_name, num_devices] * self.workflow.total_scenes
+
+        raise ValueError(f"Unknown model_name {model_name}")  # no branch above matched
+
+    def _get_power_per_work(
+        self,
+        gpu_type: GPUType,
+        model_name: Model,
+        num_devices: int,
+    ) -> float:
+        """
+        Time-weighted average power drawn while `model_name` handles one unit of work.
+
+        Meant to be combined with the latency helper in the energy expression:
+        energy = _get_latency_per_work(...) * _get_power_per_work(...) * num_devices * work
+
+        Args:
+            gpu_type: GPU type whose entry in `power_data` is consulted.
+            model_name: Model whose power draw is requested.
+            num_devices: Device count used to index the power (and latency) tables.
+
+        Returns:
+            Average power (documented as watts), or 0.0 when `power_data` is None.
+
+        Raises:
+            ValueError: If `model_name` is not one of the models handled below.
+        """
+        if self.power_data is None:
+            return 0.0
+
+        if model_name == Model.OTHERS:
+            # OTHERS model uses minimal GPU power (mostly idle)
+            # See models.py OthersModelAllocation.calculate_energy - only uses idle power
+            return self.power_data[gpu_type]["idle"]
+
+        single_value_models = (
+            Model.FLUX, Model.HF, Model.HF_VAE, Model.FT, Model.FT_VAE, Model.UPSCALER,
+        )
+        if model_name in single_value_models:
+            # One power figure per (model, device count); no phases to average
+            return self.power_data[gpu_type][model_name, num_devices]
+
+        if model_name != Model.GEMMA:
+            raise ValueError(f"Unknown model_name {model_name}")
+
+        # Gemma's power differs between the first scene and every later scene, so
+        # compute the energy of both phases and divide by the total time.
+        gpu_power = self.power_data[gpu_type]
+        gpu_latency = self.latency_data[gpu_type]
+        first_power = gpu_power.gemma_first_scene[num_devices]
+        scene_power = gpu_power.gemma_per_scene[num_devices]
+        first_time = gpu_latency.gemma_first_scene[num_devices]
+        scene_time = gpu_latency.gemma_per_scene[num_devices]
+        later_scenes = self.workflow.total_scenes - 1
+
+        energy = first_power * first_time + scene_power * scene_time * later_scenes
+        duration = first_time + scene_time * later_scenes
+        # With zero total time there is nothing to average; use the first-scene power
+        return energy / duration if duration > 0 else first_power
+
+
+def milp_to_models_dict(
+    m: ConcreteModel,
+    gpu_types: list[GPUType],
+    model_names: list[Model],
+    instance_ids: list[int],
+    idx: Callable[[GPUType, Model, int], tuple[str, str, int]],
+    workflow: WorkflowConfig,
+    power_data: Optional[PowerData],
+    policy: Policy,
+) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+    """
+    Convert a solved MILP model into per-GPU-type, per-model allocation lists.
+
+    Every (gpu_type, model_name) pair gets a list, possibly empty. An instance is
+    kept only when its rounded GPU count and rounded work are both positive, and
+    becomes a single-replica allocation with energy and cost already calculated.
+    Returns {} when `m` is None.
+    """
+    if m is None:
+        return {}
+
+    allocations: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}
+    for gpu_type in gpu_types:
+        per_model: dict[Model, list[ModelAllocation]] = {}
+        allocations[gpu_type] = per_model
+        for model_name in model_names:
+            instances: list[ModelAllocation] = []
+            per_model[model_name] = instances
+            for instance_id in instance_ids:
+                key = idx(gpu_type, model_name, instance_id)
+                raw_gpus, raw_work = m.gpus[key].value, m.work[key].value
+                if raw_gpus is None or raw_work is None:
+                    continue
+                # round() snaps solver floats to nearest int (e.g. 1.9999 -> 2);
+                # banker's rounding is irrelevant here since MILP values can be
+                # near-integer, like 1.999 and 2.001
+                gpus, work = int(round(raw_gpus)), int(round(raw_work))
+                if gpus <= 0 or work <= 0:
+                    continue
+                allocation = get_model_allocation(
+                    model=model_name,
+                    gpu_type=gpu_type,
+                    devices=gpus,
+                    replicas=1,
+                )
+                allocation.work = work
+                allocation.time = m.time[key].value
+                allocation.time_first = m.ttff[key].value
+                allocation.calculate_energy(
+                    workflow=workflow, power_data=power_data, total_time_s=m.makespan.value
+                )
+                allocation.calculate_cost(policy, total_time_s=m.makespan.value)
+                instances.append(allocation)
+    return allocations
+
+
+def get_objective(
+    m: ConcreteModel,
+    allocator: Objective,
+    solver_name: str,
+) -> Optional[OptObjective]:
+    """
+    Build the minimization objective that matches the requested `Objective`.
+
+    Args:
+        m: Model exposing the `makespan`, `ttff_user`, `cost` and `energy` components.
+        allocator: Which quantity (or pair of quantities) to minimize.
+        solver_name: Pair objectives use a product only when this is "gurobi".
+
+    Returns:
+        None for RANDOM and NONE (any feasible solution will do), otherwise an
+        objective to minimize. FIFO logs an error and, like any value not listed
+        here, falls back to minimizing the makespan.
+    """
+    def minimized(expr) -> OptObjective:
+        return OptObjective(expr=expr, sense=minimize)
+
+    def paired(first, second, fallback_warning: str) -> OptObjective:
+        # The product is bilinear (nonconvex) and is only used with Gurobi; other
+        # solvers get an equally weighted sum as a linear utility function.
+        if solver_name == "gurobi":
+            return minimized(first * second)
+        logging.warning(fallback_warning)
+        weight_first, weight_second = 1.0, 1.0
+        return minimized(weight_first * first + weight_second * second)
+
+    if allocator in (Objective.RANDOM, Objective.NONE):
+        return None  # No objective, just find a feasible solution
+
+    if allocator == Objective.TTFF:
+        return minimized(m.ttff_user)
+    if allocator == Objective.COST:
+        return minimized(m.cost)
+    if allocator == Objective.ENERGY:
+        return minimized(m.energy)
+
+    # Pair objectives: product on Gurobi, weighted sum on every other solver
+    if allocator == Objective.TTFF_COST:
+        return paired(m.ttff_user, m.cost, "TTFF_COST using linear utility function.")
+    if allocator == Objective.TIME_ENERGY:
+        return paired(m.makespan, m.energy, "TIME_ENERGY using linear utility function.")
+    if allocator == Objective.ENERGY_COST:
+        return paired(m.energy, m.cost, "ENERGY_COST using linear utility function.")
+
+    if allocator == Objective.FIFO:
+        logging.error("FIFO not implemented in MILP")
+
+    # Objective.TIME, FIFO and anything unrecognized minimize the makespan
+    return minimized(m.makespan)
+
+
+def _save_solution(
+    m: ConcreteModel,
+    save_solution_path: str,
+) -> None:
+    """Dump every active variable that has a value to a JSON file as {name: value}."""
+    values_by_name = {}
+    for variable in m.component_data_objects(Var, active=True):
+        if variable.value is not None:
+            values_by_name[variable.name] = variable.value
+    with open(save_solution_path, "w", encoding="utf-8") as solution_file:
+        json.dump(values_by_name, solution_file, indent=2)
+
+
+def _load_warm_start(
+    m: ConcreteModel,
+    warm_start_path: str,
+) -> None:
+    """Seed model variables with the values stored in a JSON warm-start file."""
+    with open(warm_start_path, "r", encoding="utf-8") as warm_start_file:
+        saved_values = json.load(warm_start_file)
+
+    warm_start_applied = 0
+    for variable in m.component_data_objects(Var, active=True):
+        if variable.name not in saved_values:
+            continue  # variables absent from the file keep their current value
+        variable.set_value(saved_values[variable.name])
+        warm_start_applied += 1
+    logging.info(
+        f"Warm start loaded from {warm_start_path}. "
+        f"Applied values to {warm_start_applied} variables."
+    )
diff --git a/streamwise/model_provisioner/naive_baseline.py b/streamwise/model_provisioner/naive_baseline.py
new file mode 100644
index 00000000..ec95904e
--- /dev/null
+++ b/streamwise/model_provisioner/naive_baseline.py
@@ -0,0 +1,484 @@
+"""
+Naive baseline for the StreamWise workflow allocation problem.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from constants import NUM_GPUS_PER_SERVER
+from constants import DEVICE_OPTIONS
+
+from sim_types import Result
+from sim_types import GPUType
+from sim_types import WorkflowConfig
+from sim_types import LatencyData
+from sim_types import PowerData
+from sim_types import Policy
+from sim_types import Solver
+from sim_types import Model
+from sim_types import ModelAllocation
+from sim_types import Objective
+
+from models import FluxModelAllocation
+from models import GemmaModelAllocation
+from models import HFModelAllocation
+from models import HFVAEModelAllocation
+from models import FTModelAllocation
+from models import FTVAEModelAllocation
+from models import UpscalerModelAllocation
+from models import OthersModelAllocation
+
+from evaluator import evaluate_model_allocation
+
+from .policies import NAIVE_POLICY
+from .policies import MAX_DEVICES
+
+from model_allocator import ModelAllocator
+
+
+class NaiveAllocator(ModelAllocator):
+    """
+    Naive allocator that implements a simple heuristic.
+    """
+    def __init__(
+        self,
+        workflow: WorkflowConfig,
+        latency_data: LatencyData,
+        power_data: Optional[PowerData] = None,
+        policy: Policy = NAIVE_POLICY,
+    ) -> None:
+        """
+        Set up the allocator through the `ModelAllocator` base initializer.
+
+        Only policies with the NAIVE solver and the TTFF objective are accepted.
+        """
+        super().__init__(workflow, latency_data, power_data, policy)
+        assert self.policy.solver == Solver.NAIVE
+        assert self.policy.objective == Objective.TTFF
+
+    def allocate(
+        self,
+        num_gpus: dict[GPUType, int],
+        verbose: bool = False,
+    ) -> Result:
+        """
+        Build the naive allocation for `num_gpus` and return its evaluation.
+
+        At least 8 GPUs in total and one or two GPU types with a non-zero count are
+        required; zero-count entries are ignored. `verbose` is currently unused.
+        """
+        total_gpus = sum(num_gpus.values())
+        assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})"
+
+        # Keep only the GPU types that are present, so the mixed path below never
+        # sees zero-count entries (it insists on exactly two GPU types).
+        active_gpus = {gpu_type: count for gpu_type, count in num_gpus.items() if count > 0}
+        assert 1 <= len(active_gpus) <= 2, f"Only up to two GPU types are supported ({len(active_gpus)})"
+
+        if len(active_gpus) == 1:
+            gpu_type1 = next(iter(active_gpus))
+            models = self._naive_single(active_gpus[gpu_type1], gpu_type=gpu_type1)
+        else:
+            # Mixed setup of GPU types (e.g., A100 and H100)
+            models = self._naive_two(active_gpus)
+
+        return evaluate_model_allocation(
+            models=models,
+            num_gpus=num_gpus,
+            workflow=self.workflow,
+            latency_data=self.latency_data,
+            power_data=self.power_data,
+            policy=self.policy,
+            round_up_cost_to_server=True,
+        )
+
+    def _naive_single(
+        self,
+        num_gpus: int,
+        gpu_type: GPUType,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """Allocate all `num_gpus` devices of one GPU type via naive parallelism."""
+        return self._naive_parallelism_allocation(gpu_type=gpu_type, num_devices=num_gpus)
+
+    def _naive_two(
+        self,
+        num_gpus: dict[GPUType, int],
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """Naive allocation for exactly two GPU types, each with at least NUM_GPUS_PER_SERVER GPUs."""
+        gpu_types = list(num_gpus.keys())
+        assert len(gpu_types) == 2
+        assert len(num_gpus) == 2
+        gpu_type1 = gpu_types[0]
+        gpu_type2 = gpu_types[1]
+        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
+        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]
+
+        # NOTE(review): this hand-built layout is overwritten below (both keys are reassigned)
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
+            gpu_type1: {  # 3 x A100s (type1)
+                Model.GEMMA: [GemmaModelAllocation(
+                    gpu_type=gpu_type1,
+                    devices=1, replicas=1)],
+                Model.FLUX: [FluxModelAllocation(
+                    gpu_type=gpu_type1,
+                    devices=1, replicas=1)],
+                Model.HF: [],
+                Model.HF_VAE: [],
+                Model.FT: [],
+                Model.FT_VAE: [],
+                Model.UPSCALER: [],
+                Model.OTHERS: [OthersModelAllocation(
+                    gpu_type=gpu_type1,
+                    devices=1, replicas=1)],  # + 1 for Kokoro/YOLO
+            },
+            gpu_type2: {  # 4 (+1) X H100 GPUs (type2)
+                Model.GEMMA: [],
+                Model.FLUX: [],
+                Model.HF: [HFModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=1, replicas=1)],
+                Model.HF_VAE: [HFVAEModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=1, replicas=1)],
+                Model.FT: [FTModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=2, replicas=1)],
+                Model.FT_VAE: [FTVAEModelAllocation(
+                    gpu_type=gpu_type2,
+                    devices=1, replicas=1)],
+                Model.UPSCALER: [UpscalerModelAllocation(
+                    gpu_type=gpu_type2)],
+                Model.OTHERS: [],
+            },
+        }
+
+        # Without disaggregation the VAE allocations get zero replicas (HF gets a second replica)
+        if not self.policy.is_disaggregated(Model.HF):
+            models[gpu_type2][Model.HF][0].replicas = 2
+            models[gpu_type2][Model.HF_VAE][0].replicas = 0
+        if not self.policy.is_disaggregated(Model.FT):
+            models[gpu_type2][Model.FT_VAE][0].replicas = 0
+
+        if self.policy.use_upscaler:
+            models[gpu_type2][Model.UPSCALER][0].replicas = 1
+
+        models_gpu_type1 = self._naive_parallelism_allocation(
+            gpu_type1,
+            num_gpus.get(gpu_type1, 0),
+        )
+        models_gpu_type2 = self._naive_parallelism_allocation(
+            gpu_type2,
+            num_gpus.get(gpu_type2, 0),
+            # Already allocated in first GPU type
+            skip_non_paralelizable_models=True,
+        )
+        models[gpu_type1] = models_gpu_type1[gpu_type1]  # replaces the hand-built entry above
+        models[gpu_type2] = models_gpu_type2[gpu_type2]  # replaces the hand-built entry above
+
+        # Re-apply the upscaler replica on the second GPU type after the reassignment above
+        if self.policy.use_upscaler:
+            models[gpu_type2][Model.UPSCALER][0].replicas = 1
+
+        return models
+
+    def _naive_parallelism_allocation(
+        self,
+        gpu_type: GPUType,
+        num_devices: int,
+        skip_non_paralelizable_models: bool = False,
+    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
+        """
+        Device allocation for naive parallelism.
+        Max devices for each model.
+        Allocate devices to each model proportional to their max devices.
+        """
+        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
+            gpu_type: {
+                Model.GEMMA: [GemmaModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1)],
+                Model.FLUX: [FluxModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1)],
+                Model.HF: [HFModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1)],
+                Model.HF_VAE: [HFVAEModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1 if self.policy.is_disaggregated(Model.HF) else 0)],
+                Model.FT: [FTModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=4)],
+                Model.FT_VAE: [FTVAEModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1 if self.policy.is_disaggregated(Model.FT) else 0)],
+                Model.OTHERS: [OthersModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1)],  # + 1 for Kokoro/YOLO
+                Model.UPSCALER: [UpscalerModelAllocation(
+                    gpu_type=gpu_type,
+                    replicas=1 if self.policy.use_upscaler else 0)],
+            },
+        }
+
+        # Zero out replicas for models not in workflow
+        for model in Model:
+            if model not in self.workflow.models:
+                for alloc in models[gpu_type][model]:
+                    alloc.replicas = 0
+
+        # Zero out replicas for models that are not parallelizable when skip_non_paralelizable_models is True
+        if skip_non_paralelizable_models:
+            for model in Model:
+                if not self.workflow.is_parallelizable(model):
+                    for alloc in models[gpu_type][model]:
+                        alloc.replicas = 0
+
+        # Assert only 1 allocation instance per model for naive parallelism
+        for model in Model:
+            assert len(models[gpu_type][model]) == 1, \
+                f"Expected only 1 allocation instance for {model}, got {len(models[gpu_type][model])}"
+
+        alloc_id = 0
+        model_gemma = models[gpu_type][Model.GEMMA][alloc_id]
+        model_flux = models[gpu_type][Model.FLUX][alloc_id]
+        model_hf = models[gpu_type][Model.HF][alloc_id]
+        model_vae = models[gpu_type][Model.HF_VAE][alloc_id]
+        model_ft = models[gpu_type][Model.FT][alloc_id]
+        model_ft_vae = models[gpu_type][Model.FT_VAE][alloc_id]
+        model_upscaler = models[gpu_type][Model.UPSCALER][alloc_id]
+
+        # TODO do we need to do something for Model.OTHERS
+
+        if num_devices == 8:
+            # single server case, use fixed allocation
+            if Model.FT in self.workflow.models:
+                model_ft.replicas = 4
+            if self.policy.use_upscaler and Model.UPSCALER in self.workflow.models:
+                model_upscaler.replicas = 1
+                if Model.FT in self.workflow.models:
+                    model_ft.replicas -= 1
+            if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models:
+                model_vae.replicas = 1
+                if Model.FT in self.workflow.models:
+                    model_ft.replicas -= 1
+            if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models:
+                model_ft_vae.replicas = 1
+                if Model.FT in self.workflow.models:
+                    model_ft.replicas -= 1
+            return models
+
+        init_num_devices = sum([
+            model[0].devices * model[0].replicas
+            for model in models[gpu_type].values()
+        ])
+
+        # Allocate devices proportional to each model's max devices
+        max_devices = MAX_DEVICES
+        models_in_workflow = [
+            model
+            for model in max_devices.keys()
+            if model in self.workflow.models
+        ]
+        if skip_non_paralelizable_models:
+            for model in max_devices.keys():
+                if not self.workflow.is_parallelizable(model):
+                    models_in_workflow.remove(model)
+
+        total_max_devices = sum([
+            max_devices[model]
+            for model in models_in_workflow
+        ])
+        for model in models_in_workflow:
+            # Calculate the number of devices to allocate for the model, proportional to its max devices among others
+            alloc_devices = int((num_devices - init_num_devices) * max_devices[model] / total_max_devices)
+            if model == Model.GEMMA:
+                max_devices_gemma = max_devices[Model.GEMMA]
+                if self.latency_data:
+                    max_devices_gemma = min(max_devices_gemma, self.latency_data[gpu_type].get_max_parallelism(model))
+                model_gemma.devices += min(alloc_devices, max_devices_gemma)
+                # Round down nearest in DEVICE_OPTIONS_GEMMA
+                num_gemma_devices = max([
+                    d
+                    for d in DEVICE_OPTIONS[Model.GEMMA]
+                    if d <= model_gemma.devices
+                ])
+                model_gemma.devices = num_gemma_devices
+            elif model == Model.FLUX:
+                max_devices_flux = max_devices[Model.FLUX]
+                if self.latency_data:
+                    max_devices_flux = min(max_devices_flux, self.latency_data[gpu_type].get_max_parallelism(model))
+                model_flux.devices += min(alloc_devices, max_devices_flux)
+                # Round down nearest in DEVICE_OPTIONS_FLUX
+                model_flux.devices = max([
+                    d
+                    for d in DEVICE_OPTIONS[Model.FLUX]
+                    if d <= model_flux.devices
+                ])
+            elif model == Model.HF:
+                max_devices_hf = max_devices[Model.HF]
+                if self.latency_data:
+                    max_devices_hf = min(max_devices_hf, self.latency_data[gpu_type].get_max_parallelism(model))
+                model_hf.replicas += min(alloc_devices, max_devices_hf)
+            elif model == Model.HF_VAE:
+                if self.policy.is_disaggregated(Model.HF):
+                    max_devices_vae = max_devices[Model.HF_VAE]
+                    if self.latency_data:
+                        max_devices_vae = min(max_devices_vae, self.latency_data[gpu_type].get_max_parallelism(model))
+                    model_vae.replicas += min(alloc_devices, max_devices_vae)
+            elif model == Model.FT:
+                max_devices_ft = max_devices[Model.FT]
+                if self.latency_data:
+                    max_devices_ft = min(max_devices_ft, self.latency_data[gpu_type].get_max_parallelism(model))
+                model_ft.replicas += min(alloc_devices, max_devices_ft)
+            elif model == Model.FT_VAE:
+                if self.policy.is_disaggregated(Model.FT):
+                    max_devices_ft_vae = max_devices[Model.FT_VAE]
+                    if self.latency_data:
+                        max_devices_ft_vae = min(
+                            max_devices_ft_vae, self.latency_data[gpu_type].get_max_parallelism(model)
+                        )
+                    model_ft_vae.replicas += min(alloc_devices, max_devices_ft_vae)
+            else:
+                raise ValueError(f"Unrecognized model {model}")
+
+        remaining_devices = num_devices
+        for model_name in models[gpu_type].keys():
+            for model_alloc in models[gpu_type][model_name]:
+                remaining_devices -= model_alloc.get_num_gpus()
+
+        # Distribute remaining devices to parallelizable models
+        distribute_models = self.workflow.filter_parallelizable_models(
+            models_in_workflow,
+            disaggregation=self.policy.disaggregation,
+        )
+        # Prioritise models that already hold more GPUs
+        distribute_models.sort(
+            key=lambda m: models[gpu_type][m][alloc_id].get_num_gpus(),
+            reverse=True,
+        )
+        num_distribute = len(distribute_models)
+        if num_distribute > 0 and remaining_devices > 0:
+            made_progress = True
+            while remaining_devices > 0 and made_progress:
+                made_progress = False
+                for model_name in distribute_models:
+                    gpus_per_replica = models[gpu_type][model_name][alloc_id].devices
+                    if gpus_per_replica <= 0 or remaining_devices < gpus_per_replica:
+                        continue
+                    models[gpu_type][model_name][alloc_id].replicas += 1
+                    remaining_devices -= gpus_per_replica
+                    made_progress = True
+                    if remaining_devices <= 0:
+                        break
+
+        remaining_devices = num_devices
+        for model_name in models[gpu_type].keys():
+            for model_alloc in models[gpu_type][model_name]:
+                remaining_devices -= model_alloc.get_num_gpus()
+
+        # TODO we should try to assign all resources
+        # assert remaining_devices == 0, \
+        assert remaining_devices >= 0, \
+            f"remaining={remaining_devices} != 0: " \
+            f"gpu={gpu_type.value} total={num_devices} remaining={remaining_devices}"
+
+        # Update replicas based on total devices
+        # Gemma (when parallelizable)
+        if self.workflow.is_parallelizable(Model.GEMMA) and Model.GEMMA in models_in_workflow:
+            model_gemma.devices, model_gemma.replicas, remaining_devices = _calculate_naive_num_devices(
+                model_gemma.devices,
+                model_gemma.replicas,
+                remaining_devices,
+                device_options=DEVICE_OPTIONS[Model.GEMMA],
+                replica_upper_bound=self.workflow.total_scenes)
+
+        # Flux (when parallelizable)
+        if self.workflow.is_parallelizable(Model.FLUX) and Model.FLUX in models_in_workflow:
+            model_flux.devices, model_flux.replicas, remaining_devices = _calculate_naive_num_devices(
+                model_flux.devices,
+                model_flux.replicas,
+                remaining_devices,
+                device_options=DEVICE_OPTIONS[Model.FLUX],
+                replica_upper_bound=self.workflow.total_scenes)
+
+        # Hunyuan FramePack
+        if Model.HF in self.workflow.models:
+            model_hf.devices, model_hf.replicas, remaining_devices = _calculate_naive_num_devices(
+                model_hf.devices,
+                model_hf.replicas,
+                remaining_devices,
+                device_options=DEVICE_OPTIONS[Model.HF],
+                replica_upper_bound=self.workflow.total_scenes)
+
+        # Hunyuan FramePack VAE
+        if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models:
+            model_vae.devices, model_vae.replicas, remaining_devices = _calculate_naive_num_devices(
+                model_vae.devices,
+                model_vae.replicas,
+                remaining_devices,
+                device_options=None,
+                replica_upper_bound=self.workflow.total_frames[Model.HF],
+            )
+
+        # Fantasy Talking
+        if Model.FT in self.workflow.models:
+            model_ft.devices, model_ft.replicas, remaining_devices = _calculate_naive_num_devices(
+                model_ft.devices,
+                model_ft.replicas,
+                remaining_devices,
+                device_options=DEVICE_OPTIONS[Model.FT],
+                replica_upper_bound=self.workflow.total_subscenes,
+            )
+
+        # Fantasy Talking VAE
+        if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models:
+            model_ft_vae.devices, model_ft_vae.replicas, remaining_devices = _calculate_naive_num_devices(
+                model_ft_vae.devices,
+                model_ft_vae.replicas,
+                remaining_devices,
+                device_options=None,
+                replica_upper_bound=self.workflow.total_frames[Model.FT],
+            )
+
+        return models
+
+
+def _calculate_naive_num_devices(
+    num_devices: int,
+    num_replicas: int,
+    remaining_devices: int,
+    device_options: Optional[list[int]] = [1],
+    replica_upper_bound: Optional[int] = None,
+) -> tuple[int, int, int]:
+    """Re-split a model's GPU quota into devices per replica and replicas.
+
+    The quota is ``num_devices * num_replicas``. Among the ``device_options``
+    that fit in the quota, the first one covering the most GPUs is chosen, with
+    replicas capped at ``replica_upper_bound`` when that is truthy. If no option
+    fits, the result is one replica on one device. When ``device_options`` is
+    empty or None, the quota becomes single-device replicas (bound not applied).
+
+    Returns the new devices per replica, the new replica count, and
+    ``remaining_devices`` plus the quota minus the GPUs the new layout uses.
+    """
+    assert remaining_devices >= 0
+    quota = num_devices * num_replicas
+    if not device_options:
+        # No parallelism choices: one GPU per replica uses the whole quota.
+        return 1, quota, remaining_devices
+    # (covered GPUs, devices per replica, replicas); the fallback covers none.
+    best = (0, 1, 1)
+    for option in device_options:
+        if option > quota:
+            continue  # a single replica would not fit in the quota
+        replicas = quota // option
+        if replica_upper_bound and replicas > replica_upper_bound:
+            replicas = replica_upper_bound
+        # Strict ">" keeps the earliest option when several tie.
+        if option * replicas > best[0]:
+            best = (option * replicas, option, replicas)
+    _, per_replica, replicas = best
+    leftover = quota - per_replica * replicas
+    return per_replica, replicas, remaining_devices + leftover
diff --git a/streamwise/model_provisioner/policies.py b/streamwise/model_provisioner/policies.py
new file mode 100644
index 00000000..3f670f93
--- /dev/null
+++ b/streamwise/model_provisioner/policies.py
@@ -0,0 +1,252 @@
+from __future__ import annotations
+
+from sim_types import Objective
+from sim_types import Policy
+from sim_types import GPUType
+from sim_types import Model
+from sim_types import Solver
+
+from constants import GPU_RESERVED_COST
+from constants import GPU_SPOT_COST
+
+
+# Maximum number of devices for each model.
+# Devices are allocated to each model in proportion to its maximum.
+MAX_DEVICES = {
+    Model.GEMMA: 8,
+    Model.FLUX: 16,
+    Model.HF: 40,
+    Model.HF_VAE: 1,
+    Model.FT: 40,
+    Model.FT_VAE: 1,
+}
+
+# Upper bound on optimization-loop iterations; prevents infinite loops in case of non-monotonic allocators or other issues
+MAX_ITERATIONS = 100
+
+# Set to True to use up all GPUs once the greedy optimization loop finds no further improvements
+USE_ALL_GPUS = True
+
+# Default StreamWise policy configuration.
+# TODO: Add a meta policy that picks the best among disaggregation options for HF/FT
+STREAMWISE_POLICY = Policy(
+    name="streamwise",
+    gpu_cost=GPU_SPOT_COST,
+    objective=Objective.TTFF_COST,
+    disaggregation={
+        Model.HF: True,
+        Model.FT: False,
+    },
+    use_upscaler=True,
+    hardware=list(GPUType),
+)
+
+STREAMWISE_MILP_POLICY = Policy(
+    name="streamwise",  # Same name as STREAMWISE_POLICY
+    gpu_cost=GPU_SPOT_COST,
+    objective=Objective.TTFF_COST,
+    disaggregation={
+        Model.HF: True,
+        Model.FT: False,
+    },
+    use_upscaler=True,
+    hardware=list(GPUType),
+    solver=Solver.GUROBI,  # Only argument not also passed to STREAMWISE_POLICY
+)
+
+
+"""
+HexGen policy configuration.
+"""
+HEXGEN_POLICY = Policy(
+    name="hexgen",
+    gpu_cost=GPU_RESERVED_COST,
+    objective=Objective.TTFF,  # Does not account for cost
+    disaggregation={
+        Model.HF: True,
+        Model.FT: False,
+    },  # Disaggregation: enabled for HF only
+    use_upscaler=False,
+    hardware=[  # Multiple hardware types
+        GPUType.A100,
+        GPUType.H100,
+        GPUType.H200,
+        GPUType.GB200,
+    ],
+    solver=Solver.HEXGEN,
+)
+
+
+"""
+Helix policy configuration.
+Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
+Optimizes models one-by-one following MODEL_ORDER using MILP.
+"""
+HELIX_POLICY = Policy(
+    name="helix",
+    gpu_cost=GPU_RESERVED_COST,
+    objective=Objective.TTFF,  # Does not account for cost
+    disaggregation={
+        Model.HF: True,
+        Model.FT: False,
+    },  # Disaggregation: enabled for HF only
+    use_upscaler=False,
+    hardware=list(GPUType),  # Multiple hardware types
+    solver=Solver.HELIX,
+)
+
+
+"""
+DDiT policy configuration.
+Reference: https://arxiv.org/html/2506.13497v1
+"""
+DDIT_POLICY = Policy(
+    name="ddit",
+    gpu_cost=GPU_RESERVED_COST,
+    objective=Objective.TTFF,  # Does not account for cost
+    disaggregation={
+        Model.HF: True,
+        Model.FT: False,
+    },  # Disaggregation: enabled for HF only
+    use_upscaler=False,
+    hardware=list(GPUType),  # Multiple hardware types
+    solver=Solver.NAIVE,
+)
+
+
+STREAMWISE_ENERGY_POLICY = Policy(
+    name="streamwise energy",
+    gpu_cost=GPU_SPOT_COST,
+    objective=Objective.TIME_ENERGY,  # STREAMWISE_POLICY uses TTFF_COST here
+    disaggregation={
+        Model.HF: True,
+        Model.FT: False,
+    },
+    use_upscaler=True,
+    hardware=list(GPUType),
+)
+
+NAIVE_POLICY = Policy(
+    name="naive",
+    gpu_cost=GPU_RESERVED_COST,
+    objective=Objective.TTFF,
+    disaggregation={},  # No per-model disaggregation entries
+    use_upscaler=False,
+    hardware=[GPUType.A100],  # Single hardware type
+    solver=Solver.NAIVE,
+)
+
+
+BASELINE_POLICIES = {  # Variants of NAIVE_POLICY; "Changed" marks each differing argument
+    "naive": NAIVE_POLICY,
+    "naive disag": Policy(
+        "naive disag",
+        gpu_cost=GPU_RESERVED_COST,
+        objective=Objective.TTFF,
+        disaggregation={
+            Model.HF: True,
+            Model.FT: True,
+        },  # Changed: HF and FT both set to True
+        use_upscaler=False,
+        hardware=[GPUType.A100],
+        solver=Solver.NAIVE,
+    ),
+    "naive upscaler": Policy(
+        "naive upscaler",
+        gpu_cost=GPU_RESERVED_COST,
+        objective=Objective.TTFF,
+        disaggregation={},
+        use_upscaler=True,  # Changed to True
+        hardware=[GPUType.A100],
+        solver=Solver.NAIVE,
+    ),
+    "naive spot": Policy(
+        "naive spot",
+        gpu_cost=GPU_SPOT_COST,  # Changed to SPOT_COST
+        objective=Objective.TTFF,
+        disaggregation={},
+        use_upscaler=False,
+        hardware=[GPUType.A100],
+        solver=Solver.NAIVE,
+    ),
+    "naive ttff*cost allocator": Policy(
+        "naive ttff*cost allocator",
+        GPU_RESERVED_COST,  # Positional; presumably gpu_cost (keyword elsewhere) - confirm
+        objective=Objective.TTFF_COST,  # Changed to TTFF_COST
+        disaggregation={},
+        use_upscaler=False,
+        hardware=[GPUType.A100],
+        solver=Solver.GREEDY,  # Changed to GREEDY
+    ),
+    "naive hardware": Policy(
+        "naive hardware",
+        GPU_RESERVED_COST,  # Positional; presumably gpu_cost (keyword elsewhere) - confirm
+        objective=Objective.TTFF,
+        disaggregation={},
+        use_upscaler=False,
+        hardware=list(GPUType),  # Changed hardware
+        solver=Solver.NAIVE,
+    ),
+}
+
+
+STREAMWISE_POLICIES = {  # Variants of STREAMWISE_POLICY; "Changed" marks each differing argument
+    "streamwise": STREAMWISE_POLICY,
+    "streamwise no disag": Policy(
+        name="streamwise no disag",
+        gpu_cost=GPU_SPOT_COST,
+        objective=Objective.TTFF_COST,
+        disaggregation={},  # Changed: no disaggregation entries
+        use_upscaler=True,
+        hardware=list(GPUType),
+        solver=Solver.GREEDY,
+    ),
+    "streamwise no upscaler": Policy(
+        name="streamwise no upscaler",
+        gpu_cost=GPU_SPOT_COST,
+        objective=Objective.TTFF_COST,
+        disaggregation={
+            Model.HF: True,
+            Model.FT: False,
+        },
+        use_upscaler=False,  # Changed to False
+        hardware=list(GPUType),
+        solver=Solver.GREEDY,
+    ),
+    "streamwise no spot": Policy(
+        name="streamwise no spot",
+        gpu_cost=GPU_RESERVED_COST,  # Changed to RESERVED_COST
+        objective=Objective.TTFF_COST,
+        disaggregation={
+            Model.HF: True,
+            Model.FT: False,
+        },
+        use_upscaler=True,
+        hardware=list(GPUType),
+        solver=Solver.GREEDY,
+    ),
+    "streamwise naive allocator": Policy(
+        name="streamwise naive allocator",
+        gpu_cost=GPU_SPOT_COST,
+        objective=Objective.TTFF,  # Changed to TTFF
+        disaggregation={
+            Model.HF: True,
+            Model.FT: False,
+        },
+        use_upscaler=True,
+        hardware=list(GPUType),
+        solver=Solver.NAIVE,  # Changed to NAIVE
+    ),
+    "streamwise A100": Policy(
+        name="streamwise single hardware",  # NOTE(review): differs from the dict key "streamwise A100" - confirm
+        gpu_cost=GPU_SPOT_COST,
+        objective=Objective.TTFF_COST,
+        disaggregation={
+            Model.HF: True,
+            Model.FT: False,
+        },
+        use_upscaler=True,
+        hardware=[GPUType.A100],  # Changed to A100 only
+        solver=Solver.NAIVE,  # NOTE(review): the other TTFF_COST variants here use GREEDY - confirm intended
+    ),
+}
diff --git a/tests/simulator/conftest.py b/tests/simulator/conftest.py
new file mode 100644
index 00000000..d8e52f08
--- /dev/null
+++ b/tests/simulator/conftest.py
@@ -0,0 +1,24 @@
+"""
+Conftest for simulator tests.
+
+Sets PYTHONPATH so that child processes spawned by ProcessPoolExecutor
+can find the simulator and streamwise modules.
+"""
+import os
+import sys
+
+_REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
+_SIMULATOR_DIR = os.path.join(_REPO_ROOT, "simulator")
+_STREAMWISE_DIR = os.path.join(_REPO_ROOT, "streamwise")
+_PATHS = (_REPO_ROOT, _SIMULATOR_DIR, _STREAMWISE_DIR)
+
+# Propagate paths to child processes via PYTHONPATH. Compare whole entries (not
+# substrings) and prepend only the directories that are actually missing.
+_EXISTING = os.environ.get("PYTHONPATH", "")
+_EXTRA = os.pathsep.join(_p for _p in _PATHS if _p not in _EXISTING.split(os.pathsep))
+if _EXTRA:
+    os.environ["PYTHONPATH"] = _EXTRA + os.pathsep + _EXISTING if _EXISTING else _EXTRA
+
+for _p in _PATHS:
+    if _p not in sys.path:
+        sys.path.insert(0, _p)
diff --git a/tests/simulator/test_auto_model_allocator.py b/tests/simulator/test_auto_model_allocator.py
index 18ff1871..f7550822 100644
--- a/tests/simulator/test_auto_model_allocator.py
+++ b/tests/simulator/test_auto_model_allocator.py
@@ -24,21 +24,21 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import QualityLevel
-    from model_provisioner.sim_types import Solver
+    from sim_types import GPUType
+    from sim_types import Model
+    from sim_types import QualityLevel
+    from sim_types import Solver
 
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import DEFAULT_WORKFLOW_CONFIG
 
-    from model_provisioner.data_loading import load_latency_data
+    from data_loading import load_latency_data
 
     from model_provisioner.policies import STREAMWISE_POLICY
     from model_provisioner.policies import NAIVE_POLICY
     from model_provisioner.policies import HEXGEN_POLICY
     from model_provisioner.policies import HELIX_POLICY
 
-    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from auto_model_allocator import AutoModelAllocator
 
     from model_provisioner.greedy import GreedyAllocator
     from model_provisioner.naive_baseline import NaiveAllocator
@@ -46,7 +46,7 @@
     from model_provisioner.helix import HelixAllocator
     from model_provisioner.milp import MILPAllocator
 
-    from model_provisioner.workflows import PODCAST_WORKFLOW
+    from workflows import PODCAST_WORKFLOW
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py
index de883d35..72337375 100644
--- a/tests/simulator/test_data_loading.py
+++ b/tests/simulator/test_data_loading.py
@@ -12,11 +12,11 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import QualityLevel
+    from sim_types import QualityLevel
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
-    from model_provisioner.data_loading import load_adaptive_quality_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
+    from data_loading import load_adaptive_quality_data
 
 
 def test_latency() -> None:
diff --git a/tests/simulator/test_evaluator.py b/tests/simulator/test_evaluator.py
index 6f3a5aa7..b3c37e73 100644
--- a/tests/simulator/test_evaluator.py
+++ b/tests/simulator/test_evaluator.py
@@ -9,28 +9,28 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
-    from model_provisioner.constants import SECONDS_IN_HOUR
+    from constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import SECONDS_IN_HOUR
 
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
+    from sim_types import GPUType
+    from sim_types import Model
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
-    from model_provisioner.evaluator import evaluate_model_allocation
+    from evaluator import evaluate_model_allocation
 
     from model_provisioner.policies import STREAMWISE_POLICY
 
-    from model_provisioner.models import FluxModelAllocation
-    from model_provisioner.models import GemmaModelAllocation
-    from model_provisioner.models import HFModelAllocation
-    from model_provisioner.models import HFVAEModelAllocation
-    from model_provisioner.models import FTModelAllocation
-    from model_provisioner.models import UpscalerModelAllocation
-    from model_provisioner.models import OthersModelAllocation
+    from models import FluxModelAllocation
+    from models import GemmaModelAllocation
+    from models import HFModelAllocation
+    from models import HFVAEModelAllocation
+    from models import FTModelAllocation
+    from models import UpscalerModelAllocation
+    from models import OthersModelAllocation
 
-    from model_provisioner.utils import to_models_df
+    from utils import to_models_df
 
 
 def test_empty() -> None:
diff --git a/tests/simulator/test_greedy.py b/tests/simulator/test_greedy.py
index 786cc2c2..bfa2996e 100644
--- a/tests/simulator/test_greedy.py
+++ b/tests/simulator/test_greedy.py
@@ -9,17 +9,17 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
-    from model_provisioner.constants import SECONDS_IN_HOUR
+    from constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import SECONDS_IN_HOUR
 
-    from model_provisioner.workflows import WORKFLOWS
+    from workflows import WORKFLOWS
 
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import QualityLevel
-    from model_provisioner.sim_types import WorkflowConfig
+    from sim_types import GPUType
+    from sim_types import QualityLevel
+    from sim_types import WorkflowConfig
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
     from model_provisioner.greedy import GreedyAllocator
 
diff --git a/tests/simulator/test_helix.py b/tests/simulator/test_helix.py
index 06ec8f3a..7261b902 100644
--- a/tests/simulator/test_helix.py
+++ b/tests/simulator/test_helix.py
@@ -13,13 +13,13 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import MODEL_ORDER
-    from model_provisioner.sim_types import Solver
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from constants import DEFAULT_WORKFLOW_CONFIG
+    from sim_types import GPUType
+    from sim_types import Model
+    from sim_types import MODEL_ORDER
+    from sim_types import Solver
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
     from model_provisioner.helix import HelixAllocator
     from model_provisioner.policies import HELIX_POLICY
 
diff --git a/tests/simulator/test_hexgen.py b/tests/simulator/test_hexgen.py
index 3317a82e..3d77867b 100644
--- a/tests/simulator/test_hexgen.py
+++ b/tests/simulator/test_hexgen.py
@@ -8,12 +8,12 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.data_loading import load_latency_data
+    from constants import DEFAULT_WORKFLOW_CONFIG
+    from sim_types import GPUType
+    from data_loading import load_latency_data
     from model_provisioner.hexgen import HexGenAllocator
     from model_provisioner.hexgen import _get_model_order
-    from model_provisioner.sim_types import MODEL_ORDER
+    from sim_types import MODEL_ORDER
 
 
 def test_get_model_order() -> None:
diff --git a/tests/simulator/test_milp.py b/tests/simulator/test_milp.py
index 9b0e909e..52a308bd 100644
--- a/tests/simulator/test_milp.py
+++ b/tests/simulator/test_milp.py
@@ -14,28 +14,28 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import LatencyData
-    from model_provisioner.sim_types import PowerData
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Objective
-    from model_provisioner.sim_types import Solver
-    from model_provisioner.sim_types import QualityLevel
+    from sim_types import LatencyData
+    from sim_types import PowerData
+    from sim_types import GPUType
+    from sim_types import Objective
+    from sim_types import Solver
+    from sim_types import QualityLevel
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
-    from model_provisioner.constants import SECONDS_IN_HOUR
+    from constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import SECONDS_IN_HOUR
 
     from model_provisioner.policies import STREAMWISE_MILP_POLICY
 
-    from model_provisioner.workflows import WORKFLOWS
+    from workflows import WORKFLOWS
 
     from model_provisioner.milp import MILPAllocator
 
-    from model_provisioner.evaluator import evaluate_model_allocation
+    from evaluator import evaluate_model_allocation
 
-    from model_provisioner.utils import to_models_df
+    from utils import to_models_df
 
 
 def test_base() -> None:
diff --git a/tests/simulator/test_models.py b/tests/simulator/test_models.py
index c0171d99..eccb449b 100644
--- a/tests/simulator/test_models.py
+++ b/tests/simulator/test_models.py
@@ -17,33 +17,33 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import ModelAllocation
-    from model_provisioner.sim_types import QualityLevel
-    from model_provisioner.sim_types import LatencyData
-    from model_provisioner.sim_types import PowerData
+    from sim_types import GPUType
+    from sim_types import Model
+    from sim_types import ModelAllocation
+    from sim_types import QualityLevel
+    from sim_types import LatencyData
+    from sim_types import PowerData
 
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import DEFAULT_WORKFLOW_CONFIG
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
     from model_provisioner.policies import STREAMWISE_POLICY
     from model_provisioner.policies import NAIVE_POLICY
 
-    from model_provisioner.models import get_model_allocation
-    from model_provisioner.models import _calculate_total_time
-    from model_provisioner.models import assert_pixel_config
-    from model_provisioner.models import _MODEL_ALLOCATION_REGISTRY
-    from model_provisioner.models import GemmaModelAllocation
-    from model_provisioner.models import FluxModelAllocation
-    from model_provisioner.models import HFModelAllocation
-    from model_provisioner.models import HFVAEModelAllocation
-    from model_provisioner.models import FTModelAllocation
-    from model_provisioner.models import FTVAEModelAllocation
-    from model_provisioner.models import UpscalerModelAllocation
-    from model_provisioner.models import OthersModelAllocation
+    from models import get_model_allocation
+    from models import _calculate_total_time
+    from models import assert_pixel_config
+    from models import _MODEL_ALLOCATION_REGISTRY
+    from models import GemmaModelAllocation
+    from models import FluxModelAllocation
+    from models import HFModelAllocation
+    from models import HFVAEModelAllocation
+    from models import FTModelAllocation
+    from models import FTVAEModelAllocation
+    from models import UpscalerModelAllocation
+    from models import OthersModelAllocation
 
 
 # ---------------------------------------------------------------------------
@@ -152,7 +152,7 @@ def test_assert_pixel_config() -> None:
     assert_pixel_config(DEFAULT_WORKFLOW_CONFIG)
 
     # Patching MEDIUM > HIGH violates the ordering constraint → AssertionError.
-    with patch.dict("model_provisioner.sim_types.RESOLUTION_PIXELS",
+    with patch.dict("sim_types.RESOLUTION_PIXELS",
                     {QualityLevel.MEDIUM: 1000, QualityLevel.HIGH: 500}):
         with pytest.raises(AssertionError):
             assert_pixel_config(DEFAULT_WORKFLOW_CONFIG)
diff --git a/tests/simulator/test_multirequests_derive.py b/tests/simulator/test_multirequests_derive.py
index d5286121..c809ccd0 100644
--- a/tests/simulator/test_multirequests_derive.py
+++ b/tests/simulator/test_multirequests_derive.py
@@ -8,9 +8,9 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import QualityLevel
+    from sim_types import GPUType
+    from sim_types import Model
+    from sim_types import QualityLevel
 
     from multirequests import TIME_PER_REQ
     from multirequests import INIT_REPLICAS
diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py
index d621cd33..d698bb9d 100644
--- a/tests/simulator/test_simulator.py
+++ b/tests/simulator/test_simulator.py
@@ -14,18 +14,18 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import WorkflowConfig
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import Objective
-    from model_provisioner.sim_types import GPUType
+    from sim_types import WorkflowConfig
+    from sim_types import Model
+    from sim_types import Objective
+    from sim_types import GPUType
 
-    from model_provisioner.constants import SECONDS_IN_HOUR
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import SECONDS_IN_HOUR
+    from constants import DEFAULT_WORKFLOW_CONFIG
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
-    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from auto_model_allocator import AutoModelAllocator
     from model_provisioner.greedy import GreedyAllocator
 
     from model_provisioner.policies import STREAMWISE_POLICY
diff --git a/tests/simulator/test_simulator_actions.py b/tests/simulator/test_simulator_actions.py
index 11efd7b2..539946c5 100644
--- a/tests/simulator/test_simulator_actions.py
+++ b/tests/simulator/test_simulator_actions.py
@@ -8,11 +8,11 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import Action
-    from model_provisioner.sim_types import ActionName
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import Result
+    from sim_types import Action
+    from sim_types import ActionName
+    from sim_types import GPUType
+    from sim_types import Model
+    from sim_types import Result
 
 
 def test_action() -> None:
diff --git a/tests/simulator/test_simulator_baseline.py b/tests/simulator/test_simulator_baseline.py
index 24749ffb..b195a1cf 100644
--- a/tests/simulator/test_simulator_baseline.py
+++ b/tests/simulator/test_simulator_baseline.py
@@ -12,18 +12,18 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
+    from sim_types import GPUType
+    from sim_types import Model
 
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
-    from model_provisioner.constants import SECONDS_IN_HOUR
-    from model_provisioner.constants import POWER_GPU_IDLE
-    from model_provisioner.constants import POWER_GPU_TDP
+    from constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import SECONDS_IN_HOUR
+    from constants import POWER_GPU_IDLE
+    from constants import POWER_GPU_TDP
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
-    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from auto_model_allocator import AutoModelAllocator
     from model_provisioner.naive_baseline import NaiveAllocator
     from model_provisioner.greedy import GreedyAllocator
 
@@ -31,8 +31,8 @@
     from model_provisioner.policies import BASELINE_POLICIES
     from model_provisioner.policies import STREAMWISE_POLICY
 
-    from model_provisioner.workflows import SHORTS_WORKFLOW
-    from model_provisioner.workflows import WORKFLOWS
+    from workflows import SHORTS_WORKFLOW
+    from workflows import WORKFLOWS
 
 
 def test_baseline() -> None:
diff --git a/tests/simulator/test_simulator_energy.py b/tests/simulator/test_simulator_energy.py
index a739f698..c96fd128 100644
--- a/tests/simulator/test_simulator_energy.py
+++ b/tests/simulator/test_simulator_energy.py
@@ -10,17 +10,17 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import DEFAULT_WORKFLOW_CONFIG
 
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import Objective
-    from model_provisioner.sim_types import Solver
+    from sim_types import GPUType
+    from sim_types import Model
+    from sim_types import Objective
+    from sim_types import Solver
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.data_loading import load_power_data
+    from data_loading import load_latency_data
+    from data_loading import load_power_data
 
-    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from auto_model_allocator import AutoModelAllocator
     from model_provisioner.greedy import GreedyAllocator
     from model_provisioner.naive_baseline import NaiveAllocator
 
diff --git a/tests/simulator/test_simulator_multirequests.py b/tests/simulator/test_simulator_multirequests.py
index 3d3e350a..6403baba 100644
--- a/tests/simulator/test_simulator_multirequests.py
+++ b/tests/simulator/test_simulator_multirequests.py
@@ -21,12 +21,12 @@
     from multirequests import TIME_PER_REQ_ADAPTIVE
     from multirequests import get_time_per_request_baseline
 
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.workflows import PODCAST_WORKFLOW
+    from data_loading import load_latency_data
+    from workflows import PODCAST_WORKFLOW
 
-    from model_provisioner.constants import GPU_SPOT_COST
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import Model
+    from constants import GPU_SPOT_COST
+    from sim_types import GPUType
+    from sim_types import Model
 
 
 def test_multirequests() -> None:
diff --git a/tests/simulator/test_simulator_plotutils.py b/tests/simulator/test_simulator_plotutils.py
index 2d3b35e2..b3bdead9 100644
--- a/tests/simulator/test_simulator_plotutils.py
+++ b/tests/simulator/test_simulator_plotutils.py
@@ -14,10 +14,10 @@
     from plot_utils import plot_cost_vs_qpm
     from plot_utils import _get_time_ticklabels
 
-    from model_provisioner.sim_types import ProvisioningResult
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import QualityLevel
-    from model_provisioner.sim_types import Model
+    from sim_types import ProvisioningResult
+    from sim_types import GPUType
+    from sim_types import QualityLevel
+    from sim_types import Model
 
 
 def test_plot_ttff_vs_cost() -> None:
diff --git a/tests/simulator/test_simulator_policies.py b/tests/simulator/test_simulator_policies.py
index 42bf69db..d9e1421f 100644
--- a/tests/simulator/test_simulator_policies.py
+++ b/tests/simulator/test_simulator_policies.py
@@ -15,7 +15,7 @@
     from model_provisioner.policies import STREAMWISE_POLICY
     from model_provisioner.policies import BASELINE_POLICIES
 
-    from model_provisioner.sim_types import Objective
+    from sim_types import Objective
 
 
 def test_streamwise_policies() -> None:
diff --git a/tests/simulator/test_simulator_provisioning.py b/tests/simulator/test_simulator_provisioning.py
index d781bc2e..fb5d46fd 100644
--- a/tests/simulator/test_simulator_provisioning.py
+++ b/tests/simulator/test_simulator_provisioning.py
@@ -8,7 +8,7 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG
+    from constants import DEFAULT_WORKFLOW_CONFIG
 
     from provisioning import get_provisioning_results
     from provisioning import get_provisioning_adaptive_results
@@ -17,11 +17,11 @@
     from provisioning import GPU_PROVISIONS
     from provisioning import GPU_PROVISIONS_SHORT
 
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import QualityLevel
-    from model_provisioner.sim_types import Solver
+    from sim_types import GPUType
+    from sim_types import QualityLevel
+    from sim_types import Solver
 
-    from model_provisioner.data_loading import load_latency_data
+    from data_loading import load_latency_data
 
     from model_provisioner.policies import NAIVE_POLICY
     from model_provisioner.policies import STREAMWISE_POLICY
diff --git a/tests/simulator/test_simulator_types.py b/tests/simulator/test_simulator_types.py
index 9e2384ed..223a3260 100644
--- a/tests/simulator/test_simulator_types.py
+++ b/tests/simulator/test_simulator_types.py
@@ -9,20 +9,20 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import GPUType
+    from sim_types import Model
+    from sim_types import GPUType
 
-    from model_provisioner.sim_types_json import models_to_json
-    from model_provisioner.sim_types_json import workflow_to_json
-    from model_provisioner.sim_types_json import policy_to_json
-    from model_provisioner.sim_types_json import model_list_to_json
+    from sim_types_json import models_to_json
+    from sim_types_json import workflow_to_json
+    from sim_types_json import policy_to_json
+    from sim_types_json import model_list_to_json
 
-    from model_provisioner.models import GemmaModelAllocation
-    from model_provisioner.models import FluxModelAllocation
+    from models import GemmaModelAllocation
+    from models import FluxModelAllocation
 
     from model_provisioner.policies import STREAMWISE_POLICY
 
-    from model_provisioner.workflows import PODCAST_WORKFLOW
+    from workflows import PODCAST_WORKFLOW
 
 
 def test_serialize_models() -> None:
diff --git a/tests/simulator/test_simulator_utils.py b/tests/simulator/test_simulator_utils.py
index e1575d9a..b78d675d 100644
--- a/tests/simulator/test_simulator_utils.py
+++ b/tests/simulator/test_simulator_utils.py
@@ -7,18 +7,18 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import Model
-    from model_provisioner.sim_types import GPUType
-    from model_provisioner.sim_types import ModelAllocation
-    from model_provisioner.sim_types import ProvisioningResult
-
-    from model_provisioner.utils import get_pareto_frontier
-    from model_provisioner.utils import find_most_cost_effective_provisioning
-    from model_provisioner.utils import find_most_energy_efficient_provisioning
-    from model_provisioner.utils import find_pareto_frontier
-    from model_provisioner.utils import coalesce_models
-
-    from model_provisioner.models import FTModelAllocation
+    from sim_types import Model
+    from sim_types import GPUType
+    from sim_types import ModelAllocation
+    from sim_types import ProvisioningResult
+
+    from utils import get_pareto_frontier
+    from utils import find_most_cost_effective_provisioning
+    from utils import find_most_energy_efficient_provisioning
+    from utils import find_pareto_frontier
+    from utils import coalesce_models
+
+    from models import FTModelAllocation
 
 
 def test_get_pareto_frontier() -> None:
diff --git a/tests/simulator/test_workflows.py b/tests/simulator/test_workflows.py
index 19a7ff0c..b38dc2ab 100644
--- a/tests/simulator/test_workflows.py
+++ b/tests/simulator/test_workflows.py
@@ -16,8 +16,8 @@
 from tests.test_utils import temp_sys_path
 
 with temp_sys_path("simulator", "streamwise"):
-    from model_provisioner.sim_types import WorkflowConfig, Model, QualityLevel, GPUType
-    from model_provisioner.constants import (
+    from sim_types import WorkflowConfig, Model, QualityLevel, GPUType
+    from constants import (
         FPS,
         FRAMES_OPTIONS,
         FRAMES_PER_STEP_IDX,
@@ -26,10 +26,10 @@
         SECONDS_IN_MINUTE,
         TOTAL_INPUT_TOKENS,
     )
-    from model_provisioner.data_loading import load_latency_data
-    from model_provisioner.auto_model_allocator import AutoModelAllocator
+    from data_loading import load_latency_data
+    from auto_model_allocator import AutoModelAllocator
     from model_provisioner.policies import STREAMWISE_POLICY, NAIVE_POLICY
-    from model_provisioner.workflows import (
+    from workflows import (
         MAX_FT_FRAMES,
         SUBSCENE_SECONDS,
         SUBSCENES_PER_SCENE,
diff --git a/tests/streamwise/test_allocator_bridge.py b/tests/streamwise/test_allocator_bridge.py
new file mode 100644
index 00000000..bd45f8a6
--- /dev/null
+++ b/tests/streamwise/test_allocator_bridge.py
@@ -0,0 +1,280 @@
+"""
+Tests for streamwise/allocator_bridge.py.
+
+Covers:
+- Model-to-container name mapping.
+- Result to deployment specs conversion.
+- run_allocator end-to-end (with real latency data).
+- Error handling for invalid inputs.
+"""
+
+from __future__ import annotations
+
+import sys
+import os
+
+import pytest
+
+# Add current path
+sys.path.append(os.getcwd())
+
+from tests.test_utils import temp_sys_path
+
+with temp_sys_path("streamwise", "simulator"):
+    from allocator_bridge import (
+        MODEL_TO_CONTAINERS,
+        CONTAINER_RESOURCES,
+        GPU_TYPE_TO_POD_STR,
+        APP_TO_WORKFLOW,
+        DeploymentSpec,
+        DeploymentPlan,
+        get_available_workflows,
+        get_available_gpu_types,
+        result_to_deployment_specs,
+        deployment_plan_to_json,
+        run_allocator,
+    )
+    from sim_types import GPUType, Model, Result
+    from models import (
+        GemmaModelAllocation,
+        FluxModelAllocation,
+        HFModelAllocation,
+        HFVAEModelAllocation,
+        FTModelAllocation,
+        OthersModelAllocation,
+        UpscalerModelAllocation,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Mapping correctness
+# ---------------------------------------------------------------------------
+
+def test_model_to_containers_covers_all_models() -> None:
+    """Every Model enum value must have a mapping entry."""
+    for model in Model:
+        assert model in MODEL_TO_CONTAINERS, f"Missing mapping for {model}"
+
+
+def test_container_resources_covers_all_mapped_containers() -> None:
+    """Every container referenced in MODEL_TO_CONTAINERS must have resource defaults."""
+    for model, containers in MODEL_TO_CONTAINERS.items():
+        for container in containers:
+            assert container in CONTAINER_RESOURCES, (
+                f"Missing CONTAINER_RESOURCES for '{container}' (from {model})")
+
+
+def test_gpu_type_to_pod_str_covers_all_gpu_types() -> None:
+    """Every GPUType enum value must have a pod string mapping."""
+    for gpu_type in GPUType:
+        assert gpu_type in GPU_TYPE_TO_POD_STR
+
+
+def test_app_to_workflow_has_expected_entries() -> None:
+    """Key StreamWise apps should map to workflows."""
+    assert "streamcast" in APP_TO_WORKFLOW
+    assert "streampersona" in APP_TO_WORKFLOW
+    assert "streamchat" in APP_TO_WORKFLOW
+
+
+# ---------------------------------------------------------------------------
+# Utility functions
+# ---------------------------------------------------------------------------
+
+def test_get_available_workflows() -> None:
+    workflows = get_available_workflows()
+    assert isinstance(workflows, list)
+    assert "streamcast" in workflows
+    assert len(workflows) >= 5
+
+
+def test_get_available_gpu_types() -> None:
+    gpu_types = get_available_gpu_types()
+    assert isinstance(gpu_types, list)
+    assert "A100" in gpu_types
+    assert "H100" in gpu_types
+
+
+# ---------------------------------------------------------------------------
+# result_to_deployment_specs
+# ---------------------------------------------------------------------------
+
+def test_result_to_deployment_specs_basic() -> None:
+    """A simple result with one active allocation maps to the right container."""
+    models = {
+        GPUType.A100: {
+            Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)],
+            Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.A100, devices=2, replicas=1)],
+            Model.HF: [HFModelAllocation(gpu_type=GPUType.A100, devices=2, replicas=2)],
+            Model.HF_VAE: [HFVAEModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)],
+            Model.FT: [FTModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.FT_VAE: [],
+            Model.UPSCALER: [UpscalerModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.OTHERS: [OthersModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)],
+        }
+    }
+    result = Result(
+        total_time_s=100.0,
+        ttff_s=10.0,
+        cost=1.0,
+        gpus_used={GPUType.A100: 8},
+        gpus_total={GPUType.A100: 8},
+        models=models,
+    )
+
+    specs = result_to_deployment_specs(result)
+    assert isinstance(specs, list)
+    assert len(specs) > 0
+
+    container_names = [s.container_name for s in specs]
+    assert "gemma" in container_names
+    assert "flux" in container_names
+    assert "hunyuanframepackf1" in container_names  # HF model
+    assert "hunyuanframepackvae" in container_names  # HF_VAE model
+
+    # OTHERS maps to kokoro + yolo
+    assert "kokoro" in container_names
+    assert "yolo" in container_names
+
+    # Check GPU type mapping
+    gemma_spec = next(s for s in specs if s.container_name == "gemma")
+    assert gemma_spec.gpu_type == "a100"
+    assert gemma_spec.gpu == 1
+
+    # MIG containers get mig_profile set
+    kokoro_spec = next(s for s in specs if s.container_name == "kokoro")
+    assert kokoro_spec.mig_profile == "1g.10gb"
+
+
+def test_result_to_deployment_specs_skips_zero_replicas() -> None:
+    """Allocations with zero replicas should not produce deployment specs."""
+    models = {
+        GPUType.A100: {
+            Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.HF: [HFModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)],
+            Model.HF_VAE: [],
+            Model.FT: [],
+            Model.FT_VAE: [],
+            Model.UPSCALER: [],
+            Model.OTHERS: [],
+        }
+    }
+    result = Result(
+        total_time_s=0.0,
+        ttff_s=0.0,
+        cost=0.0,
+        gpus_used={GPUType.A100: 0},
+        gpus_total={GPUType.A100: 8},
+        models=models,
+    )
+    specs = result_to_deployment_specs(result)
+    assert specs == []
+
+
+def test_result_to_deployment_specs_multiple_replicas() -> None:
+    """Multiple replicas should produce multiple deployment specs for same container."""
+    models = {
+        GPUType.H100: {
+            Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.H100, devices=1, replicas=1)],
+            Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.H100, devices=1, replicas=1)],
+            Model.HF: [HFModelAllocation(gpu_type=GPUType.H100, devices=2, replicas=3)],
+            Model.HF_VAE: [],
+            Model.FT: [],
+            Model.FT_VAE: [],
+            Model.UPSCALER: [],
+            Model.OTHERS: [],
+        }
+    }
+    result = Result(
+        total_time_s=50.0,
+        ttff_s=5.0,
+        cost=0.5,
+        gpus_used={GPUType.H100: 8},
+        gpus_total={GPUType.H100: 16},
+        models=models,
+    )
+    specs = result_to_deployment_specs(result)
+    hf_specs = [s for s in specs if s.container_name == "hunyuanframepackf1"]
+    assert len(hf_specs) == 3  # 3 replicas
+    for spec in hf_specs:
+        assert spec.gpu == 2
+        assert spec.gpu_type == "h100"
+
+
+# ---------------------------------------------------------------------------
+# deployment_plan_to_json
+# ---------------------------------------------------------------------------
+
+def test_deployment_plan_to_json() -> None:
+    """Serialization should produce all expected keys."""
+    result = Result(
+        total_time_s=100.0,
+        ttff_s=10.0,
+        cost=1.5,
+        gpus_used={GPUType.A100: 8},
+        gpus_total={GPUType.A100: 8},
+        models={},
+    )
+    plan = DeploymentPlan(
+        specs=[
+            DeploymentSpec(
+                container_name="gemma", cpu=16, memory_gib=192,
+                ephemeral_storage_gib=64, gpu=2, gpu_type="a100", mig_profile=None)
+        ],
+        result=result,
+        workflow_name="streamcast",
+        gpu_budget={"A100": 8},
+    )
+    data = deployment_plan_to_json(plan)
+    assert data["workflow_name"] == "streamcast"
+    assert data["gpu_budget"] == {"A100": 8}
+    assert data["metrics"]["total_time_s"] == 100.0
+    assert data["metrics"]["ttff_s"] == 10.0
+    assert len(data["specs"]) == 1
+    assert data["specs"][0]["container_name"] == "gemma"
+
+
+# ---------------------------------------------------------------------------
+# run_allocator (integration with real data)
+# ---------------------------------------------------------------------------
+
+def test_run_allocator_streamcast_8_a100() -> None:
+    """Run allocator for StreamCast with 8 A100s — should produce a valid plan."""
+    plan = run_allocator(
+        gpu_budget={"A100": 8},
+        workflow_name="streamcast",
+    )
+    assert isinstance(plan, DeploymentPlan)
+    assert len(plan.specs) > 0
+    assert plan.result.total_time_s > 0
+    assert plan.result.ttff_s > 0
+    assert plan.workflow_name == "streamcast"
+
+
+def test_run_allocator_streamchat_8_h100() -> None:
+    """Run allocator for StreamChat with 8 H100s."""
+    plan = run_allocator(
+        gpu_budget={"H100": 8},
+        workflow_name="streamchat",
+    )
+    assert isinstance(plan, DeploymentPlan)
+    assert len(plan.specs) > 0
+
+
+def test_run_allocator_invalid_workflow() -> None:
+    """Unknown workflow name raises ValueError."""
+    with pytest.raises(ValueError, match="Unknown workflow"):
+        run_allocator(gpu_budget={"A100": 8}, workflow_name="nonexistent")
+
+
+def test_run_allocator_invalid_gpu_type() -> None:
+    """Unknown GPU type raises ValueError."""
+    with pytest.raises(ValueError, match="Unknown GPU type"):
+        run_allocator(gpu_budget={"RTX4090": 8}, workflow_name="streamcast")
+
+
+def test_run_allocator_insufficient_gpus() -> None:
+    """Too few GPUs raises ValueError."""
+    with pytest.raises(ValueError, match="at least 8"):
+        run_allocator(gpu_budget={"A100": 4}, workflow_name="streamcast")

From bccbbd26e144f75ffd0e4d20e6d8503f4441ef56 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 15:57:38 -0700
Subject: [PATCH 3/9] Add auto-deploy API endpoints, UI and tests

---
 .gitignore                                    |   3 +
 streamwise/streamwise.py                      | 118 +++++++++
 streamwise/templates/add_pod.html             | 190 +++++++++++++++
 tests/streamwise/conftest.py                  |  18 ++
 .../streamwise/test_streamwise_auto_deploy.py | 227 ++++++++++++++++++
 5 files changed, 556 insertions(+)
 create mode 100644 tests/streamwise/conftest.py
 create mode 100644 tests/streamwise/test_streamwise_auto_deploy.py

diff --git a/.gitignore b/.gitignore
index 51130c5b..9807bf14 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,9 @@
 *.sln.docstates
 *.env
 
+# Environment files
+.venv/
+
 # User-specific files (MonoDevelop/Xamarin Studio)
 *.userprefs
 
diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py
index 1c63eacf..0ce24ac5 100644
--- a/streamwise/streamwise.py
+++ b/streamwise/streamwise.py
@@ -34,6 +34,7 @@
 import pod_manager
 import node_manager
 import job_manager
+import allocator_bridge
 
 from service_manager import get_services
 from service_manager import get_service_timestamps
@@ -726,6 +727,123 @@ async def api_add_pod() -> QuartReturn:
         return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
 
 
+@route("/api/auto_deploy", methods=["POST"])
+async def api_auto_deploy() -> QuartReturn:
+    """Run the model allocator to produce an optimized deployment plan.
+
+    Expects JSON body:
+        {
+            "gpu_budget": {"A100": 8, "H100": 0, ...},
+            "workflow": "streamcast"
+        }
+
+    Returns the deployment plan with estimated metrics and per-container specs.
+    """
+    try:
+        data = await request.get_json()
+        if not data:
+            return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST
+
+        gpu_budget = data.get("gpu_budget")
+        workflow_name = data.get("workflow")
+
+        if not gpu_budget or not isinstance(gpu_budget, dict):
+            return jsonify({"error": "Missing or invalid 'gpu_budget' field"}), HTTPStatus.BAD_REQUEST
+        if not workflow_name or not isinstance(workflow_name, str):
+            return jsonify({"error": "Missing or invalid 'workflow' field"}), HTTPStatus.BAD_REQUEST
+
+        plan = allocator_bridge.run_allocator(
+            gpu_budget=gpu_budget,
+            workflow_name=workflow_name,
+        )
+        return jsonify(allocator_bridge.deployment_plan_to_json(plan)), HTTPStatus.OK
+
+    except ValueError as ve:
+        return jsonify({"error": str(ve)}), HTTPStatus.BAD_REQUEST
+    except Exception as ex:
+        logging.exception("Error in auto_deploy: %s", ex)
+        return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+@route("/api/auto_deploy/confirm", methods=["POST"])
+async def api_auto_deploy_confirm() -> QuartReturn:
+    """Execute a deployment plan produced by /api/auto_deploy.
+
+    Expects JSON body:
+        {
+            "specs": [
+                {
+                    "container_name": "gemma",
+                    "cpu": 16,
+                    "memory_gib": 192,
+                    "ephemeral_storage_gib": 64,
+                    "gpu": 2,
+                    "gpu_type": "a100",
+                    "mig_profile": null
+                },
+                ...
+            ]
+        }
+
+    Deploys all containers in the plan.
+    """
+    try:
+        data = await request.get_json()
+        if not data:
+            return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST
+
+        specs = data.get("specs")
+        if not specs or not isinstance(specs, list):
+            return jsonify({"error": "Missing or invalid 'specs' field"}), HTTPStatus.BAD_REQUEST
+
+        deployed: List[str] = []
+        errors: List[str] = []
+
+        for spec in specs:
+            container_name = spec.get("container_name")
+            if not container_name:
+                errors.append("Spec missing 'container_name'")
+                continue
+
+            try:
+                await pod_manager.add_pod(
+                    container_name=container_name,
+                    cpu=int(spec.get("cpu", 4)),
+                    memory_gib=int(spec.get("memory_gib", 16)),
+                    ephemeral_storage_gib=int(spec.get("ephemeral_storage_gib", 16)),
+                    gpu=int(spec.get("gpu", 0)),
+                    gpu_type=spec.get("gpu_type"),
+                    mig_profile=spec.get("mig_profile"),
+                    namespace=NAMESPACE,
+                    k8s_cluster=k8s_cluster,
+                )
+                deployed.append(container_name)
+            except Exception as pod_ex:
+                msg = f"Failed to deploy '{container_name}': {pod_ex}"
+                logging.error(msg)
+                errors.append(msg)
+
+        status = HTTPStatus.OK if not errors else HTTPStatus.MULTI_STATUS
+        return jsonify({
+            "deployed": deployed,
+            "errors": errors,
+            "message": f"Deployed {len(deployed)}/{len(specs)} containers.",
+        }), status
+
+    except Exception as ex:
+        logging.exception("Error in auto_deploy/confirm: %s", ex)
+        return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR
+
+
+@route("/api/auto_deploy/workflows", methods=["GET"])
+async def api_auto_deploy_workflows() -> QuartReturn:
+    """Return available workflows and GPU types for the auto-deploy UI."""
+    return jsonify({
+        "workflows": allocator_bridge.get_available_workflows(),
+        "gpu_types": allocator_bridge.get_available_gpu_types(),
+    }), HTTPStatus.OK
+
+
 @route("/api/node/<node_name>", methods=["DELETE"])
 async def api_remove_node(node_name: str) -> QuartReturn:
     return await node_manager.remove_node(
diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html
index d61952aa..f5496e10 100644
--- a/streamwise/templates/add_pod.html
+++ b/streamwise/templates/add_pod.html
@@ -384,6 +384,94 @@ <h2 class="mt-5">🧩 Applications</h2>
             </form>
         {% endif %}
 
+        <!-- Auto-Deploy Section -->
+        <h2 class="mt-5">🤖 Auto Deploy</h2>
+        <p>Specify your GPU budget and the optimizer will determine the best allocation for each component:</p>
+
+        <form id="auto-deploy-form">
+            <fieldset class="border rounded-3 p-3 mb-4">
+                <legend class="float-none w-auto px-2 fw-semibold">
+                    💰 GPU Budget
+                </legend>
+                <div class="row g-3 mb-3">
+                    <div class="col-md-3">
+                        <label for="auto_gpu_a100" class="form-label">A100</label>
+                        <input type="number" class="form-control" id="auto_gpu_a100" name="gpu_a100"
+                            min="0" max="64" value="8">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_h100" class="form-label">H100</label>
+                        <input type="number" class="form-control" id="auto_gpu_h100" name="gpu_h100"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_h200" class="form-label">H200</label>
+                        <input type="number" class="form-control" id="auto_gpu_h200" name="gpu_h200"
+                            min="0" max="64" value="0">
+                    </div>
+                    <div class="col-md-3">
+                        <label for="auto_gpu_gb200" class="form-label">GB200</label>
+                        <input type="number" class="form-control" id="auto_gpu_gb200" name="gpu_gb200"
+                            min="0" max="64" value="0">
+                    </div>
+                </div>
+            </fieldset>
+
+            <fieldset class="border rounded-3 p-3 mb-4">
+                <legend class="float-none w-auto px-2 fw-semibold">
+                    🎬 Workflow
+                </legend>
+                <div class="mb-3">
+                    <label for="auto_workflow" class="form-label">Application workflow</label>
+                    <select class="form-select" id="auto_workflow" name="workflow">
+                        <option value="streamcast" selected>🎙️ StreamCast (Podcast)</option>
+                        <option value="streampersona">👤 StreamPersona (Slide)</option>
+                        <option value="streamchat">💬 StreamChat (Video Chat)</option>
+                        <option value="streamshort">🎬 StreamShort (Shorts)</option>
+                        <option value="streammovie">🎬 StreamMovie (Movie)</option>
+                        <option value="streamanimate">🎞️ StreamAnimate (Story)</option>
+                        <option value="streamlecture">📚 StreamLecture (Lecture)</option>
+                        <option value="streamdub">🎤 StreamDub (Dubbing)</option>
+                        <option value="streamedit">✂️ StreamEdit (Editing)</option>
+                    </select>
+                </div>
+            </fieldset>
+
+            <div class="text-end mb-3">
+                <button type="submit" class="btn btn-warning" style="width: 200px;"
+                    id="auto-deploy-optimize-btn">
+                    🤖 Optimize
+                </button>
+            </div>
+        </form>
+
+        <!-- Auto-deploy results (hidden until optimize is clicked) -->
+        <div id="auto-deploy-results" style="display:none;">
+            <h4>📊 Optimized Deployment Plan</h4>
+            <div id="auto-deploy-metrics" class="alert alert-success mb-3"></div>
+            <table class="table table-sm table-bordered" id="auto-deploy-plan-table">
+                <thead>
+                    <tr>
+                        <th>Container</th>
+                        <th>GPU</th>
+                        <th>GPU Type</th>
+                        <th>CPU</th>
+                        <th>Memory</th>
+                        <th>MIG</th>
+                    </tr>
+                </thead>
+                <tbody id="auto-deploy-plan-body"></tbody>
+            </table>
+            <div class="text-end">
+                <button type="button" class="btn btn-success" style="width: 200px;"
+                    id="auto-deploy-confirm-btn">
+                    ✅ Confirm Deploy
+                </button>
+            </div>
+        </div>
+
+        <div id="auto-deploy-error" class="alert alert-danger mt-3" style="display:none;"></div>
+
         <script src="{{ url_for('static', filename='js/form-utils.js') }}"></script>
         <script>
             // Keep aligned with deployment/helm/values.yaml and services.json
@@ -685,6 +773,108 @@ <h2 class="mt-5">🧩 Applications</h2>
                     });
                 });
             }
+            // Auto-Deploy
+            const autoDeployForm = document.getElementById('auto-deploy-form');
+            if (autoDeployForm) {
+                let currentPlan = null;
+
+                autoDeployForm.addEventListener('submit', function(e) {
+                    e.preventDefault();
+                    const btn = document.getElementById('auto-deploy-optimize-btn');
+                    btn.disabled = true;
+                    btn.textContent = '⏳ Optimizing...';
+
+                    const gpuBudget = {
+                        'A100': parseInt(document.getElementById('auto_gpu_a100').value) || 0,
+                        'H100': parseInt(document.getElementById('auto_gpu_h100').value) || 0,
+                        'H200': parseInt(document.getElementById('auto_gpu_h200').value) || 0,
+                        'GB200': parseInt(document.getElementById('auto_gpu_gb200').value) || 0,
+                    };
+                    const workflow = document.getElementById('auto_workflow').value;
+
+                    const errorDiv = document.getElementById('auto-deploy-error');
+                    const resultsDiv = document.getElementById('auto-deploy-results');
+                    errorDiv.style.display = 'none';
+                    resultsDiv.style.display = 'none';
+
+                    fetch('/api/auto_deploy', {
+                        method: 'POST',
+                        headers: {'Content-Type': 'application/json'},
+                        body: JSON.stringify({gpu_budget: gpuBudget, workflow: workflow}),
+                        credentials: 'same-origin'
+                    })
+                    .then(response => response.json().then(data => ({ok: response.ok, data})))
+                    .then(({ok, data}) => {
+                        btn.disabled = false;
+                        btn.textContent = '🤖 Optimize';
+                        if (!ok) {
+                            errorDiv.textContent = data.error || 'Unknown error';
+                            errorDiv.style.display = '';
+                            return;
+                        }
+                        currentPlan = data;
+                        // Show metrics
+                        const metrics = data.metrics;
+                        document.getElementById('auto-deploy-metrics').innerHTML =
+                            `<strong>Total Time:</strong> ${metrics.total_time_s}s &nbsp;|&nbsp; ` +
+                            `<strong>TTFF:</strong> ${metrics.ttff_s}s &nbsp;|&nbsp; ` +
+                            `<strong>Cost:</strong> $${metrics.cost} &nbsp;|&nbsp; ` +
+                            `<strong>GPUs Used:</strong> ${JSON.stringify(metrics.gpus_used)}`;
+                        // Show plan table
+                        const tbody = document.getElementById('auto-deploy-plan-body');
+                        tbody.innerHTML = '';
+                        data.specs.forEach(spec => {
+                            const row = document.createElement('tr');
+                            row.innerHTML =
+                                `<td>${escapeHtml(spec.container_name)}</td>` +
+                                `<td>${spec.gpu}</td>` +
+                                `<td>${escapeHtml(spec.gpu_type || 'any')}</td>` +
+                                `<td>${spec.cpu}</td>` +
+                                `<td>${spec.memory_gib} GiB</td>` +
+                                `<td>${spec.mig_profile || '-'}</td>`;
+                            tbody.appendChild(row);
+                        });
+                        resultsDiv.style.display = '';
+                    })
+                    .catch(err => {
+                        btn.disabled = false;
+                        btn.textContent = '🤖 Optimize';
+                        errorDiv.textContent = 'Network error: ' + err;
+                        errorDiv.style.display = '';
+                    });
+                });
+
+                document.getElementById('auto-deploy-confirm-btn').addEventListener('click', function() {
+                    if (!currentPlan || !currentPlan.specs) return;
+
+                    const btn = this;
+                    btn.disabled = true;
+                    btn.textContent = '⏳ Deploying...';
+
+                    fetch('/api/auto_deploy/confirm', {
+                        method: 'POST',
+                        headers: {'Content-Type': 'application/json'},
+                        body: JSON.stringify({specs: currentPlan.specs}),
+                        credentials: 'same-origin'
+                    })
+                    .then(response => response.json().then(data => ({ok: response.ok, data})))
+                    .then(({ok, data}) => {
+                        btn.disabled = false;
+                        btn.textContent = '✅ Confirm Deploy';
+                        if (data.errors && data.errors.length > 0) {
+                            alert('Deployed ' + data.deployed.length + ' containers.\nErrors:\n' + data.errors.join('\n'));
+                        } else {
+                            alert(data.message || 'Deployment complete!');
+                        }
+                        window.location.href = '/';
+                    })
+                    .catch(err => {
+                        btn.disabled = false;
+                        btn.textContent = '✅ Confirm Deploy';
+                        alert('Error: ' + err);
+                    });
+                });
+            }
         </script>
         <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.5/dist/js/bootstrap.bundle.min.js"
             integrity="sha384-k6d4wzSIapyDyv1kpU366/PK5hCdSbCRGRCMv+eplOQJWyd1fbcAu9OCUj5zNLiq"
diff --git a/tests/streamwise/conftest.py b/tests/streamwise/conftest.py
new file mode 100644
index 00000000..66b54d00
--- /dev/null
+++ b/tests/streamwise/conftest.py
@@ -0,0 +1,18 @@
+"""
+Shared fixtures for streamwise tests.
+"""
+import sys
+
+import pytest
+
+
+@pytest.fixture(autouse=True)
+def _ensure_simulator_path() -> None:  # type: ignore[misc]
+    """Keep simulator/ on sys.path during each test for lazy policy imports."""
+    added = False
+    if "simulator" not in sys.path:
+        sys.path.insert(0, "simulator")
+        added = True
+    yield  # type: ignore[misc]
+    if added and "simulator" in sys.path:
+        sys.path.remove("simulator")
diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
new file mode 100644
index 00000000..185aff75
--- /dev/null
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -0,0 +1,227 @@
+"""
+Tests for the auto-deploy API endpoints in streamwise.py.
+
+Covers:
+- POST /api/auto_deploy — returns optimized plan.
+- POST /api/auto_deploy/confirm — deploys the plan.
+- GET /api/auto_deploy/workflows — lists available options.
+- Error cases (missing fields, invalid inputs).
+"""
+
+from __future__ import annotations
+
+import sys
+
+import pytest
+
+from http import HTTPStatus
+from unittest.mock import patch
+
+from tests.test_utils import temp_sys_path
+from tests.k8s_mock import K8sMock
+
+mock_k8s = K8sMock()
+
+mock_modules = {}
+mock_modules.update(mock_k8s.get_sub_modules())
+
+# Pre-register the streamwise package so sub-modules resolve correctly.
+from streamwise import http_session_manager  # noqa: F401
+
+# Permanently inject K8s mocks into sys.modules (not via context manager)
+# so that simulator modules loaded alongside streamwise remain importable
+# after setup completes.
+_original_modules = {}  # prior sys.modules entries (None if absent); not restored anywhere in this file
+for mod_name, mock_mod in mock_modules.items():
+    _original_modules[mod_name] = sys.modules.get(mod_name)
+    sys.modules[mod_name] = mock_mod
+
+with temp_sys_path("streamwise"):  # presumably puts streamwise/ on sys.path for this import only — confirm
+    from streamwise import streamwise as sw
+
+
+def _get_client():  # type: ignore[no-untyped-def]
+    """Return a new test client for the streamwise app (``sw.app``)."""
+    return sw.app.test_client()
+
+
+@pytest.fixture(scope="function", autouse=True)
+def setup_k8s_cluster() -> None:
+    """Before each test, set ``sw.k8s_cluster`` to "unittest" and turn ``sw.use_https`` off."""
+    sw.k8s_cluster, sw.use_https = "unittest", False
+
+
+# ---------------------------------------------------------------------------
+# GET /api/auto_deploy/workflows
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_auto_deploy_workflows() -> None:
+    """GET /api/auto_deploy/workflows lists workflows and GPU types, including streamcast and A100."""
+    client = _get_client()
+    response = await client.get("/api/auto_deploy/workflows")
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert "workflows" in data
+    assert "gpu_types" in data
+    assert "streamcast" in data["workflows"]
+    assert "A100" in data["gpu_types"]
+
+
+# ---------------------------------------------------------------------------
+# POST /api/auto_deploy
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_auto_deploy_success() -> None:
+    """A valid POST /api/auto_deploy returns 200 with the stubbed plan's specs and metrics."""
+    fake_json = {  # canned payload returned by the stubbed deployment_plan_to_json
+        "workflow_name": "streamcast",
+        "gpu_budget": {"A100": 8},
+        "metrics": {"total_time_s": 3.5, "ttff_s": 1.0, "cost": 12.0, "gpus_used": {"A100": 3}},
+        "specs": [
+            {"container_name": "gemma", "cpu": 4, "memory_gib": 16,
+             "ephemeral_storage_gib": 10, "gpu": 1, "gpu_type": "A100", "mig_profile": None},
+            {"container_name": "flux", "cpu": 4, "memory_gib": 16,
+             "ephemeral_storage_gib": 10, "gpu": 2, "gpu_type": "A100", "mig_profile": None},
+        ],
+    }
+    # Patch on the actual module object that streamwise.py holds a reference to.
+    with patch.object(sw.allocator_bridge, "run_allocator") as mock_alloc, \
+         patch.object(sw.allocator_bridge, "deployment_plan_to_json", return_value=fake_json):
+        mock_alloc.return_value = "fake_plan"  # opaque placeholder; the JSON converter is stubbed as well
+        client = _get_client()
+        response = await client.post(
+            "/api/auto_deploy",
+            json={
+                "gpu_budget": {"A100": 8},
+                "workflow": "streamcast",
+            },
+        )
+    assert response.status_code == HTTPStatus.OK
+    data = await response.get_json()
+    assert "specs" in data
+    assert "metrics" in data
+    assert len(data["specs"]) == 2
+    assert data["metrics"]["total_time_s"] == 3.5
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_missing_gpu_budget() -> None:
+    """POST /api/auto_deploy without the gpu_budget field is rejected with 400 Bad Request."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={"workflow": "streamcast"},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_missing_workflow() -> None:
+    """POST /api/auto_deploy without the workflow field is rejected with 400 Bad Request."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={"gpu_budget": {"A100": 8}},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_invalid_workflow() -> None:
+    """An unknown workflow name yields 400 Bad Request with an "error" key in the JSON body."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={
+            "gpu_budget": {"A100": 8},
+            "workflow": "nonexistent",
+        },
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+    data = await response.get_json()
+    assert "error" in data
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_insufficient_gpus() -> None:
+    """A budget of two A100s for streamcast is rejected with 400 (the allocator is not patched here)."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy",
+        json={
+            "gpu_budget": {"A100": 2},
+            "workflow": "streamcast",
+        },
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_no_json_body() -> None:
+    """POST /api/auto_deploy with no request body at all is rejected with 400 Bad Request."""
+    client = _get_client()
+    response = await client.post("/api/auto_deploy")
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+# ---------------------------------------------------------------------------
+# POST /api/auto_deploy/confirm
+# ---------------------------------------------------------------------------
+
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_success() -> None:
+    """A two-spec confirm request returns 200 or 207 with "deployed" and "message" keys."""
+    client = _get_client()
+    specs = [
+        {
+            "container_name": "gemma",
+            "cpu": 16,
+            "memory_gib": 192,
+            "ephemeral_storage_gib": 64,
+            "gpu": 2,
+            "gpu_type": "a100",
+            "mig_profile": None,
+        },
+        {
+            "container_name": "flux",
+            "cpu": 12,
+            "memory_gib": 128,
+            "ephemeral_storage_gib": 64,
+            "gpu": 2,
+            "gpu_type": "a100",
+            "mig_profile": None,
+        },
+    ]
+    response = await client.post(
+        "/api/auto_deploy/confirm",
+        json={"specs": specs},
+    )
+    # K8s modules are mocked at import time; 207 Multi-Status is accepted alongside 200.
+    assert response.status_code in (HTTPStatus.OK, HTTPStatus.MULTI_STATUS)
+    data = await response.get_json()
+    assert "deployed" in data
+    assert "message" in data
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_missing_specs() -> None:
+    """POST /api/auto_deploy/confirm with an empty JSON object (no specs key) returns 400."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy/confirm",
+        json={},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST
+
+
+@pytest.mark.asyncio
+async def test_auto_deploy_confirm_empty_specs() -> None:
+    """POST /api/auto_deploy/confirm with an empty specs list is rejected with 400 Bad Request."""
+    client = _get_client()
+    response = await client.post(
+        "/api/auto_deploy/confirm",
+        json={"specs": []},
+    )
+    assert response.status_code == HTTPStatus.BAD_REQUEST

From 4227e1fed42a7f9f5619354c7559ffbf190c1fdb Mon Sep 17 00:00:00 2001
From: Haoran Qiu <jamesqiu@connect.hku.hk>
Date: Fri, 15 May 2026 16:09:11 -0700
Subject: [PATCH 4/9] Fix unused import

Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
---
 tests/streamwise/test_streamwise_auto_deploy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
index 185aff75..3c8e2f08 100644
--- a/tests/streamwise/test_streamwise_auto_deploy.py
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -26,7 +26,7 @@
 mock_modules.update(mock_k8s.get_sub_modules())
 
 # Pre-register the streamwise package so sub-modules resolve correctly.
-from streamwise import http_session_manager  # noqa: F401
+import streamwise.http_session_manager
 
 # Permanently inject K8s mocks into sys.modules (not via context manager)
 # so that simulator modules loaded alongside streamwise remain importable

From 10313cbba08ff55e2c28d7340b591457bcd001e7 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 16:13:53 -0700
Subject: [PATCH 5/9] Fix mypy issues

---
 tests/streamwise/conftest.py | 5 +++--
 wrapper/run_httpserver.py    | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/streamwise/conftest.py b/tests/streamwise/conftest.py
index 66b54d00..9d6ea7e7 100644
--- a/tests/streamwise/conftest.py
+++ b/tests/streamwise/conftest.py
@@ -2,17 +2,18 @@
 Shared fixtures for streamwise tests.
 """
 import sys
+from typing import Generator
 
 import pytest
 
 
 @pytest.fixture(autouse=True)
-def _ensure_simulator_path() -> None:  # type: ignore[misc]
+def _ensure_simulator_path() -> Generator[None, None, None]:
     """Keep simulator/ on sys.path during each test for lazy policy imports."""
     added = False
     if "simulator" not in sys.path:
         sys.path.insert(0, "simulator")
         added = True
-    yield  # type: ignore[misc]
+    yield
     if added and "simulator" in sys.path:
         sys.path.remove("simulator")
diff --git a/wrapper/run_httpserver.py b/wrapper/run_httpserver.py
index a9ec16ad..6ca398fe 100644
--- a/wrapper/run_httpserver.py
+++ b/wrapper/run_httpserver.py
@@ -1266,8 +1266,8 @@ async def send_task(gen_task: dict) -> None:
 
     try:
         payload_bytes = await asyncio.to_thread(pickle.dumps, gen_task)
-        payload_bytes = bytearray(payload_bytes)
-        payload_tensor = torch.frombuffer(payload_bytes, dtype=torch.uint8).to("cuda")
+        payload_buffer = bytearray(payload_bytes)
+        payload_tensor = torch.frombuffer(payload_buffer, dtype=torch.uint8).to("cuda")
         payload_size = torch.tensor([payload_tensor.numel()], dtype=torch.int64, device="cuda")
 
         if payload_size.item() > MAX_PAYLOAD_BYTES:

From 0fbdf6747718f90325c28caf94e25d3e7111cc01 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 16:18:39 -0700
Subject: [PATCH 6/9] Fix directory path

---
 simulator/data_loading.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simulator/data_loading.py b/simulator/data_loading.py
index bea78715..5ecfcbde 100644
--- a/simulator/data_loading.py
+++ b/simulator/data_loading.py
@@ -28,7 +28,7 @@
 from constants import POWER_GPU_IDLE
 from constants import POWER_GPU_TDP
 
-_DEFAULT_DATA_DIR = str(Path(__file__).resolve().parents[2] / "simulator" / "data")
+_DEFAULT_DATA_DIR = str(Path(__file__).resolve().parents[0] / "data")
 
 
 def load_latency_data(

From d9680274ee026ae7135dcbed5c550607d1119e12 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 16:27:48 -0700
Subject: [PATCH 7/9] Fix lint

---
 .flake8                                         | 1 +
 tests/streamwise/test_streamwise_auto_deploy.py | 3 +--
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.flake8 b/.flake8
index 13cb9ba1..b32f0349 100644
--- a/.flake8
+++ b/.flake8
@@ -3,3 +3,4 @@ max-line-length = 120
 # Ignore E402: module-level import not at top of file
 # Ignore W503: line break before binary operator (incompatible with W504)
 ignore = E402,W503
+exclude = .venv
diff --git a/tests/streamwise/test_streamwise_auto_deploy.py b/tests/streamwise/test_streamwise_auto_deploy.py
index 3c8e2f08..a191785a 100644
--- a/tests/streamwise/test_streamwise_auto_deploy.py
+++ b/tests/streamwise/test_streamwise_auto_deploy.py
@@ -25,8 +25,7 @@
 mock_modules = {}
 mock_modules.update(mock_k8s.get_sub_modules())
 
-# Pre-register the streamwise package so sub-modules resolve correctly.
-import streamwise.http_session_manager
+import streamwise.http_session_manager  # noqa: F401 — registers the streamwise package
 
 # Permanently inject K8s mocks into sys.modules (not via context manager)
 # so that simulator modules loaded alongside streamwise remain importable

From 34836066c7fcd4145769a385ae22e099575172fb Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 17:02:34 -0700
Subject: [PATCH 8/9] Simplify imports

---
 simulator/provisioning.py                 | 21 --------------------
 streamwise/allocator_bridge.py            | 10 ++--------
 tests/simulator/conftest.py               | 24 -----------------------
 tests/streamwise/conftest.py              | 19 ------------------
 tests/streamwise/test_allocator_bridge.py |  4 +++-
 5 files changed, 5 insertions(+), 73 deletions(-)
 delete mode 100644 tests/simulator/conftest.py
 delete mode 100644 tests/streamwise/conftest.py

diff --git a/simulator/provisioning.py b/simulator/provisioning.py
index 51e1ab11..dd4f2a89 100644
--- a/simulator/provisioning.py
+++ b/simulator/provisioning.py
@@ -3,27 +3,6 @@
 """
 from __future__ import annotations
 
-import os
-import sys
-
-# Ensure streamwise/ and simulator/ are on sys.path so model_provisioner
-# imports work in child processes spawned by ProcessPoolExecutor.
-_REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))
-_STREAMWISE_DIR = os.path.join(_REPO_ROOT, "streamwise")
-_SIMULATOR_DIR = os.path.dirname(os.path.abspath(__file__))
-for _p in (_REPO_ROOT, _STREAMWISE_DIR, _SIMULATOR_DIR):
-    if _p not in sys.path:
-        sys.path.insert(0, _p)
-
-# Propagate paths to child processes spawned by ProcessPoolExecutor (Windows
-# uses 'spawn' which starts a fresh interpreter that reads PYTHONPATH).
-_EXTRA_PATHS = os.pathsep.join((_REPO_ROOT, _STREAMWISE_DIR, _SIMULATOR_DIR))
-_EXISTING = os.environ.get("PYTHONPATH", "")
-if _SIMULATOR_DIR not in _EXISTING:
-    os.environ["PYTHONPATH"] = (
-        _EXTRA_PATHS + os.pathsep + _EXISTING if _EXISTING else _EXTRA_PATHS
-    )
-
 from tqdm.auto import tqdm
 
 import logging
diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py
index 44dd2512..b1e610d2 100644
--- a/streamwise/allocator_bridge.py
+++ b/streamwise/allocator_bridge.py
@@ -8,14 +8,8 @@
 from __future__ import annotations
 
 import os
-import sys
-
-# Add simulator/ to sys.path so foundation modules are importable.
-_SIMULATOR_DIR = os.path.normpath(
-    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "simulator")
-)
-if _SIMULATOR_DIR not in sys.path:
-    sys.path.insert(0, _SIMULATOR_DIR)
+
+import model_provisioner  # noqa: F401 — adds simulator/ to sys.path
 
 from dataclasses import dataclass
 from typing import Optional
diff --git a/tests/simulator/conftest.py b/tests/simulator/conftest.py
deleted file mode 100644
index d8e52f08..00000000
--- a/tests/simulator/conftest.py
+++ /dev/null
@@ -1,24 +0,0 @@
-"""
-Conftest for simulator tests.
-
-Sets PYTHONPATH so that child processes spawned by ProcessPoolExecutor
-can find the simulator and streamwise modules.
-"""
-import os
-import sys
-
-_REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
-_SIMULATOR_DIR = os.path.join(_REPO_ROOT, "simulator")
-_STREAMWISE_DIR = os.path.join(_REPO_ROOT, "streamwise")
-
-# Propagate paths to child processes via PYTHONPATH.
-_EXTRA = os.pathsep.join((_REPO_ROOT, _SIMULATOR_DIR, _STREAMWISE_DIR))
-_EXISTING = os.environ.get("PYTHONPATH", "")
-if _SIMULATOR_DIR not in _EXISTING:
-    os.environ["PYTHONPATH"] = (
-        _EXTRA + os.pathsep + _EXISTING if _EXISTING else _EXTRA
-    )
-
-for _p in (_REPO_ROOT, _SIMULATOR_DIR, _STREAMWISE_DIR):
-    if _p not in sys.path:
-        sys.path.insert(0, _p)
diff --git a/tests/streamwise/conftest.py b/tests/streamwise/conftest.py
deleted file mode 100644
index 9d6ea7e7..00000000
--- a/tests/streamwise/conftest.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""
-Shared fixtures for streamwise tests.
-"""
-import sys
-from typing import Generator
-
-import pytest
-
-
-@pytest.fixture(autouse=True)
-def _ensure_simulator_path() -> Generator[None, None, None]:
-    """Keep simulator/ on sys.path during each test for lazy policy imports."""
-    added = False
-    if "simulator" not in sys.path:
-        sys.path.insert(0, "simulator")
-        added = True
-    yield
-    if added and "simulator" in sys.path:
-        sys.path.remove("simulator")
diff --git a/tests/streamwise/test_allocator_bridge.py b/tests/streamwise/test_allocator_bridge.py
index bd45f8a6..569e4073 100644
--- a/tests/streamwise/test_allocator_bridge.py
+++ b/tests/streamwise/test_allocator_bridge.py
@@ -15,8 +15,10 @@
 
 import pytest
 
-# Add current path
+# Add current path and simulator/ permanently so lazy imports
+# (e.g. GreedyAllocator via auto_model_allocator) resolve at test time.
 sys.path.append(os.getcwd())
+sys.path[:0] = [os.path.join(os.getcwd(), "simulator")]
 
 from tests.test_utils import temp_sys_path
 

From a0636f8465eaa9adfc46bc77294ae8a237335354 Mon Sep 17 00:00:00 2001
From: Haoran Qiu <haoranqiu@microsoft.com>
Date: Fri, 15 May 2026 17:07:02 -0700
Subject: [PATCH 9/9] Use Path

---
 simulator/data_loading.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/simulator/data_loading.py b/simulator/data_loading.py
index 5ecfcbde..af37e5b8 100644
--- a/simulator/data_loading.py
+++ b/simulator/data_loading.py
@@ -28,17 +28,17 @@
 from constants import POWER_GPU_IDLE
 from constants import POWER_GPU_TDP
 
-_DEFAULT_DATA_DIR = str(Path(__file__).resolve().parents[0] / "data")
+_DEFAULT_DATA_DIR = Path(__file__).resolve().parent / "data"
 
 
 def load_latency_data(
-    data_dir: str = _DEFAULT_DATA_DIR,
+    data_dir: str | Path = _DEFAULT_DATA_DIR,
 ) -> LatencyData:
     """
     Load latency and throughput mapping data from CSV files.
 
     Args:
-        data_dir (str): The directory where the CSV files are stored.
+        data_dir: The directory where the CSV files are stored.
     Returns:
         LatencyData: An object containing all loaded latency data.
     """
@@ -109,13 +109,13 @@ def load_latency_data(
 
 
 def load_power_data(
-    data_dir: str = _DEFAULT_DATA_DIR
+    data_dir: str | Path = _DEFAULT_DATA_DIR
 ) -> PowerData:
     """
     Load power consumption data from CSV files.
 
     Args:
-        data_dir (str): The directory where the CSV files are stored.
+        data_dir: The directory where the CSV files are stored.
     Returns:
         PowerData: An object containing all loaded power consumption data.
     """
@@ -218,7 +218,7 @@ def load_power_data(
 
 
 def load_adaptive_quality_data(
-    data_dir: str,
+    data_dir: str | Path,
     level: QualityLevel,
 ) -> LatencyData:
     """Load latency data for adaptive quality."""