From 42bdf8723e6f4e65338ff24131eb19bae4b14491 Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Fri, 15 May 2026 13:11:05 -0700 Subject: [PATCH 1/9] Remove simulator shim files; import directly from model_provisioner Delete 17 shim files from simulator/ that re-exported from streamwise.model_provisioner. Update simulator/__init__.py to add streamwise/ to sys.path so model_provisioner is importable. Update imports in simulator/provisioning.py, multirequests.py, and plot_utils.py to use model_provisioner.* prefixed imports. Update all 20 test files in tests/simulator/ to: - Pass both 'simulator' and 'streamwise' to temp_sys_path - Use model_provisioner.* prefixed imports for moved modules - Fix patch.dict target in test_models.py - Fix inline import in test_hexgen.py Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- simulator/__init__.py | 15 + simulator/actions.py | 737 ------------ simulator/auto_model_allocator.py | 109 -- simulator/constants.py | 142 --- simulator/data_loading.py | 298 ----- simulator/evaluator.py | 414 ------- simulator/greedy.py | 573 --------- simulator/helix.py | 403 ------- simulator/hexgen.py | 629 ---------- simulator/milp.py | 1070 ----------------- simulator/model_allocator.py | 282 ----- simulator/models.py | 811 ------------- simulator/multirequests.py | 26 +- simulator/naive_baseline.py | 484 -------- simulator/plot_utils.py | 10 +- simulator/policies.py | 252 ---- simulator/provisioning.py | 42 +- simulator/sim_types.py | 796 ------------ simulator/sim_types_json.py | 58 - simulator/utils.py | 297 ----- simulator/workflows.py | 253 ---- tests/simulator/test_auto_model_allocator.py | 36 +- tests/simulator/test_data_loading.py | 10 +- tests/simulator/test_evaluator.py | 34 +- tests/simulator/test_greedy.py | 22 +- tests/simulator/test_helix.py | 20 +- tests/simulator/test_hexgen.py | 16 +- tests/simulator/test_milp.py | 32 +- tests/simulator/test_models.py | 58 +- tests/simulator/test_multirequests_derive.py | 8 +- 
tests/simulator/test_simulator.py | 26 +- tests/simulator/test_simulator_actions.py | 12 +- tests/simulator/test_simulator_baseline.py | 34 +- tests/simulator/test_simulator_energy.py | 26 +- .../simulator/test_simulator_multirequests.py | 12 +- tests/simulator/test_simulator_plotutils.py | 10 +- tests/simulator/test_simulator_policies.py | 8 +- .../simulator/test_simulator_provisioning.py | 18 +- tests/simulator/test_simulator_types.py | 22 +- tests/simulator/test_simulator_utils.py | 26 +- tests/simulator/test_workflows.py | 14 +- 41 files changed, 282 insertions(+), 7863 deletions(-) delete mode 100644 simulator/actions.py delete mode 100644 simulator/auto_model_allocator.py delete mode 100644 simulator/constants.py delete mode 100644 simulator/data_loading.py delete mode 100644 simulator/evaluator.py delete mode 100644 simulator/greedy.py delete mode 100644 simulator/helix.py delete mode 100644 simulator/hexgen.py delete mode 100644 simulator/milp.py delete mode 100644 simulator/model_allocator.py delete mode 100644 simulator/models.py delete mode 100644 simulator/naive_baseline.py delete mode 100644 simulator/policies.py delete mode 100644 simulator/sim_types.py delete mode 100644 simulator/sim_types_json.py delete mode 100644 simulator/utils.py delete mode 100644 simulator/workflows.py diff --git a/simulator/__init__.py b/simulator/__init__.py index e69de29b..24058e01 100644 --- a/simulator/__init__.py +++ b/simulator/__init__.py @@ -0,0 +1,15 @@ +""" +Simulator package. + +The core allocation logic lives in ``streamwise.model_provisioner``. +This package adds provisioning sweeps, multi-request analysis, and plotting +on top of that shared foundation. +""" +import os +import sys + +# Make model_provisioner importable for simulator modules and child processes. 
+_STREAMWISE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise") +_STREAMWISE_DIR = os.path.normpath(_STREAMWISE_DIR) +if _STREAMWISE_DIR not in sys.path: + sys.path.insert(0, _STREAMWISE_DIR) diff --git a/simulator/actions.py b/simulator/actions.py deleted file mode 100644 index debea677..00000000 --- a/simulator/actions.py +++ /dev/null @@ -1,737 +0,0 @@ -""" -Actions for scaling models for the greedy allocator. -""" - -from __future__ import annotations - -import random - -from collections import Counter - -from copy import deepcopy - -from typing import Optional - -from constants import DEVICE_OPTIONS -from constants import SINGLE_INSTANCE_MODELS -from constants import SINGLE_DEVICE_MODELS - -from sim_types import Action -from sim_types import ActionName -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import GPUType -from sim_types import WorkflowConfig -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import Objective -from sim_types import Policy - -from policies import STREAMWISE_POLICY - -from models import get_model_allocation - -from evaluator import evaluate_model_allocation -from evaluator import calc_used_gpus - - -def _is_single_instance( - model_name: Model, - workflow: Optional[WorkflowConfig] = None, -) -> bool: - """Check if a model is single-instance, considering workflow parallelism settings.""" - if model_name not in SINGLE_INSTANCE_MODELS: - return False - if workflow is not None and workflow.is_parallelizable(model_name): - return False - return True - - -def find_next_devices( - device_options: list[int], - num_devices: int, - num_replicas: int, - remaining_devices: int, - max_num_devices: Optional[int] = None, -) -> Optional[int]: - """ - Find the next device combination. - For example, with device options [2, 4, 8, 16, 40], current devices 8, 1 replica, we get 16. 
- """ - if num_replicas == 0: - # means we haven't allocated any replicas yet so start from smallest device option - return device_options[0] if device_options[0] <= remaining_devices else None - - for device_option in device_options: - # if device_option > num_devices and device_option <= remaining_devices + num_devices: - if ( - device_option > num_devices - and (device_option - num_devices) * num_replicas <= remaining_devices - and (max_num_devices is None or device_option <= max_num_devices) - ): - return device_option - return None - - -def choose_action( - actions: list[Action], - objective: Objective, - switch_objective: bool = False, -) -> Optional[Action]: - """Schedule requests.""" - if not actions: - return None - - if objective == Objective.TIME_COST: - # return min(actions, key=lambda a: a.time) - return min( - actions, - key=lambda a: ( - a.time_cost(), - a.time, - ), - ) - if objective == Objective.TIME_COST: - return min( - actions, - key=lambda a: ( - a.time_cost(), - a.time, - ), - ) - if objective == Objective.TTFF_COST: - return min( - actions, - key=lambda a: ( - a.ttff_cost(), - a.ttff, - ), - ) - if objective == Objective.FIFO: - # return min(actions, key=lambda a: a.arrival_time_s) - return min(actions, key=lambda a: a.get_order()) - if objective == Objective.TIME: - return min(actions, key=lambda a: a.time) - if objective == Objective.TTFF: - return min(actions, key=lambda a: a.ttff) - if objective == Objective.COST: - return min(actions, key=lambda a: a.cost) - if objective == Objective.ENERGY: - return min(actions, key=lambda a: a.energy) - if objective == Objective.TIME_ENERGY: - return min(actions, key=lambda a: a.time_energy()) - if objective == Objective.ENERGY_COST: - return min(actions, key=lambda a: a.energy_cost()) - if objective == Objective.RANDOM: - # randomly pick an improvement to simulate naive allocation - return random.choice(actions) - if objective == Objective.TTFF_THEN_TIME: - if switch_objective: - return min(actions, 
key=lambda a: a.time) - else: - return min(actions, key=lambda a: a.ttff) - if objective == Objective.NONE: - return None - raise ValueError(f"Cannot recognize objective {objective}") - - -def apply_action( - action: Action, - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """Apply the chosen action to the models and update remaining devices.""" - - for gpu_type in action.models.keys(): - if gpu_type not in models: - raise ValueError(f"Cannot find gpu type {gpu_type} in {models.keys()}") - for model in action.models[gpu_type].keys(): - if model not in models[gpu_type]: - raise ValueError(f"Cannot find model {model} in {models[gpu_type].keys()}") - allocs_to_remove = [] - for alloc_id in range(len(action.models[gpu_type][model])): - # check if devices and replicas are non-negative - num_devices = action.models[gpu_type][model][alloc_id].devices - if num_devices < 0: - raise ValueError(f"Action devices {num_devices} must be >= 0") - if action.models[gpu_type][model][alloc_id].replicas <= 0: - # remove that instance if replicas is 0 or negative - allocs_to_remove.append(alloc_id) - for alloc_id in reversed(allocs_to_remove): - del action.models[gpu_type][model][alloc_id] - - return action.models - - -def gen_actions( - workflow: WorkflowConfig, - num_gpus: dict[GPUType, int], - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}, - policy: Policy = STREAMWISE_POLICY, - allow_removal: bool = False, - allow_merging: bool = False, - look_ahead_replicas: int = 3, -) -> list[Action]: - actions: list[Action] = [] - - # Extract GPU types from models - gpu_types = list(models.keys()) - assert len(gpu_types) == len(num_gpus), \ - f"Number of GPU types in models {len(gpu_types)} must match num_gpus {len(num_gpus)}" - - remaining_gpus = {} - for gpu_type in num_gpus.keys(): - remaining_gpus[gpu_type] = num_gpus[gpu_type] - 
calc_used_gpus({gpu_type: models[gpu_type]}) - - # Option 1: Provision more by increasing for each model allocation - for model in Model: - if model not in workflow.models: - continue - for gpu_type in gpu_types: - for alloc_id in range(len(models[gpu_type][model])): - actions.extend(_gen_add_device_replica_actions( - models=models, - num_gpus=num_gpus, - remaining_gpus=remaining_gpus[gpu_type], - gpu_type=gpu_type, - model_name=model, - allocation_id=alloc_id, - workflow=workflow, - policy=policy, - latency_data=latency_data, - power_data=power_data, - look_ahead_replicas=look_ahead_replicas, - )) - - # Option 2: Add a model instance of - for model in Model: - if model not in workflow.models: - continue - for gpu_type in gpu_types: - actions.extend(_gen_add_instance( - models=models, - num_gpus=num_gpus, - remaining_gpus=remaining_gpus[gpu_type], - gpu_type=gpu_type, - model_name=model, - workflow=workflow, - policy=policy, - latency_data=latency_data, - power_data=power_data, - look_ahead_replicas=look_ahead_replicas, - )) - - if allow_removal: - # Option 3: Remove replicas for each model allocation - for model in Model: - if model not in workflow.models: - continue - for gpu_type in gpu_types: - model_instances = models[gpu_type][model] - for alloc_id in range(len(model_instances)): - action = _gen_remove_replica_action( - models=models, - num_gpus=num_gpus, - gpu_type=gpu_type, - model_name=model, - allocation_id=alloc_id, - workflow=workflow, - policy=policy, - latency_data=latency_data, - power_data=power_data, - ) - if action: - actions.append(action) - - if allow_merging: - # Option 4: Merge across model allocations - for model in Model: - if model not in workflow.models: - continue - for gpu_type in gpu_types: - actions.extend(_gen_merge_replicas_actions( - models=models, - num_gpus=num_gpus, - gpu_type=gpu_type, - model_name=model, - workflow=workflow, - policy=policy, - latency_data=latency_data, - power_data=power_data, - )) - - return actions - - -def 
_get_min_device_combinations( - num_gpus: int, - model: Model, -) -> list[tuple[int, int]]: - """ - Get the minimum device combinations for a given number of GPUs and model. - [(device_count, num_replicas), ...] - For example, for 64, it would return [(40, 1), (16, 1)]. - """ - remaining = num_gpus - result: list[int] = [] - for size in sorted(DEVICE_OPTIONS[model], reverse=True): - while remaining >= size: - result.append(size) - remaining -= size - if remaining > 0: - raise ValueError(f"Cannot exactly decompose {num_gpus} with DEVICE_OPTIONS") - counts = Counter(result) - return sorted(counts.items(), reverse=True) # Sort by device count descending - - -def _get_large_instance_many_small_combinations( - num_gpus: int, - model: Model, -) -> list[tuple[int, int]]: - """ - Get the largest instance possible and then split the rest into 1 GPU instances. - For example, for 64, it would return [(40, 1), (1, 16)]. - """ - assert num_gpus > 0 - assert model in DEVICE_OPTIONS - assert DEVICE_OPTIONS[model][0] == 1 # must have 1 GPU option to use this function - - remaining_gpus = num_gpus - result: list[tuple[int, int]] = [] - for size in sorted(DEVICE_OPTIONS[model], reverse=True): - if remaining_gpus >= size: - result = [(size, 1)] - remaining_gpus -= size - break - if remaining_gpus > 0: - result.append((1, remaining_gpus)) - return result - - -def _gen_add_device_replica_actions( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - remaining_gpus: int, - gpu_type: GPUType, - model_name: Model, - allocation_id: int, - workflow: WorkflowConfig, - policy: Policy, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - look_ahead_replicas: int = 3, -) -> list[Action]: - """ - Generate actions that explore all valid (replicas, devices) provisioning - options for a given model allocation, using the remaining GPUs. - - From the current replicas * devices, find the next options by distributing the remaining devices. 
- For example, if currently 2 replicas at parallelism 4 with 4 remaining devices, options include: - - 3 replicas, 4 devices (uses 12 total, 4 more than current 8) - - 1 replica, 10 devices (uses 10 total, 2 more than current 8) - - etc. - """ - actions: list[Action] = [] - - if model_name in SINGLE_DEVICE_MODELS and _is_single_instance(model_name, workflow): - return actions # No scaling possible - - alloc = models[gpu_type][model_name][allocation_id] - current_total = alloc.devices * max(alloc.replicas, 0) - current_replicas = alloc.replicas - total_available = current_total + remaining_gpus - - max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name) - max_replicas = alloc.get_max_replicas(workflow) - is_single_instance = _is_single_instance(model_name, workflow) - is_single_device = model_name in SINGLE_DEVICE_MODELS - - seen: set[tuple[int, int]] = set() - seen.add((max(alloc.replicas, 0), alloc.devices)) # skip current config - - for new_devices in DEVICE_OPTIONS[model_name]: - if new_devices > max_num_devices: - continue # Exceeds max parallelism from latency data - if is_single_device and new_devices > 1: - continue # Model only supports single device - if (model_name, new_devices) not in latency_data[gpu_type]: - continue # No latency data for this device count - - # Determine the range of replicas possible with this device count - if is_single_instance: - replica_candidates = [1] - else: - max_r = min(max_replicas, total_available // new_devices) if new_devices > 0 else 0 - # limit max replicas to original replicas + X to avoid too many combinations - max_r = min(max_r, current_replicas + look_ahead_replicas) - replica_candidates = list(range(1, max_r + 1)) - - for new_replicas in replica_candidates: - new_total = new_replicas * new_devices - if new_total <= current_total: - continue # Must be an increase - if new_total > total_available: - continue # Not enough GPUs - if (new_replicas, new_devices) in seen: - continue - 
seen.add((new_replicas, new_devices)) - - try: - new_models = deepcopy(models) - new_models[gpu_type][model_name][allocation_id] = get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=new_devices, - replicas=new_replicas, - ) - action_result = evaluate_model_allocation( - models=new_models, - num_gpus=num_gpus, - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - policy=policy, - include_models=[model_name], - ) - actions.append(Action( - name=ActionName.ADD_DEVICE_REPLICA, - model=model_name, - gpu_type=gpu_type, - models=new_models, - action_result=action_result, - arrival_time_s=alloc.time, - )) - except Exception: - pass # Invalid configuration, skip - - return actions - - -def _gen_add_device_action( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - remaining_gpus: int, - gpu_type: GPUType, - model_name: Model, - allocation_id: int, - workflow: WorkflowConfig, - policy: Policy, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, -) -> Optional[Action]: - """ - Action to add devices (increase parallelism) for a specific model allocation. 
- """ - action: Optional[Action] = None - - if model_name in SINGLE_DEVICE_MODELS: - return action # These models only run on a single GPU, so we don't add more devices - - alloc = models[gpu_type][model_name][allocation_id] - - max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name) - next_num_devices = find_next_devices( - DEVICE_OPTIONS[model_name], - num_devices=alloc.devices, - num_replicas=alloc.replicas, - remaining_devices=remaining_gpus, - max_num_devices=max_num_devices) - - if not next_num_devices: - return action # No valid next device option, skip - if (model_name, next_num_devices) not in latency_data[gpu_type]: - return action # No latency data for this device option, skip - - new_models = deepcopy(models) - new_models[gpu_type][model_name][allocation_id] = get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=next_num_devices, - replicas=max(1, alloc.replicas), - ) - try: - action_result = evaluate_model_allocation( - models=new_models, - num_gpus=num_gpus, - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - policy=policy, - include_models=[model_name], - ) - action = Action( - name=ActionName.ADD_DEVICE, - model=model_name, - gpu_type=gpu_type, - models=new_models, - action_result=action_result, - arrival_time_s=alloc.time, - ) - except Exception: - pass # Invalid action - - return action - - -def _gen_merge_replicas_actions( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - gpu_type: GPUType, - model_name: Model, - num_gpus: dict[GPUType, int], - workflow: WorkflowConfig, - policy: Policy, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, -) -> list[Action]: - actions: list[Action] = [] - - if _is_single_instance(model_name, workflow): - return actions # These models only support a single instance, so no need to merge - - model_instances = models[gpu_type][model_name] - model_num_gpus = 0 - for model_instance in model_instances: - model_num_gpus += 
model_instance.get_num_gpus() - if model_num_gpus <= 1: - return actions # No replicas to merge for this model and GPU type - - for device_combos in [ - _get_min_device_combinations(model_num_gpus, model_name), - _get_large_instance_many_small_combinations(model_num_gpus, model_name) - ]: - new_models = deepcopy(models) - new_models[gpu_type][model_name] = [] - - for new_num_devices, new_num_replicas in device_combos: - new_models[gpu_type][model_name].append(get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=new_num_devices, - replicas=new_num_replicas, - )) - - try: - action_result = evaluate_model_allocation( - models=new_models, - num_gpus=num_gpus, - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - policy=policy, - include_models=[model_name], - ) - - instance_id = 0 - actions.append(Action( - name=ActionName.MERGE, - model=model_name, - gpu_type=gpu_type, - models=new_models, - action_result=action_result, - arrival_time_s=new_models[gpu_type][model_name][instance_id].time, - )) - except Exception: - pass # Invalid action - - return actions - - -def _gen_add_instance( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - remaining_gpus: int, - gpu_type: GPUType, - model_name: Model, - workflow: WorkflowConfig, - policy: Policy, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - look_ahead_replicas: int = 3, -) -> list[Action]: - actions: list[Action] = [] - - if _is_single_instance(model_name, workflow): - return actions # These models only support a single instance, so we don't add more - - for new_num_devices in DEVICE_OPTIONS[model_name]: - for new_num_replicas in list(range(1, look_ahead_replicas + 1)): - new_instance = get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=new_num_devices, - replicas=new_num_replicas, - ) - if new_instance.get_num_gpus() > remaining_gpus: - continue # Not enough remaining GPUs for this new 
instance - - new_models = deepcopy(models) - new_models[gpu_type][model_name].append(new_instance) - - try: - action_result = evaluate_model_allocation( - models=new_models, - num_gpus=num_gpus, - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - policy=policy, - include_models=[model_name], - ) - action = Action( - name=ActionName.ADD_INSTANCE, - model=model_name, - gpu_type=gpu_type, - models=new_models, - action_result=action_result, - arrival_time_s=new_instance.time, - ) - actions.append(action) - except Exception: - pass # Invalid action - - return actions - - -def _gen_remove_replica_action( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - gpu_type: GPUType, - model_name: Model, - allocation_id: int, - workflow: WorkflowConfig, - policy: Policy, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, -) -> Optional[Action]: - action: Optional[Action] = None - - model = models[gpu_type][model_name][allocation_id] - - if model.replicas == 0: - return action # No replicas to remove for this model and GPU type - - new_models = deepcopy(models) - new_models[gpu_type][model_name][allocation_id] = get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=model.devices, - replicas=model.replicas - 1, - ) - - if len(num_gpus) == 2: - # For dual GPU setting, initialize removed replica on the other GPU type to see if it improves performance - gpu_types = list(num_gpus.keys()) - other_gpu_type = gpu_types[0] if gpu_type == gpu_types[1] else gpu_types[1] - if _is_single_instance(model_name, workflow): - if new_models[gpu_type][model_name][allocation_id].replicas == 0: - # If this is a single instance model and we're removing the only replica, add it to the other GPU type - new_models[other_gpu_type][model_name].append(get_model_allocation( - model=model_name, - gpu_type=other_gpu_type, - devices=model.devices, - replicas=1, - )) - - try: - action_result = 
evaluate_model_allocation( - models=new_models, - num_gpus=num_gpus, - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - policy=policy, - include_models=[model_name], - ) - action = Action( - name=ActionName.REMOVE_REPLICA, - model=model_name, - gpu_type=gpu_type, - models=new_models, - action_result=action_result, - arrival_time_s=new_models[gpu_type][model_name][allocation_id].time, - ) - except Exception: - pass # Ignore not possible action - return action - - -def _gen_add_replica_action( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - remaining_gpus: int, - gpu_type: GPUType, - model_name: Model, - allocation_id: int, - workflow: WorkflowConfig, - policy: Policy, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, -) -> Optional[Action]: - """ - Action to add replicas for a specific model allocation. - """ - action: Optional[Action] = None - - if _is_single_instance(model_name, workflow): - return action # These models don't support replication, so we skip - - model = models[gpu_type][model_name][allocation_id] - - if remaining_gpus < model.devices: - return action # Not enough remaining GPUs to add another replica - - max_replicas = model.get_max_replicas(workflow) - if model.replicas >= max_replicas: - return action # Already at max replicas, skip - - new_num_replicas = min( - model.replicas + 1, - max_replicas, # - models[other_gpu_type][Model.HF].replicas - model.replicas + remaining_gpus // model.devices - ) - if new_num_replicas == model.replicas: - return action # No changes, skip - - new_models = deepcopy(models) - new_models[gpu_type][model_name][allocation_id] = get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=model.devices, - replicas=new_num_replicas, - ) - - try: - action_result = evaluate_model_allocation( - models=new_models, - num_gpus=num_gpus, - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - 
policy=policy, - include_models=[model_name], - ) - action = Action( - name=ActionName.ADD_REPLICA, - model=model_name, - gpu_type=gpu_type, - models=new_models, - action_result=action_result, - arrival_time_s=model.time, - ) - except Exception: - pass # Invalid action - - return action - - -def max_time( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - model_name: Model, -) -> float: - values = [] - for models_gpu in models.values(): - if model_name in models_gpu: - for alloc in models_gpu[model_name]: - values.append(alloc.time) - return max(values) diff --git a/simulator/auto_model_allocator.py b/simulator/auto_model_allocator.py deleted file mode 100644 index ea0fda61..00000000 --- a/simulator/auto_model_allocator.py +++ /dev/null @@ -1,109 +0,0 @@ -""" -Factory helpers for selecting the right model allocator implementation. -""" - -from __future__ import annotations - -import logging - -from dataclasses import replace -from typing import Optional - -from sim_types import Policy -from sim_types import WorkflowConfig -from sim_types import LatencyData -from sim_types import Model -from sim_types import PowerData -from sim_types import QualityLevel -from sim_types import Solver -from sim_types import GPUType -from sim_types import Result - -from policies import STREAMWISE_POLICY - -from model_allocator import ModelAllocator - - -class AutoModelAllocator(ModelAllocator): - """Allocator wrapper that routes to a concrete allocator by solver.""" - - policy: Policy - - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = STREAMWISE_POLICY, - ) -> None: - super().__init__( - workflow=workflow, - latency_data=latency_data, - power_data=power_data, - policy=policy, - ) - self._allocator = self._build_allocator() - - def _build_allocator(self) -> ModelAllocator: - """Create concrete allocator based on configured solver.""" - if self.policy.solver == Solver.GREEDY: - from 
greedy import GreedyAllocator - return GreedyAllocator( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - ) - if self.policy.solver == Solver.NAIVE: - from naive_baseline import NaiveAllocator - return NaiveAllocator( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - ) - if self.policy.solver in {Solver.GUROBI, Solver.HIGHS}: - from milp import MILPAllocator - return MILPAllocator( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - ) - if self.policy.solver == Solver.HEXGEN: - from hexgen import HexGenAllocator - return HexGenAllocator( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - ) - if self.policy.solver == Solver.HELIX: - from helix import HelixAllocator - return HelixAllocator( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - ) - raise ValueError(f"Unsupported solver for allocator selection: {self.policy.solver}") - - def allocate( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - ) -> Result: - if self.policy.use_upscaler and self.workflow.target_resolution == QualityLevel.LOW: - logging.warning( - f"Policy {self.policy.name} uses upscaler, but workflow target resolution is LOW. 
" - f"Disabling upscaler for this allocation.") - self.policy = replace(self.policy, use_upscaler=False) - self._allocator.policy = self.policy - # Remove upscaler from model work - self.workflow.model_work.pop(Model.UPSCALER, None) - self._allocator.workflow = self.workflow - - return self._allocator.allocate( - num_gpus=num_gpus, - verbose=verbose, - ) diff --git a/simulator/constants.py b/simulator/constants.py deleted file mode 100644 index bb6f9034..00000000 --- a/simulator/constants.py +++ /dev/null @@ -1,142 +0,0 @@ -from __future__ import annotations - -import math - -from sim_types import WorkflowConfig -from sim_types import GPUType -from sim_types import Model - - -SECONDS_IN_MINUTE = 60.0 -SECONDS_IN_HOUR = 60.0 * 60.0 - -# Video resolution constants (16:10) -NUM_PIXELS_ORIGINAL = 1280 * 800 -NUM_PIXELS_ORIGINAL_FLUX = 1280 * 800 -NUM_PIXELS_ORIGINAL_HF = 512 * 320 -NUM_PIXELS_ORIGINAL_FT = 640 * 400 -NUM_PIXELS_ORIGINAL_UPSCALER = 1280 * 800 - -NUM_PIXELS_MEDIUM = 640 * 400 -NUM_PIXELS_MEDIUM_FLUX = 640 * 400 -NUM_PIXELS_MEDIUM_HF = 256 * 160 -NUM_PIXELS_MEDIUM_FT = 320 * 200 -NUM_PIXELS_MEDIUM_UPSCALER = 640 * 400 - -NUM_PIXELS_LOW = 320 * 200 -NUM_PIXELS_LOW_FLUX = 320 * 200 -NUM_PIXELS_LOW_HF = 128 * 80 -NUM_PIXELS_LOW_FT = 160 * 100 -NUM_PIXELS_LOW_UPSCALER = 320 * 200 - -# StreamCast constants -TOTAL_INPUT_TOKENS = 20 * 1024 # 20K tokens for instructions, PDFs, etc. 
-TOTAL_VIDEO_SECONDS = 10 * 60 # 10 minutes video -TOTAL_SUBSCENES = 172 # each subscene is 3.5 seconds -> limited by fantasytalking 81 frames at 23 FPS -TOTAL_SCENES = 43 # each scene is 4 subscenes -FPS: dict[Model, float] = { - Model.HF: 30, - Model.FT: 23, -} -NUM_STEPS: dict[Model, int] = { - Model.FLUX: 25, - Model.HF: 10, - Model.FT: 10, -} -FRAMES_OPTIONS: dict[Model, list[int]] = { - Model.HF: [36, 72, 108, 144, 324], - Model.FT: [9, 21, 41, 61, 77], -} -FRAMES_PER_STEP_IDX = 4 - -DEFAULT_WORKFLOW_CONFIG = WorkflowConfig( - total_video_seconds=TOTAL_VIDEO_SECONDS, - total_scenes=TOTAL_SCENES, - total_frames={ - Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF]), - Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT]), - }, - total_subscenes=TOTAL_SUBSCENES, - per_subscene_frames={ - Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF] / TOTAL_SUBSCENES), - Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT] / TOTAL_SUBSCENES), - }, - # default per-frame number of denoising steps - num_steps=dict(NUM_STEPS), - # supported number of generation frames - hf_frames=FRAMES_OPTIONS[Model.HF], - ft_frames=FRAMES_OPTIONS[Model.FT], - frames_per_step_idx=FRAMES_PER_STEP_IDX, - total_input_tokens=TOTAL_INPUT_TOKENS, -) - -# Available device counts for scaling -# Tensor parallelism (TP) or sequence parallelism (SP) -DEVICE_OPTIONS = { - Model.GEMMA: [1, 2, 4, 8], - Model.FLUX: [1, 2, 4, 8, 16], - Model.OTHERS: [1], # Single GPU, no parallelism - Model.HF: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40], - Model.HF_VAE: [1], # Single GPU, no parallelism - Model.FT: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40], - Model.FT_VAE: [1], # Single GPU, no parallelism - Model.UPSCALER: [1, 2, 4, 8], # Single GPU, no parallelism -} - -# Models that only have one instance in the system, so not scaling them across GPU types -SINGLE_INSTANCE_MODELS = [ - Model.GEMMA, - Model.FLUX, - Model.OTHERS, -] - -# Models that can only be run on a single GPU -SINGLE_DEVICE_MODELS = [ - 
Model.OTHERS, - Model.HF_VAE, - Model.FT_VAE, -] - - -NUM_GPUS_PER_SERVER = { - GPUType.A100: 8, - GPUType.H100: 8, - GPUType.H200: 8, - GPUType.GB200: 8, # This is technically 4 GPUs per server, but nothing fits -} - - -POWER_GPU_IDLE = { - GPUType.A100: 65.0, # Watts - GPUType.H100: 80.0, # Watts TODO placeholder value - GPUType.H200: 80.0, # Watts TODO placeholder value - GPUType.GB200: 170.0, # Watts -} - - -POWER_GPU_TDP = { - GPUType.A100: 400.0, # Watts - GPUType.H100: 700.0, # Watts - GPUType.H200: 700.0, # Watts - GPUType.GB200: 1200.0, # Watts -} - - -# Cost per GPU -GPU_SPOT_COST = { - # $ / hour (Spot prices) - GPUType.A100: 1.07, # $8.56 for 8 GPUs - GPUType.H100: 4.03, # $32.24 for 8 GPUs - GPUType.H200: 4.22, # $33.76 for 8 GPUs - GPUType.GB200: 10.76 # $43.04 for 4 GPUs -} - -GPU_RESERVED_COST = { - # $ / hour (Reserved prices) - GPUType.A100: 3.4, # $27.2 for 8 GPUs - GPUType.H100: 5.39, # $43.12 for 8 GPUs - GPUType.H200: 5.64, # $45.12 for 8 GPUs - GPUType.GB200: 14.42 # $57.68 for 4 GPUs -} - -GPU_COST = GPU_SPOT_COST diff --git a/simulator/data_loading.py b/simulator/data_loading.py deleted file mode 100644 index 6ee59ec5..00000000 --- a/simulator/data_loading.py +++ /dev/null @@ -1,298 +0,0 @@ -""" -Module for loading latency and power consumption data from CSV files. 
-""" - -import pandas as pd - -from pathlib import Path - -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import GPUType -from sim_types import LatencyGPUTypeData -from sim_types import PowerGPUTypeData -from sim_types import QualityLevel - -from constants import NUM_PIXELS_ORIGINAL_UPSCALER -from constants import NUM_PIXELS_ORIGINAL_FT -from constants import NUM_PIXELS_ORIGINAL_HF -from constants import NUM_PIXELS_ORIGINAL_FLUX -from constants import NUM_PIXELS_LOW_FT -from constants import NUM_PIXELS_LOW_HF -from constants import NUM_PIXELS_LOW_FLUX -from constants import NUM_PIXELS_LOW_UPSCALER -from constants import NUM_PIXELS_MEDIUM_FT -from constants import NUM_PIXELS_MEDIUM_HF -from constants import NUM_PIXELS_MEDIUM_UPSCALER -from constants import NUM_PIXELS_MEDIUM_FLUX -from constants import POWER_GPU_IDLE -from constants import POWER_GPU_TDP - - -def load_latency_data( - data_dir: str = "data/", -) -> LatencyData: - """ - Load latency and throughput mapping data from CSV files. - - Args: - data_dir (str): The directory where the CSV files are stored. - Returns: - LatencyData: An object containing all loaded latency data. 
- """ - data_path = Path(data_dir) - - data = LatencyData(gpus={}) - for gpu_type in GPUType: - data.gpus[gpu_type] = LatencyGPUTypeData(gpu_type=gpu_type) - - # Flux time -> per image generation - csv_flux_path = data_path / f"latency_flux_mapping_{gpu_type.value.lower()}.csv" - df_flux = pd.read_csv(csv_flux_path, comment='#') - data[gpu_type].flux = dict(zip( - df_flux["world_size"], - df_flux["avg_steps_time"])) - - # Hunyuan Framepack per step time -> [36, 72, 108, 144, 324] frames generation - csv_hf_path = data_path / f"latency_hf_mapping_{gpu_type.value.lower()}.csv" - df_hf = pd.read_csv(csv_hf_path, comment='#') - data[gpu_type].hf = dict(zip( - df_hf["world_size"], - df_hf["avg_steps_time"])) - - # Hunyuan Framepack VAE time -> per inference iteration - # Derived: steps * avg_step_time * vae_pct(vae_time / total_time) - data[gpu_type].hf_vae = dict(zip( - df_hf["world_size"], - df_hf["vae_time"])) - - # Fantasy Talking per step time -> [9, 21, 41, 61, 77] frames generation - csv_ft_path = data_path / f"latency_ft_mapping_{gpu_type.value.lower()}.csv" - df_ft = pd.read_csv(csv_ft_path, comment='#') - data[gpu_type].ft = dict(zip( - df_ft["world_size"], - df_ft["avg_steps_time"])) - - # Fantasy Talking VAE time -> per inference iteration - # Derived: steps * avg_step_time * vae_pct(vae_time / total_time) - data[gpu_type].ft_vae = dict(zip( - df_ft["world_size"], - df_ft["vae_time"])) - - # Upscaler time -> per image frame - csv_upscaler_path = data_path / f"latency_upscaler_{gpu_type.value.lower()}.csv" - df_upscaler = pd.read_csv(csv_upscaler_path, comment='#') - data[gpu_type].upscaler = dict(zip( - df_upscaler['world_size'], - df_upscaler['avg_steps_time'])) - - # Gemma time -> first scene and per scene - csv_gemma_path = data_path / f"latency_gemma_{gpu_type.value.lower()}.csv" - df_gemma = pd.read_csv(csv_gemma_path, comment='#') - data[gpu_type].gemma_first_scene = dict(zip( - df_gemma['tp'], - df_gemma['first_scene_time'])) - 
data[gpu_type].gemma_per_scene = dict(zip( - df_gemma['tp'], - df_gemma['per_scene_time'])) - - # Others time -> kokoro and other overheads -> time per scene - csv_others_path = data_path / f"latency_others_{gpu_type.value.lower()}.csv" - df_others = pd.read_csv(csv_others_path, comment='#') - data[gpu_type].others = dict(zip( - df_others['world_size'], - df_others['time'])) - - return data - - -def load_power_data( - data_dir: str = "data/" -) -> PowerData: - """ - Load power consumption data from CSV files. - - Args: - data_dir (str): The directory where the CSV files are stored. - Returns: - PowerData: An object containing all loaded power consumption data. - """ - data_path = Path(data_dir) - - data = PowerData(gpus={}) - for gpu_type in GPUType: - data.gpus[gpu_type] = PowerGPUTypeData(gpu_type=gpu_type) - - # Flux power profile - power_flux_file_name = data_path / f'power_flux_mapping_{gpu_type.value.lower()}.csv' - power_flux_df = pd.read_csv(power_flux_file_name, comment='#') - data[gpu_type].flux = dict(zip( - power_flux_df['world_size'], - power_flux_df['power_watts'])) - - # Hunyuan Framepack 640x400 power profile - power_hf_file_name = data_path / f'power_hf_mapping_{gpu_type.value.lower()}.csv' - power_hf_df = pd.read_csv(power_hf_file_name, comment='#') - data[gpu_type].hf = dict(zip( - power_hf_df['world_size'], - power_hf_df['power_watts'])) - - # Hunyuan Framepack 1280x800 power profile - power_hf_file_name_high = data_path / f'power_hf_mapping_{gpu_type.value.lower()}_high.csv' - power_hf_high_df = pd.read_csv(power_hf_file_name_high, comment='#') - data[gpu_type].hf_high = dict(zip( - power_hf_high_df['world_size'], - power_hf_high_df['power_watts'])) - - # Hunyuan Framepack VAE power profile - power_hf_vae_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}.csv' - power_hf_vae_df = pd.read_csv(power_hf_vae_file_name, comment='#') - data[gpu_type].hf_vae = dict(zip( - power_hf_vae_df['world_size'], - power_hf_vae_df['power_watts'])) - 
- # Hunyuan Framepack VAE high power profile - power_hf_vae_high_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}_high.csv' - power_hf_vae_high_df = pd.read_csv(power_hf_vae_high_file_name, comment='#') - data[gpu_type].hf_vae_high = dict(zip( - power_hf_vae_high_df['world_size'], - power_hf_vae_high_df['power_watts'])) - - # Fantasy Talking 640x400 power profile - power_ft_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}.csv' - power_ft_df = pd.read_csv(power_ft_file_name, comment='#') - data[gpu_type].ft = dict(zip( - power_ft_df['world_size'], - power_ft_df['power_watts'])) - - # Fantasy Talking 1280x800 power profile - power_ft_high_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}_high.csv' - power_ft_high_df = pd.read_csv(power_ft_high_file_name, comment='#') - data[gpu_type].ft_high = dict(zip( - power_ft_high_df['world_size'], - power_ft_high_df['power_watts'])) - - # Fantasy Talking VAE mapping - power_ft_vae_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}.csv' - power_ft_vae_df = pd.read_csv(power_ft_vae_file_name, comment='#') - data[gpu_type].ft_vae = dict(zip( - power_ft_vae_df['world_size'], - power_ft_vae_df['power_watts'])) - - # Fantasy Talking VAE high mapping - power_ft_vae_high_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}_high.csv' - power_ft_vae_high_df = pd.read_csv(power_ft_vae_high_file_name, comment='#') - data[gpu_type].ft_vae_high = dict(zip( - power_ft_vae_high_df['world_size'], - power_ft_vae_high_df['power_watts'])) - - # Upscaler power profile - power_upscaler_file_name = data_path / f'power_upscaler_{gpu_type.value.lower()}.csv' - power_upscaler_df = pd.read_csv(power_upscaler_file_name, comment='#') - data[gpu_type].upscaler = dict(zip( - power_upscaler_df['world_size'], - power_upscaler_df['power_watts'])) - - # Gemma power profile - power_gemma_first_scene_file_name = data_path / 
f'power_gemma_first_scene_{gpu_type.value.lower()}.csv' - power_gemma_per_scene_file_name = data_path / f'power_gemma_per_scene_{gpu_type.value.lower()}.csv' - power_gemma_first_scene_df = pd.read_csv(power_gemma_first_scene_file_name, comment='#') - power_gemma_per_scene_df = pd.read_csv(power_gemma_per_scene_file_name, comment='#') - data[gpu_type].gemma_first_scene = dict(zip( - power_gemma_first_scene_df['world_size'], - power_gemma_first_scene_df['power_watts'] - )) - data[gpu_type].gemma_per_scene = dict(zip( - power_gemma_per_scene_df['world_size'], - power_gemma_per_scene_df['power_watts'] - )) - - # Idle and TDP power profiles - for gpu_type in GPUType: - data[gpu_type].idle = POWER_GPU_IDLE[gpu_type] - data[gpu_type].tdp = POWER_GPU_TDP[gpu_type] - - return data - - -def load_adaptive_quality_data( - data_dir: str, - level: QualityLevel, -) -> LatencyData: - """Load latency data for adaptive quality.""" - assert isinstance(level, QualityLevel) - - latency_data = load_latency_data(data_dir=data_dir) - - if level == QualityLevel.ORIGINAL or level == QualityLevel.HIGH: - return latency_data - - if level == QualityLevel.MEDIUM: - ratio_flux = NUM_PIXELS_MEDIUM_FLUX / NUM_PIXELS_ORIGINAL_FLUX - ratio_hf = NUM_PIXELS_MEDIUM_HF / NUM_PIXELS_ORIGINAL_HF - ratio_hf_vae = NUM_PIXELS_MEDIUM_HF / NUM_PIXELS_ORIGINAL_HF - ratio_ft = NUM_PIXELS_MEDIUM_FT / NUM_PIXELS_ORIGINAL_FT - ratio_ft_vae = NUM_PIXELS_MEDIUM_FT / NUM_PIXELS_ORIGINAL_FT - ratio_upscaler = NUM_PIXELS_MEDIUM_UPSCALER / NUM_PIXELS_ORIGINAL_UPSCALER - for gpu_type in GPUType: - latency_data[gpu_type].flux = { - k: v * ratio_flux - for k, v in latency_data[gpu_type].flux.items() - } - latency_data[gpu_type].hf = { - k: v * ratio_hf - for k, v in latency_data[gpu_type].hf.items() - } - latency_data[gpu_type].hf_vae = { - k: v * ratio_hf_vae - for k, v in latency_data[gpu_type].hf_vae.items() - } - latency_data[gpu_type].ft = { - k: v * ratio_ft - for k, v in latency_data[gpu_type].ft.items() - } - 
latency_data[gpu_type].ft_vae = { - k: v * ratio_ft_vae - for k, v in latency_data[gpu_type].ft_vae.items() - } - latency_data[gpu_type].upscaler = { - k: v * ratio_upscaler - for k, v in latency_data[gpu_type].upscaler.items() - } - return latency_data - - if level == QualityLevel.LOW: - ratio_flux = NUM_PIXELS_LOW_FLUX / NUM_PIXELS_ORIGINAL_FLUX - ratio_hf = NUM_PIXELS_LOW_HF / NUM_PIXELS_ORIGINAL_HF - ratio_hf_vae = NUM_PIXELS_LOW_HF / NUM_PIXELS_ORIGINAL_HF - ratio_ft = NUM_PIXELS_LOW_FT / NUM_PIXELS_ORIGINAL_FT - ratio_ft_vae = NUM_PIXELS_LOW_FT / NUM_PIXELS_ORIGINAL_FT - ratio_upscaler = NUM_PIXELS_LOW_UPSCALER / NUM_PIXELS_ORIGINAL_UPSCALER - for gpu_type in GPUType: - latency_data[gpu_type].flux = { - k: v * ratio_flux - for k, v in latency_data[gpu_type].flux.items() - } - latency_data[gpu_type].hf = { - k: v * ratio_hf - for k, v in latency_data[gpu_type].hf.items() - } - latency_data[gpu_type].hf_vae = { - k: v * ratio_hf_vae - for k, v in latency_data[gpu_type].hf_vae.items() - } - latency_data[gpu_type].ft = { - k: v * ratio_ft - for k, v in latency_data[gpu_type].ft.items() - } - latency_data[gpu_type].ft_vae = { - k: v * ratio_ft_vae - for k, v in latency_data[gpu_type].ft_vae.items() - } - latency_data[gpu_type].upscaler = { - k: v * ratio_upscaler - for k, v in latency_data[gpu_type].upscaler.items() - } - return latency_data - - return latency_data diff --git a/simulator/evaluator.py b/simulator/evaluator.py deleted file mode 100644 index a9730bb2..00000000 --- a/simulator/evaluator.py +++ /dev/null @@ -1,414 +0,0 @@ -""" -Evaluate the performance of a given model allocation in terms of time, energy, and cost. -It includes some assertions (e.g., only one instance of Gemma and Flux). 
-""" -from __future__ import annotations - -import math -import logging - -from typing import Optional - -from constants import NUM_GPUS_PER_SERVER -from constants import TOTAL_INPUT_TOKENS -from constants import SECONDS_IN_HOUR - -from sim_types import Result -from sim_types import GPUType -from sim_types import WorkflowConfig -from sim_types import PowerData -from sim_types import LatencyData -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import Policy - -from sim_types_json import models_to_json -from sim_types_json import workflow_to_json -from sim_types_json import policy_to_json - - -def _count_instances( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - model: Model, -) -> int: - num_instances = 0 - for model_gpus in models.values(): - if model in model_gpus: - for model_allocation in model_gpus[model]: - if model_allocation.get_num_gpus() > 0: - num_instances += 1 - return num_instances - - -def _assert_single_instance( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - model: Model, -) -> None: - num_instances = _count_instances(models, model) - assert num_instances == 1, f"Expected exactly one instance of {model}, but found {num_instances}" - - -def _assert_at_least_one_instance( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - model: Model, -) -> None: - num_instances = _count_instances(models, model) - assert num_instances > 0, f"Expected at least one instance of {model}, but found {num_instances}" - - -def _assert_no_instances( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - model: Model, -) -> None: - num_instances = _count_instances(models, model) - assert num_instances == 0, f"Expected no instances of {model}, but found {num_instances}" - - -def evaluate_times( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - latency_data: LatencyData, - workflow: WorkflowConfig, - policy: Policy, - include_models: Optional[list[Model]] = None, -) -> None: - """ 
- Compute the total time for the given model allocation and workflow, using the latency data. - It only evaluates the models specified in "include_models" if provided. - """ - gpu_types = list(models.keys()) - - upscaler_gpus = sum( - model_alloc.get_num_gpus() - for gpu_type in gpu_types - for model_alloc in models.get(gpu_type, {}).get(Model.UPSCALER, []) - ) - if not policy.use_upscaler: - assert upscaler_gpus == 0 - - for model_name in workflow.models: - if include_models is not None and model_name not in include_models: - continue - - # Special conditions: models that require a policy flag - if model_name == Model.HF_VAE and not policy.is_disaggregated(Model.HF): - _assert_no_instances(models, Model.HF_VAE) - continue - if model_name == Model.FT_VAE and not policy.is_disaggregated(Model.FT): - _assert_no_instances(models, Model.FT_VAE) - continue - if model_name == Model.UPSCALER and not policy.use_upscaler: - _assert_no_instances(models, Model.UPSCALER) - continue - - _assert_at_least_one_instance(models, model_name) - - if not workflow.is_parallelizable(model_name): - # Single-instance: no work splitting - for gpu_type in gpu_types: - if model_name in models[gpu_type]: - for model_alloc in models[gpu_type][model_name]: - model_alloc.calculate_time( - policy, workflow, latency_data) - model_alloc.calculate_time_first( - policy, workflow, latency_data) - continue - - # Parallel: capacity-based work splitting (throughput-weighted) - capacities: dict[GPUType, list[float]] = {} - for gpu_type in gpu_types: - capacities[gpu_type] = [] - if model_name not in models[gpu_type]: - continue - for model_alloc in models[gpu_type][model_name]: - if model_alloc.get_num_gpus() > 0: - latency = latency_data[gpu_type][model_name, model_alloc.devices] - # When not disaggregated, include VAE overhead in capacity - if model_name == Model.FT and not policy.is_disaggregated(Model.FT): - latency += latency_data[gpu_type][Model.FT_VAE, 1] / workflow.num_steps[Model.FT] - if 
model_name == Model.HF and not policy.is_disaggregated(Model.HF): - latency += latency_data[gpu_type][Model.HF_VAE, 1] / workflow.num_steps[Model.HF] - if model_name in (Model.HF, Model.HF_VAE, Model.FT, Model.FT_VAE): - latency *= workflow.get_resolution_scale(policy.use_upscaler) - if model_name == Model.GEMMA: - latency *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS - if latency == 0: - capacities[gpu_type].append(0.0) - else: - capacities[gpu_type].append(model_alloc.replicas / latency) - - total_capacity = sum(sum(c) for c in capacities.values()) - for gpu_type in gpu_types: - if model_name not in models[gpu_type]: - continue - cap_idx = 0 - for model_alloc in models[gpu_type][model_name]: - if model_alloc.get_num_gpus() > 0: - work_pct = capacities[gpu_type][cap_idx] / total_capacity if total_capacity > 0 else 0.0 - model_alloc.calculate_time( - policy, workflow, latency_data, - work_pct=work_pct) - model_alloc.calculate_time_first( - policy, workflow, latency_data) - cap_idx += 1 - - -def evaluate_energy( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - power_data: PowerData, - workflow: WorkflowConfig, - total_time_s: float = 0.0, -) -> None: - """ - Calculate total energy (power * time * replicas for each model). - Need to run after evaluate_times since energy calculation depends on time. - """ - for gpu_type_allocs in models.values(): - for model_allocation_list in gpu_type_allocs.values(): - for model_allocation in model_allocation_list: - model_allocation.calculate_energy( - workflow, - power_data, - total_time_s) - - -def evaluate_cost( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - total_time_s: float, - policy: Policy, -) -> None: - """ - Calculate total cost based on GPU hours used. - Need to run after evaluate_times since cost calculation depends on time. 
- """ - for gpu_type_allocs in models.values(): - for model_allocation_list in gpu_type_allocs.values(): - for model in model_allocation_list: - model.calculate_cost(policy, total_time_s) - - -_EVALUATOR_CACHE: dict[str, Result] = {} - - -def evaluate_model_allocation( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData], - policy: Policy, - include_models: Optional[list[Model]] = None, - cache_results: bool = False, - round_up_cost_to_server: bool = False, -) -> Result: - """ - Evaluate the metrics for a given allocation of models to GPUs. - It only evaluates the models in "include_models" if specified. - """ - cache_key = None - if cache_results: - cache_key = models_to_json(models) + \ - workflow_to_json(workflow) + \ - str(latency_data) + \ - str(power_data) + \ - policy_to_json(policy) + \ - str(include_models) - if cache_key in _EVALUATOR_CACHE: - return _EVALUATOR_CACHE[cache_key] - - # Check if setup is possible - gpus_used = {} - for gpu_type, model_gpu in models.items(): - gpus_used[gpu_type] = calc_used_gpus({gpu_type: model_gpu}) - assert num_gpus[gpu_type] % NUM_GPUS_PER_SERVER[gpu_type] == 0, \ - f"{gpu_type.value}: {num_gpus[gpu_type]} % {NUM_GPUS_PER_SERVER[gpu_type]}" - assert gpus_used[gpu_type] <= num_gpus[gpu_type], \ - f"{gpu_type.value}: {gpus_used[gpu_type]} > {num_gpus[gpu_type]}" - - # Assert input models are built correctly - for gpu_type in models.keys(): - for model_name in models[gpu_type].keys(): - for instance_id in range(len(models[gpu_type][model_name])): - assert models[gpu_type][model_name][instance_id].model == model_name - assert models[gpu_type][model_name][instance_id].gpu_type == gpu_type - - # Actual evaluation - evaluate_times( - models, latency_data, workflow, policy, - include_models=include_models, - ) - time_s = calc_total_time(models) - - first_chunk_time = calc_ttff(models) - ttff_s = max( 
- first_chunk_time, - time_s - workflow.total_video_seconds - ) - - num_frames = (workflow.total_frames[Model.FT] - workflow.per_subscene_frames[Model.FT]) - tbf_s = (time_s - first_chunk_time) / num_frames - if tbf_s < 0: - logging.debug( - f"Negative TBF: " - F"{tbf_s:.2f} = ({time_s:.2f} - {first_chunk_time:.2f}) / {num_frames}") - tbf_s = 0.0 - - # Calculate total energy (power * time * replicas for each model) - energy = 0.0 - if power_data is not None: - evaluate_energy(models, power_data, workflow, time_s) - energy = calc_energy(models=models) - - evaluate_cost(models, time_s, policy) - cost = calc_cost( - models, time_s, policy, - round_up_to_server=round_up_cost_to_server) - - ret = Result( - models=models, - gpus_used=gpus_used, - gpus_total=num_gpus, - total_time_s=time_s, - first_chunk_time=first_chunk_time, - ttff_s=ttff_s, - tbf_s=tbf_s, - total_energy=energy if power_data else 0.0, - cost=cost, - ) - - if cache_key is not None: - _EVALUATOR_CACHE[cache_key] = ret - - return ret - - -def calc_energy( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> float: - """ - Calculate total energy (power * time * replicas for each model). - Energy in Watt x seconds (Joules). - This assumes that evaluate_energy() has been called already. - """ - energy = 0.0 # Total energy in Watt-seconds (Joules = Watt x second) - for model_dict in models.values(): - for model_allocations in model_dict.values(): - for model_allocation in model_allocations: - energy += model_allocation.energy - return energy - - -def calc_model_cost( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> float: - """ - Calculate total cost based on GPU hours used. - This assumes that evaluate_cost() has been called already. 
- """ - costs = {} - for gpu_type, model_dict in models.items(): - costs[gpu_type] = 0.0 - for model_allocations in model_dict.values(): - for model_allocation in model_allocations: - costs[gpu_type] += model_allocation.cost - return sum(costs.values()) - - -def calc_cost( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - time_s: float, - policy: Policy, - round_up_to_server: bool = True, -) -> float: - """ - Calculate total cost based on GPU hours used. - """ - used_gpus = calc_used_gpus_per_type(models) - - # Round up to the nearest server (pack of GPUs) since we pay for whole servers - if round_up_to_server: - for gpu_type, used in used_gpus.items(): - used_pack = math.ceil(used / NUM_GPUS_PER_SERVER[gpu_type]) * NUM_GPUS_PER_SERVER[gpu_type] - used_gpus[gpu_type] = used_pack - - return calc_cost_total(used_gpus, time_s, policy) - - -def calc_cost_total( - num_gpus: dict[GPUType, int], - time_s: float, - policy: Policy, -) -> float: - """ - Calculate total cost based on GPU hours used. - It includes the idle GPUs not assigned to a model. - """ - cost = 0.0 - for gpu_type, num in num_gpus.items(): - cost += num * (time_s / SECONDS_IN_HOUR) * policy.gpu_cost[gpu_type] - return cost - - -def calc_used_gpus_per_type( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> dict[GPUType, int]: - """ - Calculate number of GPUs used per GPU type across all models. - """ - gpus_used = {} - for gpu_type, model_gpu in models.items(): - gpus_used[gpu_type] = 0 - for model_allocations in model_gpu.values(): - for model_allocation in model_allocations: - gpus_used[gpu_type] += model_allocation.get_num_gpus() - return gpus_used - - -def calc_used_gpus( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> int: - """ - Calculate total number of GPUs used across all models and GPU types. 
- """ - gpus_used = calc_used_gpus_per_type(models) - return sum(gpus_used.values()) - - -def calc_total_time( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> float: - """ - Calculate total time considering all stages and dependencies. - This assumes that evaluate_time() has been called already. - """ - total_time_secs = 0.0 - for model_name in Model: - model_alloc_times = [ - model_alloc.time - for gpu_type in GPUType - if gpu_type in models and model_name in models[gpu_type] - for model_alloc in models[gpu_type][model_name] - ] - model_time = max(model_alloc_times) if model_alloc_times else 0.0 - total_time_secs += model_time - return total_time_secs - - -def calc_ttff( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> float: - """ - Calculate time to first frame (chunk). - It takes the time to first frame (TTFF) for each model. - This assumes that evaluate_time() has been called already. - """ - models_time_first: dict[Model, float] = {} - for model_name in Model: - times_first = [] - for gpu_type in models.keys(): - if model_name in models[gpu_type]: - for model_alloc in models[gpu_type][model_name]: - if model_alloc.get_num_gpus() > 0: - times_first.append(model_alloc.time_first) - if len(times_first) > 0: - models_time_first[model_name] = min(times_first) # The fastest model determines TTFF - return sum(models_time_first.values()) diff --git a/simulator/greedy.py b/simulator/greedy.py deleted file mode 100644 index 459742e5..00000000 --- a/simulator/greedy.py +++ /dev/null @@ -1,573 +0,0 @@ -""" -Greedy algorithm for the StreamWise workflow allocation problem. 
-""" - -from __future__ import annotations - -import logging - -from tabulate import tabulate - -from typing import Optional - -from operator import itemgetter - -from constants import NUM_GPUS_PER_SERVER -from constants import SECONDS_IN_MINUTE -from constants import SECONDS_IN_HOUR - -from sim_types import Result -from sim_types import GPUType -from sim_types import WorkflowConfig -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import Policy -from sim_types import Solver - -from utils import simplify_model_allocations - -from evaluator import calc_used_gpus -from evaluator import evaluate_model_allocation - -from model_allocator import ModelAllocator - -from policies import STREAMWISE_POLICY -from policies import MAX_ITERATIONS -from policies import USE_ALL_GPUS - -from actions import gen_actions -from actions import choose_action -from actions import apply_action - - -class GreedyAllocator(ModelAllocator): - """ - Greedy allocator that iteratively applies the best action. 
- """ - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = STREAMWISE_POLICY, - ) -> None: - super().__init__( - workflow, - latency_data, - power_data, - policy, - ) - assert self.policy.solver in {Solver.GREEDY, Solver.HEXGEN} - - def allocate( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - # Greedy policy parameters - allow_removal: bool = False, - allow_merging: bool = False, - look_ahead_replicas: int = 3, - ) -> Result: - total_gpus = sum(num_gpus.values()) - assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})" - - gpu_types = [ - gpu_type - for gpu_type, count in num_gpus.items() - if count > 0 - ] - assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})" - gpu_type1 = gpu_types[0] - - if len(gpu_types) == 1 and num_gpus[gpu_type1] == 8: - # 8 x GPUs - return self._pick_from_single_server( - gpu_type=gpu_type1, - verbose=verbose, - ) - - if len(gpu_types) == 1: - # More than 8 x GPUs - return self._pick_from_single_device_mapping( - num_gpus.get(gpu_type1, 0), - gpu_type=gpu_type1, - verbose=verbose, - allow_removal=allow_removal, - allow_merging=allow_merging, - look_ahead_replicas=look_ahead_replicas, - ) - - # Mixed setup of GPU types (e.g., A100 and H100) - return self._pick_from_both_devices_mapping( - num_gpus, - verbose=verbose, - allow_removal=allow_removal, - allow_merging=allow_merging, - look_ahead_replicas=look_ahead_replicas, - ) - - def _pick_from_both_devices_mapping( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - allow_removal: bool = False, - allow_merging: bool = False, - look_ahead_replicas: int = 3, - ) -> Result: - """ - Calculate based on two GPU types. 
- """ - gpu_types = list(num_gpus.keys()) - assert len(gpu_types) == 2 - assert len(num_gpus) == 2 - gpu_type1 = gpu_types[0] - gpu_type2 = gpu_types[1] - assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1] - assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2] - - # Initialize allocations with minimal setup - models = self._init_both_devices_models(gpu_type1, gpu_type2) - - remaining_gpus = {} - for gpu_type in num_gpus.keys(): - remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]}) - - # Optimization loop - if verbose: - evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - self._print_iteration(0, models, num_gpus) - - it = 1 - prev_metric = None - switch_objective = False - while sum(remaining_gpus.values()) > 0: - # Calculate current iteration times - evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=False, - ) - - # Calculate potential actions for each optimization option - actions = gen_actions( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - num_gpus=num_gpus, - models=models, - policy=self.policy, - allow_removal=allow_removal, - allow_merging=allow_merging, - look_ahead_replicas=look_ahead_replicas, - ) - - if not actions: - logging.debug(f"No more actions possible after {it} iterations for {self.policy}.") - break - - best_action = choose_action(actions, self.policy.objective, switch_objective=switch_objective) - - if not best_action: - logging.debug("No actions selected.") - break - - new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective) - - if self.policy.objective.is_monotonic() and prev_metric is not None 
and new_metric >= prev_metric: - msg = f"No improvement after {it} iterations for {self.policy}." - msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}." - if verbose: - print(msg) - logging.debug(msg) - if not USE_ALL_GPUS: - logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.") - break - switch_objective = True - - prev_metric = new_metric - - models = apply_action(best_action, models=models) - - models = simplify_model_allocations(models) - - remaining_gpus.clear() - for gpu_type in num_gpus.keys(): - remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]}) - - if verbose: - self._print_iteration(it, models, num_gpus) - print(f"{len(actions)} actions:") - for action in actions: - if action == best_action: - print(f"* {action} (best)") - else: - print(f" {action}") - print(f"Metric: {new_metric:.2f}") - print("Remaining devices:") - for gpu_type in remaining_gpus.keys(): - print(f" {remaining_gpus[gpu_type]} x {gpu_type.value}") - - it += 1 - if it > MAX_ITERATIONS: - logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). 
Stopping optimization loop.") - break - - # Adjust for no disaggregation - if not self.policy.is_disaggregated(Model.HF): - for models_gpu in models.values(): - for instance_id in range(len(models_gpu[Model.HF_VAE])): - assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \ - "HF_VAE must have 0 GPUs when HF disaggregation is disabled" - if not self.policy.is_disaggregated(Model.FT): - for models_gpu in models.values(): - for instance_id in range(len(models_gpu[Model.FT_VAE])): - assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \ - "FT_VAE must have 0 GPUs when FT disaggregation is disabled" - - # Final calculations - result = evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - - if verbose: - self._print_final_allocation( - models=models, - used_devices=result.gpus_used, - total_devices={ - gpu_type1: num_gpus.get(gpu_type1, 0), - gpu_type2: num_gpus.get(gpu_type2, 0), - }, - power_data=self.power_data, - total_time_s=result.total_time_s, - ttff_s=result.ttff_s, - first_chunk_time=result.first_chunk_time, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \ - f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}" - assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \ - f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}" - - return Result( - total_time_s=result.total_time_s, - models=models, - gpus_used=result.gpus_used, - ttff_s=result.ttff_s, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - def _pick_from_single_server( - self, - gpu_type: GPUType, - verbose: bool = False, - ) -> Result: - """ - The minimal 
setup with a servers with a single server (8 GPUs or 4 for GB200). - No parallelism across scenes/subscenes. - """ - - # Number of devices - num_gpus = NUM_GPUS_PER_SERVER[gpu_type] - models = self._init_single_server_models(gpu_type) - - result = evaluate_model_allocation( - models=models, - num_gpus={gpu_type: num_gpus}, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - - if verbose: - model_device = models[gpu_type] - print_data = [ - [Model.GEMMA.value, round(model_device[Model.GEMMA][0].time, 2)], - [Model.FLUX.value, round(model_device[Model.FLUX][0].time, 2)], - [Model.HF.value, round(model_device[Model.HF][0].time, 2)], - [Model.HF_VAE.value, round(model_device[Model.HF_VAE][0].time, 2)], - [Model.FT.value, round(model_device[Model.FT][0].time, 2)], - [Model.FT_VAE.value, round(model_device[Model.FT_VAE][0].time, 2)], - ] - if self.policy.use_upscaler: - print_data.append([Model.UPSCALER.value, round(model_device[Model.UPSCALER][0].time, 2)]) - print(f"Total time: {result.total_time_s:.2f} seconds") - print(tabulate( - print_data, - headers=["Model", "Time (seconds)"], - tablefmt="pretty", - colalign=["left", "right"] - )) - self._print_final_allocation( - models=models, - used_devices={gpu_type: num_gpus}, - total_devices={gpu_type: num_gpus}, - power_data=self.power_data, - total_time_s=result.total_time_s, - ttff_s=result.ttff_s, - first_chunk_time=result.first_chunk_time, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - return Result( - total_time_s=result.total_time_s, - models=models, - gpus_used={gpu_type: num_gpus}, - ttff_s=result.ttff_s, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - def _pick_from_single_device_mapping( - self, - num_gpus: int, - gpu_type: GPUType, - verbose: bool = False, - allow_removal: bool = 
False, - allow_merging: bool = False, - look_ahead_replicas: int = 3, - ) -> Result: - """ - Calculate time and energy based on a single GPU type. - """ - assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type] - latency_gpu_data = self.latency_data[gpu_type] - assert gpu_type == latency_gpu_data.gpu_type - - if self.power_data is not None: - power_gpu_data = self.power_data[gpu_type] - assert gpu_type == power_gpu_data.gpu_type - - # Initialize allocations - models = self._init_single_device_models(gpu_type) - - remaining_gpus = num_gpus - calc_used_gpus(models) - - assert 0 <= remaining_gpus <= num_gpus - - # Optimization loop - it = 0 - prev_metric = None - switch_objective = False - while remaining_gpus > 0: - # Calculate current iteration times - evaluate_model_allocation( - models=models, - num_gpus={gpu_type: num_gpus}, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=False, - ) - - # Calculate potential actions for each optimization option - actions = gen_actions( - num_gpus={gpu_type: num_gpus}, - latency_data=self.latency_data, - power_data=self.power_data, - workflow=self.workflow, - models=models, - policy=self.policy, - allow_removal=allow_removal, - allow_merging=allow_merging, - look_ahead_replicas=look_ahead_replicas, - ) - - if not actions: - logging.debug(f"No more actions possible after {it} iterations for {self.policy}") - break - - best_action = choose_action( - actions, - self.policy.objective, - switch_objective=switch_objective) - - if not best_action: - logging.debug("No action selected.") - break - - new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective) - if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric: - msg = f"No improvement from actions after {it} iterations for {self.policy}." 
- msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}." - if verbose: - print(msg) - logging.debug(msg) - if not USE_ALL_GPUS: - logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.") - break - switch_objective = True - - models = apply_action(best_action, models) - - models = simplify_model_allocations(models) - - remaining_gpus = num_gpus - calc_used_gpus(models) - prev_metric = new_metric - - if verbose: - self._print_iteration(it, models, {gpu_type: num_gpus}) - print(f"Metric: {new_metric:.2f}") - print(f"{len(actions)} actions:") - for action in actions: - if action == best_action: - print(f" * {action} (best)") - else: - print(f" {action}") - print(f"Applied: {best_action}") - print(f"Remaining devices: {remaining_gpus}x{gpu_type}") - - it += 1 - if it > MAX_ITERATIONS: - logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). Stopping optimization loop.") - break - - result = evaluate_model_allocation( - models=models, - num_gpus={gpu_type: num_gpus}, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - - if verbose: - self._print_final_allocation( - models=models, - used_devices=result.gpus_used, - total_devices={gpu_type: num_gpus}, - power_data=self.power_data, - total_time_s=result.total_time_s, - ttff_s=result.ttff_s, - first_chunk_time=result.first_chunk_time, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - if not self.policy.is_disaggregated(Model.HF): - if models[gpu_type][Model.HF_VAE]: - assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \ - "HF_VAE must have 0 GPUs when HF disaggregation is disabled" - if not self.policy.is_disaggregated(Model.FT): - if models[gpu_type][Model.FT_VAE]: - assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \ - "FT_VAE must have 0 GPUs when FT disaggregation 
is disabled" - num_gpus_used = result.gpus_used[gpu_type] - assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}" - - return Result( - total_time_s=result.total_time_s, - models=models, - gpus_used={gpu_type: num_gpus_used}, - gpus_total={gpu_type: num_gpus}, - ttff_s=result.ttff_s, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - def _print_iteration( - self, - it: int, - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - ) -> None: - print(f"--- Iteration {it} ---") - - for gpu_type in models.keys(): - total_gpus = calc_used_gpus({gpu_type: models[gpu_type]}) - print(f"Current {gpu_type.value} allocation: {total_gpus}/{num_gpus[gpu_type]} GPUs") - for model in Model: - for model_instance in models[gpu_type][model]: - if model_instance.get_num_gpus() > 0: - print(f" {model.value:10s}:\t{model_instance}") - - # Find the bottleneck stage - stage_times: dict[Model, float] = {} - ttff_times: dict[Model, float] = {} - for model_name in Model: - times = [] - times_first = [] - for gpu_type in models.keys(): - for model_alloc in models[gpu_type][model_name]: - times.append(model_alloc.time) - times_first.append(model_alloc.time_first) - stage_times[model_name] = max(times) if times else 0.0 - ttff_times[model_name] = max(times_first) if times_first else 0.0 - - bottleneck_stage, bottleneck_time = max( - stage_times.items(), - key=itemgetter(1) - ) - bottleneck_ttff_stage, bottleneck_ttff_time = max( - ttff_times.items(), - key=itemgetter(1) - ) - print(f"Bottleneck: {bottleneck_stage} ({bottleneck_time:.2f}s)") - print(f"Bottleneck TTFF: {bottleneck_ttff_stage} ({bottleneck_ttff_time:.2f}s)") - # bottleneck stage is not necessarily the stage with the - # highest potential gain from scaling up/out - - def _print_final_allocation( - self, - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - used_devices: dict[GPUType, int], - 
total_devices: dict[GPUType, int], - power_data: Optional[PowerData], - total_time_s: float, - ttff_s: float, - first_chunk_time: float, - tbf_s: float, - total_energy: float, - cost: float, - ) -> None: - print("=== FINAL ALLOCATION ===") - print("Total devices used/available:") - for gpu_type, total_device in total_devices.items(): - used_device = used_devices[gpu_type] - print(f" {gpu_type.value}: {used_device}/{total_device}") - print("Model allocations:") - for gpu_type in models.keys(): - print(f" {gpu_type.value} ({used_devices[gpu_type]} used):") - for model in Model: - for model_alloc in models[gpu_type][model]: - print(f" {model.value:10s}:\t{model_alloc}") - print(f"Total time: {total_time_s:.2f} seconds ({total_time_s / SECONDS_IN_MINUTE:.2f} minutes)") - print(f"TTFF: {ttff_s:.2f} seconds") - print(f"First chunk time: {first_chunk_time:.2f} seconds") - print(f"TBF: {tbf_s:.2f} seconds") - print(f"Total cost: ${cost:.2f}") - if power_data is not None: - print(f"Total energy: {total_energy:.2f} Ws ({total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh)") diff --git a/simulator/helix.py b/simulator/helix.py deleted file mode 100644 index 5891538f..00000000 --- a/simulator/helix.py +++ /dev/null @@ -1,403 +0,0 @@ -""" -Helix algorithm for the StreamWise workflow allocation problem. - -Reference: https://github.com/Thesys-lab/Helix-ASPLOS25 - -Helix optimizes models one-by-one following MODEL_ORDER, using MILP -for each model's resource allocation. After each model reaches convergence -(solver optimality or per-model time limit), its allocation is fixed and the -remaining GPU budget is passed to the next model. - -Design rationale: - HelixAllocator does NOT inherit from MILPAllocator because the parent's - allocate() builds a single joint MILP for all models simultaneously. - Instead, HelixAllocator extends ModelAllocator and *composes* - MILPAllocator instances — one per model in the workflow. 
- - For each model, a per-model WorkflowConfig is created where only the - target model has non-zero work (all others set to 0). The existing MILP - constraints (is_active <= work, gpus <= num_gpus * is_active) naturally - force 0 GPU allocation for those 0-work models, so no changes to - milp.py are required. -""" - -from __future__ import annotations - -import logging - -from dataclasses import replace -from typing import Optional - -from sim_types import Result -from sim_types import GPUType -from sim_types import WorkflowConfig -from sim_types import PowerData -from sim_types import LatencyData -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import Policy -from sim_types import Solver -from sim_types import MODEL_ORDER - -from model_allocator import ModelAllocator - -from evaluator import evaluate_model_allocation - -from milp import MILPAllocator - -from policies import HELIX_POLICY -from policies import MAX_DEVICES - -from constants import DEVICE_OPTIONS - - -# Default per-model MILP solver time limit in seconds. -# Each model gets this long to converge before the solver moves on. -DEFAULT_PER_MODEL_TIME_LIMIT = 30 - - -def _compute_per_model_gpu_budget( - model_order: list[Model], - num_gpus: dict[GPUType, int], - workflow: WorkflowConfig, -) -> dict[Model, dict[GPUType, int]]: - """Compute a per-model GPU budget so every model gets a fair share. - - Budget is proportional to each model's ``MAX_DEVICES`` weight (capped - by the model's actual maximum useful device count from ``DEVICE_OPTIONS``). - Models not in ``MAX_DEVICES`` (e.g. OTHERS, UPSCALER) receive a minimum - allocation of ``min(DEVICE_OPTIONS)`` GPUs. - - The allocations are floored per model, and any remainder is distributed - round-robin starting from the first model. - - Returns: - Mapping ``model -> {gpu_type -> max_gpus}`` that the model may use. 
- """ - # Effective weight per model (max useful devices) - weights: dict[Model, int] = {} - for m in model_order: - if workflow.model_work.get(m, 0) == 0: - continue - if m in MAX_DEVICES: - weights[m] = MAX_DEVICES[m] - else: - # Models not in MAX_DEVICES (OTHERS, UPSCALER) get min allocation - weights[m] = min(DEVICE_OPTIONS.get(m, [1])) - - total_weight = sum(weights.values()) - if total_weight == 0: - # Fallback: equal split - total_weight = len(weights) or 1 - weights = {m: 1 for m in weights} - - budget: dict[Model, dict[GPUType, int]] = {} - for gpu_type, total in num_gpus.items(): - # Floor allocation per model - allocated = 0 - per_model: dict[Model, int] = {} - for m in model_order: - if m not in weights: - continue - share = int(total * weights[m] / total_weight) - # Ensure at least 1 GPU per model (if GPUs available) - share = max(share, 1) if total - allocated >= 1 else 0 - per_model[m] = share - allocated += share - - # Distribute remainder round-robin - remainder = total - allocated - idx = 0 - models_list = [m for m in model_order if m in per_model] - while remainder > 0 and models_list: - m = models_list[idx % len(models_list)] - per_model[m] += 1 - remainder -= 1 - idx += 1 - - for m in model_order: - if m not in per_model: - continue - if m not in budget: - budget[m] = {} - budget[m][gpu_type] = per_model[m] - - return budget - - -class HelixAllocator(ModelAllocator): - """ - Helix-style allocator that optimizes models one at a time - using MILP, sequentially following MODEL_ORDER. - - Reference: https://github.com/Thesys-lab/Helix-ASPLOS25 - - Key approach: - 1. For each model in MODEL_ORDER, create a per-model MILP sub-problem - where only the target model has non-zero work. - 2. Solve the MILP with the remaining GPU budget and a per-model time limit. - 3. Fix the allocation for that model and subtract used GPUs. - 4. Move to the next model with the remaining GPU budget. - 5. Combine all per-model allocations into the final result. 
- - The HelixAllocator uses composition (not inheritance) with MILPAllocator, - creating a separate MILPAllocator instance for each model's sub-problem. - This avoids modifying the joint MILP formulation and allows per-model - solver configurations. - """ - - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = HELIX_POLICY, - ) -> None: - super().__init__( - workflow, - latency_data, - power_data, - policy, - ) - assert self.policy.solver == Solver.HELIX - - def allocate( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - per_model_time_limit: int = DEFAULT_PER_MODEL_TIME_LIMIT, - milp_solver: Solver = Solver.HIGHS, - ) -> Result: - """ - Allocate resources model-by-model following MODEL_ORDER. - - For each model, a MILPAllocator is created with a workflow where - only the target model has non-zero work. The MILP solver optimizes - the allocation for that model within the remaining GPU budget. - - Args: - num_gpus: Available GPUs per type. - verbose: If True, print per-model allocation details. - per_model_time_limit: Time limit (seconds) for each per-model MILP solve. - milp_solver: MILP solver backend to use (GUROBI or HIGHS). - - Returns: - Combined Result across all models. - """ - assert milp_solver in (Solver.GUROBI, Solver.HIGHS), \ - f"milp_solver must be GUROBI or HIGHS, got {milp_solver}" - - model_order = self.workflow.get_model_order() - if not self.policy.use_upscaler and Model.UPSCALER in model_order: - # Remove UPSCALER from model_order if not using upscaler to avoid unnecessary MILP solve - model_order.remove(Model.UPSCALER) - remaining_gpus = dict(num_gpus) - - # ---- GPU budget partitioning ---- - # Pre-compute a per-model GPU budget proportional to MAX_DEVICES - # so that early models cannot starve later ones. Unused GPUs from - # one model roll over to subsequent models. 
- gpu_budget = _compute_per_model_gpu_budget( - model_order, num_gpus, self.workflow, - ) - - if verbose: - logging.info("Helix GPU budget per model:") - for m in model_order: - if m in gpu_budget: - logging.info(f" {m.value}: {gpu_budget[m]}") - - # Accumulated per-model allocations and metrics - all_model_allocations: dict[GPUType, dict[Model, list[ModelAllocation]]] = {} - total_makespan = 0.0 - total_ttff = 0.0 - total_cost = 0.0 - total_energy = 0.0 - total_gpus_used: dict[GPUType, int] = {gt: 0 for gt in num_gpus} - - for model in model_order: - work = self.workflow.model_work.get(model, 0) - if work == 0: - continue - - # Skip VAE models when disaggregation is disabled for the parent. - # Their latency is folded into the parent model's time calculation. - if model == Model.HF_VAE and not self.policy.is_disaggregated(Model.HF): - continue - if model == Model.FT_VAE and not self.policy.is_disaggregated(Model.FT): - continue - - # Check if any GPUs remain - if all(v <= 0 for v in remaining_gpus.values()): - logging.warning( - f"Helix: No GPUs remaining for {model.value}. Skipping.") - continue - - # Filter out GPU types with 0 remaining. - # Cap per-model GPUs to the budget so later models are not starved. - model_budget = gpu_budget.get(model, {}) - active_gpus = { - gt: min(count, model_budget.get(gt, count)) - for gt, count in remaining_gpus.items() - if count > 0 and (gt not in model_budget or model_budget[gt] > 0) - } - - if verbose: - logging.info( - f"--- Helix: Optimizing {model.value} " - f"(work={work}) with remaining GPUs: {active_gpus} ---" - ) - - # ---- build per-model workflow ---- - # Only the target model has work; other models are excluded from - # model_work so the MILP only builds variables/constraints for it. 
- per_model_work = {model: self.workflow.model_work[model]} - per_model_workflow = replace( - self.workflow, - model_work=per_model_work, - ) - - # ---- build MILP-compatible policy ---- - # The inner MILPAllocator requires solver ∈ {GUROBI, HIGHS}. - # Force disaggregation / use_upscaler flags so that the inner - # MILP's ``model_names`` list includes VAE / UPSCALER when those - # are the target model. Without this, the MILP would construct - # an empty model set and produce a trivial (infeasible) problem. - disag = {} # dict(self.policy.disaggregation) - if model == Model.HF_VAE and self.policy.is_disaggregated(Model.HF): - disag[Model.HF] = True - if model == Model.FT_VAE and self.policy.is_disaggregated(Model.FT): - disag[Model.FT] = True - milp_policy = Policy( - name=self.policy.name, - gpu_cost=self.policy.gpu_cost, - objective=self.policy.objective, - # disaggregation=self.policy.disaggregation or model == Model.HF_VAE, - disaggregation=disag, - use_upscaler=self.policy.use_upscaler or model == Model.UPSCALER, - hardware=self.policy.hardware, - solver=milp_solver, - ) - - # ---- solve per-model MILP ---- - milp_allocator = MILPAllocator( - workflow=per_model_workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=milp_policy, - ) - - result = milp_allocator.allocate( - num_gpus=active_gpus, - verbose=verbose, - time_limit=per_model_time_limit, - # Use running_cost=True for linear cost formulation (HiGHS-compatible) - running_cost=(milp_solver == Solver.HIGHS), - # Skip server constraint: per-model allocations don't need - # to be multiples of NUM_GPUS_PER_SERVER. - skip_server_constraint=True, - ) - - if result.total_time_s == 0.0 and not result.models: - logging.warning( - f"Helix: MILP failed for {model.value}. 
Skipping.") - continue - - # ---- record allocations & snap devices to DEVICE_OPTIONS ---- - # The MILP constrains devices to DEVICE_OPTIONS, but floating-point - # precision in the solver can occasionally produce off-by-one values - # (e.g. 31 instead of 32). Snap each replica to the nearest valid - # option, adjusting the GPU accounting so we don't exceed the total - # budget passed to evaluate_model_allocation at the end. - for gpu_type, model_dict in result.models.items(): - if gpu_type not in all_model_allocations: - all_model_allocations[gpu_type] = {} - for m_name, allocs in model_dict.items(): - for alloc in allocs: - valid_devices = DEVICE_OPTIONS.get(m_name, [1]) - if alloc.devices not in valid_devices: - nearest = min(valid_devices, key=lambda d: abs(d - alloc.devices)) - diff = nearest - alloc.devices # positive = round up - gpu_avail = remaining_gpus.get(gpu_type, 0) - result.gpus_used.get(gpu_type, 0) - if diff > 0 and gpu_avail < diff: - # Not enough spare GPUs to round up; round down instead - nearest = max( - (d for d in valid_devices if d <= alloc.devices), - default=valid_devices[0], - ) - diff = nearest - alloc.devices - logging.info( - f"Helix: snapping {m_name.value} from " - f"{alloc.devices} to {nearest} devices " - f"(solver precision fix, diff={diff:+d})") - # Adjust GPU accounting for this model's result - result.gpus_used[gpu_type] = result.gpus_used.get(gpu_type, 0) + diff - alloc.devices = nearest - all_model_allocations[gpu_type][m_name] = allocs - - # ---- accumulate metrics ---- - total_makespan += result.total_time_s - total_ttff += result.ttff_s - total_cost += result.cost - total_energy += result.total_energy - if verbose: - print(f'Model {model.value} - Time: {result.total_time_s:.2f}s,' - f'TTFF: {result.ttff_s:.2f}s, Cost: ${result.cost:.2f}') - print(f'Total cost so far: ${total_cost:.2f}, Total time so far: {total_makespan:.2f}s,' - f'Total TTFF so far: {total_ttff:.2f}s') - print(f'GPUs allocated for {model.value}: 
{result.gpus_used}') - - # ---- subtract used GPUs ---- - for gpu_type, used in result.gpus_used.items(): - remaining_gpus[gpu_type] = remaining_gpus.get(gpu_type, 0) - used - total_gpus_used[gpu_type] = total_gpus_used.get(gpu_type, 0) + used - - # ---- roll over unused budget to next models ---- - # If this model used fewer GPUs than its budget, the surplus - # is distributed evenly among the remaining models. - remaining_models = [ - m for m in model_order - if m in gpu_budget and MODEL_ORDER.get(m, 0) > MODEL_ORDER.get(model, 0) - ] - if remaining_models: - for gpu_type in num_gpus: - budget_for_model = model_budget.get(gpu_type, 0) - used_by_model = result.gpus_used.get(gpu_type, 0) - surplus = budget_for_model - used_by_model - if surplus > 0: - per_model_extra = surplus // len(remaining_models) - leftover = surplus % len(remaining_models) - for i, rm in enumerate(remaining_models): - extra = per_model_extra + (1 if i < leftover else 0) - gpu_budget[rm][gpu_type] = gpu_budget[rm].get(gpu_type, 0) + extra - - if verbose: - print( - f"Helix: {model.value} allocated. " - f"Time: {result.total_time_s:.2f}s, " - f"TTFF: {result.ttff_s:.2f}s, " - f"GPUs used: {result.gpus_used}, " - f"Remaining: {remaining_gpus}" - ) - - result = evaluate_model_allocation( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - models=all_model_allocations, - num_gpus=num_gpus, - ) - - if verbose: - print( - f"=== Helix final: " - f"Makespan={result.total_time_s:.2f}s, " - f"TTFF={result.ttff_s:.2f}s, " - f"TBF={result.tbf_s:.4f}s, " - f"Cost=${result.cost:.2f}, " - f"Energy={result.total_energy:.2f}Ws, " - f"GPUs used={result.gpus_used} ===" - ) - - return result diff --git a/simulator/hexgen.py b/simulator/hexgen.py deleted file mode 100644 index 64c64160..00000000 --- a/simulator/hexgen.py +++ /dev/null @@ -1,629 +0,0 @@ -""" -HexGen algorithm for the StreamWise workflow allocation problem. 
- -Reference: https://arxiv.org/abs/2311.11514 - -HexGen treats each model in the workflow as an independent component for optimization. -It tracks metrics per model and optimizes models sequentially according to MODEL_ORDER. -When a model's metric converges (stops dropping), it moves to the next model. -After the last model converges, it cycles back to the first model and allocates -remaining GPUs until exhausted. -""" - -from __future__ import annotations -import logging -from typing import Optional - -from sim_types import Result -from sim_types import GPUType -from sim_types import WorkflowConfig -from sim_types import PowerData -from sim_types import LatencyData -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import Policy -from sim_types import Solver -from sim_types import MODEL_ORDER - -from utils import simplify_model_allocations - -from evaluator import calc_used_gpus -from evaluator import evaluate_model_allocation - -from greedy import GreedyAllocator - -from actions import gen_actions -from actions import choose_action -from actions import apply_action - -from policies import HEXGEN_POLICY -from policies import MAX_ITERATIONS -from policies import USE_ALL_GPUS - - -def _get_model_order(workflow: WorkflowConfig) -> list[Model]: - """Get ordered list of models in the workflow, sorted by MODEL_ORDER.""" - return sorted( - [m for m in workflow.models if m in MODEL_ORDER], - key=lambda m: MODEL_ORDER[m], - ) - - -class HexGenAllocator(GreedyAllocator): - """ - HexGen-style allocator that optimizes models one at a time, - sequentially following MODEL_ORDER. - - Reference: https://arxiv.org/abs/2311.11514 - - Key differences from GreedyAllocator: - 1. Each model is treated as an independent optimization target. - 2. Per-model metrics are tracked separately. - 3. Models are optimized in MODEL_ORDER sequence. When a model's metric - converges, it moves to the next model. 
After the last model converges, - it cycles back to the first and allocates remaining GPUs. - """ - - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = HEXGEN_POLICY, - ) -> None: - super().__init__( - workflow, - latency_data, - power_data, - policy, - ) - assert self.policy.solver == Solver.HEXGEN - - def _pick_from_single_device_mapping( - self, - num_gpus: int, - gpu_type: GPUType, - verbose: bool = False, - allow_removal: bool = False, - allow_merging: bool = False, - look_ahead_replicas: int = 3, - ) -> Result: - """ - HexGen-style allocation for a single GPU type (>8 GPUs). - Optimizes models one at a time following MODEL_ORDER. - """ - from constants import NUM_GPUS_PER_SERVER - - assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type] - - # Initialize allocations (same as GreedyAllocator) - models = self._init_single_device_models(gpu_type) - - remaining_gpus = num_gpus - calc_used_gpus(models) - assert 0 <= remaining_gpus <= num_gpus - - # --- HexGen per-model sequential optimization --- - model_order = _get_model_order(self.workflow) - per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order} - - it = 0 - current_model_idx = 0 - cycles_without_progress = 0 # track full cycles without any improvement - total_models = len(model_order) - - while remaining_gpus > 0: - if current_model_idx >= total_models: - # Completed a full cycle, wrap around - current_model_idx = 0 - cycles_without_progress += 1 - if cycles_without_progress >= 1: - logging.debug( - f"HexGen: No progress after {cycles_without_progress} full cycles.") - break - - current_model = model_order[current_model_idx] - - if verbose: - print(f"--- HexGen: Optimizing {current_model.value} " - f"(model {current_model_idx + 1}/{total_models}) ---") - - # Inner loop: keep optimizing current model until convergence - inner_it = 0 - while remaining_gpus > 0: - # Evaluate current state - 
evaluate_model_allocation( - models=models, - num_gpus={gpu_type: num_gpus}, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=False, - ) - - # Generate actions only for the current model - all_actions = gen_actions( - num_gpus={gpu_type: num_gpus}, - latency_data=self.latency_data, - power_data=self.power_data, - workflow=self.workflow, - models=models, - policy=self.policy, - ) - - # Filter to actions targeting the current model only - model_actions = [a for a in all_actions if a.model == current_model] - - if not model_actions: - logging.debug( - f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.") - break - - best_action = choose_action(model_actions, self.policy.objective) - - if not best_action: - logging.debug(f"HexGen: No action selected for {current_model.value}.") - break - - new_metric = best_action.get_metric(self.policy.objective) - prev_metric = per_model_metrics[current_model] - - if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric: - msg = ( - f"HexGen: {current_model.value} converged after {inner_it} inner iterations. " - f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}." - ) - if verbose: - print(msg) - logging.debug(msg) - break - - per_model_metrics[current_model] = new_metric - - models = apply_action(best_action, models=models) - models = simplify_model_allocations(models) - - remaining_gpus = num_gpus - calc_used_gpus(models) - - if verbose: - self._print_iteration(it, models, {gpu_type: num_gpus}) - print(f"HexGen: Applied action for {current_model.value}, " - f"metric: {new_metric:.2f}, remaining: {remaining_gpus}") - - it += 1 - inner_it += 1 - - if it > MAX_ITERATIONS: - logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). 
Stopping.") - break - - if it > MAX_ITERATIONS: - break - - current_model_idx += 1 - - # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER --- - remaining_gpus = num_gpus - calc_used_gpus(models) - if USE_ALL_GPUS and remaining_gpus > 0: - models = self._fill_remaining_gpus_single( - models=models, - num_gpus=num_gpus, - gpu_type=gpu_type, - model_order=model_order, - it=it, - verbose=verbose, - ) - - # Final evaluation - result = evaluate_model_allocation( - models=models, - num_gpus={gpu_type: num_gpus}, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - - if verbose: - self._print_final_allocation( - models=models, - used_devices=result.gpus_used, - total_devices={gpu_type: num_gpus}, - power_data=self.power_data, - total_time_s=result.total_time_s, - ttff_s=result.ttff_s, - first_chunk_time=result.first_chunk_time, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - if not self.policy.is_disaggregated(Model.HF): - if models[gpu_type][Model.HF_VAE]: - assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \ - "HF_VAE must have 0 GPUs when HF disaggregation is disabled" - if not self.policy.is_disaggregated(Model.FT): - if models[gpu_type][Model.FT_VAE]: - assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \ - "FT_VAE must have 0 GPUs when FT disaggregation is disabled" - - num_gpus_used = result.gpus_used[gpu_type] - assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}" - - return Result( - total_time_s=result.total_time_s, - models=models, - gpus_used={gpu_type: num_gpus_used}, - gpus_total={gpu_type: num_gpus}, - ttff_s=result.ttff_s, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - def _pick_from_both_devices_mapping( - self, - num_gpus: dict[GPUType, int], - verbose: 
bool = False, - allow_removal: bool = False, - allow_merging: bool = False, - look_ahead_replicas: int = 3, - ) -> Result: - """ - HexGen-style allocation for two GPU types. - Optimizes models one at a time following MODEL_ORDER. - """ - from constants import NUM_GPUS_PER_SERVER - - gpu_types = list(num_gpus.keys()) - assert len(gpu_types) == 2 - gpu_type1 = gpu_types[0] - gpu_type2 = gpu_types[1] - assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1] - assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2] - - # Initialize allocations (same as GreedyAllocator) - models = self._init_both_devices_models(gpu_type1, gpu_type2) - - remaining_gpus: dict[GPUType, int] = {} - for gpu_type in num_gpus.keys(): - remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]}) - - # --- HexGen per-model sequential optimization --- - model_order = _get_model_order(self.workflow) - per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order} - - if verbose: - evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - self._print_iteration(0, models, num_gpus) - - it = 1 - current_model_idx = 0 - cycles_without_progress = 0 - total_models = len(model_order) - - while sum(remaining_gpus.values()) > 0: - if current_model_idx >= total_models: - current_model_idx = 0 - cycles_without_progress += 1 - if cycles_without_progress >= 1: - logging.debug( - f"HexGen: No progress after {cycles_without_progress} full cycles.") - break - - current_model = model_order[current_model_idx] - - if verbose: - print(f"--- HexGen: Optimizing {current_model.value} " - f"(model {current_model_idx + 1}/{total_models}) ---") - - inner_it = 0 - - while sum(remaining_gpus.values()) > 0: - evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - 
latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=False, - ) - - all_actions = gen_actions( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - num_gpus=num_gpus, - models=models, - policy=self.policy, - ) - - # Filter to current model - model_actions = [a for a in all_actions if a.model == current_model] - - if not model_actions: - logging.debug( - f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.") - break - - best_action = choose_action(model_actions, self.policy.objective) - - if not best_action: - logging.debug(f"HexGen: No action selected for {current_model.value}.") - break - - new_metric = best_action.get_metric(self.policy.objective) - prev_metric = per_model_metrics[current_model] - - if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric: - msg = ( - f"HexGen: {current_model.value} converged. " - f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}." - ) - if verbose: - print(msg) - logging.debug(msg) - break - - per_model_metrics[current_model] = new_metric - - models = apply_action(best_action, models=models) - models = simplify_model_allocations(models) - - remaining_gpus.clear() - for gpu_type in num_gpus.keys(): - remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]}) - - if verbose: - self._print_iteration(it, models, num_gpus) - print(f"HexGen: Applied action for {current_model.value}, " - f"metric: {new_metric:.2f}") - print("Remaining devices:") - for gt in remaining_gpus: - print(f" {remaining_gpus[gt]} x {gt.value}") - - it += 1 - inner_it += 1 - - if it > MAX_ITERATIONS: - logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). 
Stopping.") - break - - if it > MAX_ITERATIONS: - break - - current_model_idx += 1 - - # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER --- - remaining_gpus_total = sum( - num_gpus[gt] - calc_used_gpus({gt: models[gt]}) - for gt in num_gpus - ) - if USE_ALL_GPUS and remaining_gpus_total > 0: - models = self._fill_remaining_gpus_multi( - models=models, - num_gpus=num_gpus, - model_order=model_order, - it=it, - verbose=verbose, - ) - - # Adjust for no disaggregation - if not self.policy.is_disaggregated(Model.HF): - for models_gpu in models.values(): - for instance_id in range(len(models_gpu[Model.HF_VAE])): - assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \ - "HF_VAE must have 0 GPUs when HF disaggregation is disabled" - if not self.policy.is_disaggregated(Model.FT): - for models_gpu in models.values(): - for instance_id in range(len(models_gpu[Model.FT_VAE])): - assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \ - "FT_VAE must have 0 GPUs when FT disaggregation is disabled" - - # Final evaluation - result = evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - - if verbose: - self._print_final_allocation( - models=models, - used_devices=result.gpus_used, - total_devices={ - gpu_type1: num_gpus.get(gpu_type1, 0), - gpu_type2: num_gpus.get(gpu_type2, 0), - }, - power_data=self.power_data, - total_time_s=result.total_time_s, - ttff_s=result.ttff_s, - first_chunk_time=result.first_chunk_time, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \ - f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}" - assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \ - f"{gpu_type2.value}: 
{result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}" - - return Result( - total_time_s=result.total_time_s, - models=models, - gpus_used=result.gpus_used, - ttff_s=result.ttff_s, - tbf_s=result.tbf_s, - total_energy=result.total_energy if self.power_data else 0.0, - cost=result.cost, - ) - - def _fill_remaining_gpus_single( - self, - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: int, - gpu_type: GPUType, - model_order: list[Model], - it: int = 0, - verbose: bool = False, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Fill remaining GPUs by cycling through MODEL_ORDER (single GPU type). - Applies any available action per model, ignoring metric convergence. - Stops when all GPUs are used or no model can accept more. - """ - remaining_gpus = num_gpus - calc_used_gpus(models) - total_models = len(model_order) - model_idx = 0 - models_exhausted: set[Model] = set() - - if verbose: - print(f"--- HexGen: USE_ALL_GPUS fill phase, {remaining_gpus} remaining ---") - - while remaining_gpus > 0 and len(models_exhausted) < total_models: - current_model = model_order[model_idx % total_models] - model_idx += 1 - - if current_model in models_exhausted: - continue - - evaluate_model_allocation( - models=models, - num_gpus={gpu_type: num_gpus}, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=False, - ) - - all_actions = gen_actions( - num_gpus={gpu_type: num_gpus}, - latency_data=self.latency_data, - power_data=self.power_data, - workflow=self.workflow, - models=models, - policy=self.policy, - ) - model_actions = [a for a in all_actions if a.model == current_model] - - if not model_actions: - models_exhausted.add(current_model) - logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).") - continue - - best_action = choose_action(model_actions, self.policy.objective) - if not best_action: - models_exhausted.add(current_model) 
- logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).") - continue - - models = apply_action(best_action, models=models) - models = simplify_model_allocations(models) - remaining_gpus = num_gpus - calc_used_gpus(models) - - if verbose: - self._print_iteration(it, models, {gpu_type: num_gpus}) - print(f"HexGen fill: Allocated to {current_model.value}, remaining: {remaining_gpus}") - - it += 1 - if it > MAX_ITERATIONS: - logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.") - break - - return models - - def _fill_remaining_gpus_multi( - self, - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - num_gpus: dict[GPUType, int], - model_order: list[Model], - it: int = 0, - verbose: bool = False, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Fill remaining GPUs by cycling through MODEL_ORDER (multi GPU type). - Applies any available action per model, ignoring metric convergence. - Stops when all GPUs are used or no model can accept more. 
- """ - total_remaining = sum( - num_gpus[gt] - calc_used_gpus({gt: models[gt]}) - for gt in num_gpus - ) - total_models = len(model_order) - model_idx = 0 - models_exhausted: set[Model] = set() - - if verbose: - print(f"--- HexGen: USE_ALL_GPUS fill phase, {total_remaining} remaining ---") - - while total_remaining > 0 and len(models_exhausted) < total_models: - current_model = model_order[model_idx % total_models] - model_idx += 1 - - if current_model in models_exhausted: - continue - - evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=False, - ) - - all_actions = gen_actions( - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - num_gpus=num_gpus, - models=models, - policy=self.policy, - ) - model_actions = [a for a in all_actions if a.model == current_model] - - if not model_actions: - models_exhausted.add(current_model) - logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).") - continue - - best_action = choose_action(model_actions, self.policy.objective) - if not best_action: - models_exhausted.add(current_model) - logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).") - continue - - models = apply_action(best_action, models=models) - models = simplify_model_allocations(models) - total_remaining = sum( - num_gpus[gt] - calc_used_gpus({gt: models[gt]}) - for gt in num_gpus - ) - - if verbose: - self._print_iteration(it, models, num_gpus) - print(f"HexGen fill: Allocated to {current_model.value}, remaining: {total_remaining}") - - it += 1 - if it > MAX_ITERATIONS: - logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). 
Stopping.") - break - - return models diff --git a/simulator/milp.py b/simulator/milp.py deleted file mode 100644 index 7a84e754..00000000 --- a/simulator/milp.py +++ /dev/null @@ -1,1070 +0,0 @@ -""" -MILP formulation for the StreamWise workflow allocation problem. -""" - -from __future__ import annotations - -import json -import logging - -from typing import Callable -from typing import Optional - -from pyomo.environ import ConcreteModel -from pyomo.environ import Var -from pyomo.environ import Set -from pyomo.environ import Objective as OptObjective -from pyomo.environ import Binary -from pyomo.environ import NonNegativeIntegers -from pyomo.environ import NonNegativeReals -from pyomo.environ import minimize -from pyomo.environ import SolverFactory -from pyomo.environ import ConstraintList - -from sim_types import GPUType -from sim_types import Model -from sim_types import WorkflowConfig -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import Result -from sim_types import Policy -from sim_types import ModelAllocation -from sim_types import Objective -from sim_types import Solver - -from models import get_model_allocation - -from model_allocator import ModelAllocator - -from constants import DEVICE_OPTIONS -from constants import NUM_GPUS_PER_SERVER -from constants import SECONDS_IN_HOUR - -from policies import STREAMWISE_MILP_POLICY - - -MAX_INSTANCES = 16 - -# Maximum time it can take: 24 hours in seconds -# Used for big-M constraints to link TTFF and makespan to instance variables -MAX_TIME = 24 * SECONDS_IN_HOUR - - -# Allocators that require quadratic (bilinear) objectives - need Gurobi -QUADRATIC_OBJECTIVES = [ - Objective.TTFF_COST, - Objective.TIME_ENERGY, - Objective.ENERGY_COST, -] - - -def idx( - gpu_type: GPUType, - model_name: Model, - instance_id: int -) -> tuple[str, str, int]: - """Helper to convert enum to index key for instance variables.""" - return (gpu_type.value, model_name.value, instance_id) - - -def 
dev_idx( - gpu_type: GPUType, - model_name: Model, - instance_id: int, - num_devices: int -) -> tuple[str, str, int, int]: - """Helper to convert enum to index key for device variables.""" - return (gpu_type.value, model_name.value, instance_id, num_devices) - - -class MILPAllocator(ModelAllocator): - """ - MILP-based allocator that computes the optimal model allocation. - """ - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = STREAMWISE_MILP_POLICY, - ) -> None: - super().__init__( - workflow, - latency_data, - power_data, - policy, - ) - assert self.policy.solver in [Solver.GUROBI, Solver.HIGHS] - - def allocate( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - running_cost: bool = False, # If True, cost = active time only; False = makespan x GPUs - max_cost: Optional[float] = None, # If set, adds a constraint to limit cost - max_ttff: Optional[float] = None, # If set, adds a constraint to limit TTFF - max_makespan: Optional[float] = None, # If set, adds a constraint to limit makespan - time_limit: Optional[int] = None, # Time limit for the solver in seconds - save_solution_path: Optional[str] = None, # If set, saves the solution to a JSON file - warm_start_path: Optional[str] = None, # If set, loads a warm start solution from a JSON file - force_num_gpus: bool = False, # If True, adds constraints to force the use of all available GPUs - skip_server_constraint: bool = False, # If True, skips the GPU-per-server constraint - ) -> Result: - """ - Calculate the optimal model allocation and resulting metrics using MILP formulation. 
- """ - m = ConcreteModel() - - # Options: "gurobi", "highs" - solver_name = self.policy.solver.value - - # Define index sets - gpu_types = list(num_gpus.keys()) - - model_names = [ - Model.GEMMA, - Model.FLUX, - Model.HF, - # Model.HF_VAE, - Model.FT, - # Model.FT_VAE, - # Model.UPSCALER, - Model.OTHERS, - ] - if self.policy.use_upscaler: - model_names.append(Model.UPSCALER) - if self.policy.is_disaggregated(Model.HF): - model_names.append(Model.HF_VAE) - if self.policy.is_disaggregated(Model.FT): - model_names.append(Model.FT_VAE) - - # Remove models not in the workflow - model_names = [ - model_name - for model_name in model_names - if model_name in self.workflow.models - ] - - instance_ids = list(range(MAX_INSTANCES)) - - # The units of work that each model has to do - work: dict[Model, int] = self.workflow.work - - # Create Pyomo Sets - m.GPU_TYPES = Set(initialize=[g.value for g in gpu_types]) - m.MODEL_NAMES = Set(initialize=[mn.value for mn in model_names]) - m.INSTANCES = Set(initialize=instance_ids) - - # Create index set for device choices: (gpu_type, model_name, instance_id, device_count) - device_index_set = [ - (gpu_type.value, model_name.value, instance_id, num_devices) - for gpu_type in gpu_types - for model_name in model_names - for instance_id in instance_ids - for num_devices in [0] + DEVICE_OPTIONS[model_name] - ] - m.DEVICE_INDEX = Set(initialize=device_index_set) - - # Create index set for instance variables: (gpu_type, model_name, instance_id) - instance_index_set = [ - (gpu_type.value, model_name.value, instance_id) - for gpu_type in gpu_types - for model_name in model_names - for instance_id in instance_ids - ] - m.INSTANCE_INDEX = Set(initialize=instance_index_set) - - # Define indexed variables - m.device_choice = Var(m.DEVICE_INDEX, domain=Binary) - m.work_device = Var(m.DEVICE_INDEX, domain=NonNegativeIntegers) # Linearization: work per device choice - m.gpus = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers) - m.is_active = 
Var(m.INSTANCE_INDEX, domain=Binary) - m.is_min = Var(m.INSTANCE_INDEX, domain=Binary) - m.work = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers) - m.time = Var(m.INSTANCE_INDEX, domain=NonNegativeReals) - m.ttff = Var(m.INSTANCE_INDEX, domain=NonNegativeReals) - - # Objective variables - m.makespan = Var(domain=NonNegativeReals) - m.ttff_user = Var(domain=NonNegativeReals) - m.ttff_min = Var(m.MODEL_NAMES, domain=NonNegativeReals) # Per-model minimum TTFF - m.time_max = Var(m.MODEL_NAMES, domain=NonNegativeReals) # Per-model maximum time - m.cost = Var(domain=NonNegativeReals) - m.energy = Var(domain=NonNegativeReals) - - # Constraint list for dynamic constraints - m.constraints = ConstraintList() - - for gpu_type in gpu_types: - for model_name in model_names: - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - - # GPUs used = sum of num_devices * device_choice[num_devices] - m.constraints.add( - m.gpus[key] == sum( - num_devices * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in [0] + DEVICE_OPTIONS[model_name] - ) - ) - - # Cannot select inactive instance as min - m.constraints.add(m.is_min[key] <= m.is_active[key]) - # If active = 0 -> GPUs = 0 - m.constraints.add(m.gpus[key] <= num_gpus[gpu_type] * m.is_active[key]) - # If active = 1 -> GPUs ≥ 1 - m.constraints.add(m.gpus[key] >= m.is_active[key]) - # If work = 0 -> active = 0 -> GPUs = 0 - m.constraints.add(m.is_active[key] <= m.work[key]) - - # If device = 0 -> work = 0 - dev_idx_0 = dev_idx(gpu_type, model_name, instance_id, 0) - m.constraints.add( - m.work[key] - <= work[model_name] * (1 - m.device_choice[dev_idx_0]) - ) - - # Linearization: work_device links device_choice and work - # work = sum(work_device[d] for d in devices) - excludes 0 GPUs since they can't do work - m.constraints.add( - m.work[key] == sum( - m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in 
DEVICE_OPTIONS[model_name] - ) - ) - # If any non-zero device is selected, work must be >= 1 - m.constraints.add( - m.work[key] >= sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # work_device[d] <= TOTAL_WORK * device_choice[d] - for num_devices in [0] + DEVICE_OPTIONS[model_name]: - didx = dev_idx(gpu_type, model_name, instance_id, num_devices) - m.constraints.add( - m.work_device[didx] <= work[model_name] * m.device_choice[didx] - ) - - # Link instance time to per-model max time - m.constraints.add(m.time[key] <= m.time_max[model_name.value]) - - # Link TTFF to per-model TTFF min - # If selected → ttff_min[model] == ttff_var - m.constraints.add(m.ttff_min[model_name.value] >= m.ttff[key] - MAX_TIME * (1 - m.is_min[key])) - m.constraints.add(m.ttff_min[model_name.value] <= m.ttff[key] + MAX_TIME * (1 - m.is_active[key])) - - # One device per instance - for instance_id in instance_ids: - m.constraints.add( - sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in [0] + DEVICE_OPTIONS[model_name] - ) == 1 - ) - - # Symmetry breaking (fill earlier instances first) - for instance_id in range(MAX_INSTANCES - 1): - m.constraints.add( - m.gpus[idx(gpu_type, model_name, instance_id)] - >= m.gpus[idx(gpu_type, model_name, instance_id + 1)] - ) - - # Makespan is the sum of max times per model (models run sequentially) - m.constraints.add(m.makespan == sum(m.time_max[model_name.value] for model_name in model_names)) - - # User TTFF definition: sum of min TTFF per model - m.constraints.add(m.ttff_user >= sum(m.ttff_min[model_name.value] for model_name in model_names)) - m.constraints.add(m.ttff_user >= m.makespan - self.workflow.total_video_seconds) - - # Select exactly 1 instance as the min TTFF instance per model - for model_name in model_names: - m.constraints.add( - sum( - m.is_min[idx(gpu_type, model_name, instance_id)] - for gpu_type in 
gpu_types - for instance_id in instance_ids - ) == 1 - ) - - # Resolution scaling factor for HF/VAE/FT - latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler) - - # Time constraints - # Each model block is guarded by membership in model_names so that - # the MILP can be built for a subset of models (e.g. Helix per-model). - for gpu_type in gpu_types: - # Gemma - if Model.GEMMA in model_names and work[Model.GEMMA] > 0: - model_name = Model.GEMMA - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Makespan is the max time across all instances - # Linearized: use work_device instead of device_choice * work - if work[model_name] > 1: - # Parallel: each work unit = 1 scene - # Time for w scenes - # = gemma_first_scene + gemma_per_scene * (w - 1) - # = (gemma_first_scene - gemma_per_scene) * is_active + gemma_per_scene * work - # Using linearized variables: - # = (gemma_first_scene[d] - gemma_per_scene[d]) * \ - # device_choice[d] + gemma_per_scene[d] * work_device[d] - m.constraints.add( - m.time[key] == sum( - ( - self.latency_data[gpu_type].gemma_first_scene[num_devices] - - self.latency_data[gpu_type].gemma_per_scene[num_devices] - ) - * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - + self.latency_data[gpu_type].gemma_per_scene[num_devices] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - else: - m.constraints.add( - m.time[key] == sum( - ( - self.latency_data[gpu_type].gemma_first_scene[num_devices] - + self.latency_data[gpu_type].gemma_per_scene[num_devices] - * (self.workflow.total_scenes - 1) - ) - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # TTFF is for 1 work unit - m.constraints.add( - m.ttff[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * 
self.latency_data[gpu_type].gemma_first_scene[num_devices] - * 1 # TTFF for tokens in first scene - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - - # Flux - if Model.FLUX in model_names and work[Model.FLUX] > 0: - model_name = Model.FLUX - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Makespan is the max time across all instances - # Linearized: use work_device instead of device_choice * work - if work[model_name] > 1: - # Parallel: each work unit = 1 scene - # Time for w scenes = latency * num_steps_flux * w - m.constraints.add( - m.time[key] == sum( - self.latency_data[gpu_type][model_name, num_devices] - * self.workflow.num_steps[model_name] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - else: - # Non-parallel: single work unit covers all scenes - m.constraints.add( - m.time[key] == sum( - self.latency_data[gpu_type][model_name, num_devices] - * self.workflow.num_steps[model_name] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # TTFF is for 1 work unit - m.constraints.add( - m.ttff[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.latency_data[gpu_type][model_name, num_devices] - * self.workflow.num_steps[model_name] - * 1 # TTFF for first work unit - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - - # Hunyuan FramePack - if Model.HF in model_names and work[Model.HF] > 0: - model_name = Model.HF - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - - """ - from models import HFModelAllocation - HFModelAllocation( - gpu_type, - num_devices, - replicas=1, - )._calc_time_per_subscene( - self.policy, - self.workflow, - self.latency_data[gpu_type] - ) - """ - - # Makespan is the max time across all instances - # Linearized: use work_device instead of device_choice * 
work - hf_time_expr = sum( - self.workflow.per_subscene_frames[model_name] - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - * self.workflow.num_steps[model_name] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - # When not disaggregated, VAE runs on the same instance - if not self.policy.is_disaggregated(Model.HF): - hf_vae_time_per_work = ( - self.latency_data[gpu_type][Model.HF_VAE, 1] - * latency_ratio - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - ) - hf_time_expr += hf_vae_time_per_work * m.work[key] - m.constraints.add(m.time[key] == hf_time_expr) - # TTFF is for first chunk (can be smaller than subscene when disaggregated) - ttff_frames_hf = min( - self.workflow.hf_frames[0], - self.workflow.per_subscene_frames[model_name]) - hf_ttff_expr = sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * ttff_frames_hf - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - * self.workflow.num_steps[model_name] - * 1 # TTFF for first chunk - for num_devices in DEVICE_OPTIONS[model_name] - ) - # When not disaggregated, add VAE decode time for first chunk - if not self.policy.is_disaggregated(Model.HF): - hf_vae_ttff = ( - ttff_frames_hf - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][Model.HF_VAE, 1] - * latency_ratio - ) - hf_ttff_expr += hf_vae_ttff * m.is_active[key] - m.constraints.add(m.ttff[key] == hf_ttff_expr) - - # Hunyuan FramePack VAE - if Model.HF_VAE in model_names and work[Model.HF_VAE] > 0: - model_name = Model.HF_VAE - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Makespan is the max time across all instances - # Linearized: use work_device instead of device_choice * work - 
m.constraints.add( - m.time[key] == sum( - self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # TTFF is for 1 subscene - m.constraints.add( - m.ttff[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.workflow.per_subscene_frames[Model.HF] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] # frames_per_step_hf - * 1 # TTFF for first subscene - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - - # Fantasy Talking - if Model.FT in model_names and work[Model.FT] > 0: - model_name = Model.FT - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Makespan is the max time across all instances - # Linearized: use work_device instead of device_choice * work - ft_time_expr = sum( - self.workflow.per_subscene_frames[model_name] - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - * self.workflow.num_steps[model_name] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - # When not disaggregated, VAE runs on the same instance - if not self.policy.is_disaggregated(Model.FT): - ft_vae_time_per_work = ( - self.latency_data[gpu_type][Model.FT_VAE, 1] - * latency_ratio - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] - ) - ft_time_expr += ft_vae_time_per_work * m.work[key] - m.constraints.add(m.time[key] == ft_time_expr) - # TTFF is for 1 work unit (e.g., subscene) - ft_ttff_expr = sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.workflow.per_subscene_frames[model_name] - / 
self.workflow.ft_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - * self.workflow.num_steps[model_name] - * 1 # TTFF for first subscene - for num_devices in DEVICE_OPTIONS[model_name] - ) - # When not disaggregated, add VAE decode time for first subscene - if not self.policy.is_disaggregated(Model.FT): - ft_vae_ttff = ( - self.workflow.per_subscene_frames[Model.FT] - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][Model.FT_VAE, 1] - * latency_ratio - ) - ft_ttff_expr += ft_vae_ttff * m.is_active[key] - m.constraints.add(m.ttff[key] == ft_ttff_expr) - - # Fantasy Talking VAE - if Model.FT_VAE in model_names and work[Model.FT_VAE] > 0: - model_name = Model.FT_VAE - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Makespan is the max time across all instances - # Linearized: use work_device instead of device_choice * work - m.constraints.add( - m.time[key] == sum( - self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # TTFF is for 1 subscene - m.constraints.add( - m.ttff[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.workflow.per_subscene_frames[Model.FT] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] # frames_per_step_ft - * 1 # TTFF for first subscene - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - - # Upscaler - if Model.UPSCALER in model_names and work[Model.UPSCALER] > 0 and self.policy.use_upscaler: - model_name = Model.UPSCALER - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Linearized: use work_device instead of 
device_choice * work - m.constraints.add( - m.time[key] == sum( - self.latency_data[gpu_type][model_name, num_devices] - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # TTFF is for 1 work unit (e.g., subscene) - m.constraints.add( - m.ttff[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.latency_data[gpu_type][model_name, num_devices] - * self.workflow.per_subscene_frames[Model.FT] - * 1 # TTFF is for first subscene - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - - # Others - if Model.OTHERS in model_names and work[Model.OTHERS] > 0: - model_name = Model.OTHERS - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - # Makespan is the max time across all instances - m.constraints.add( - m.time[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.latency_data[gpu_type][model_name, num_devices] - * self.workflow.total_scenes - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - # TTFF is for 1 work unit - m.constraints.add( - m.ttff[key] == sum( - m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.latency_data[gpu_type][model_name, num_devices] - * 1 # TTFF is for first scene - for num_devices in DEVICE_OPTIONS[model_name] - ) - ) - - # Total work to do for each model - for model_name in model_names: - m.constraints.add( - sum( - m.work[idx(gpu_type, model_name, instance_id)] - for gpu_type in gpu_types - for instance_id in instance_ids - ) == work[model_name] - ) - - # Number of GPUs per type - # Add a variable to represent the number of servers for each GPU type - m.num_servers = Var(m.GPU_TYPES, domain=NonNegativeIntegers) - - for gpu_type in gpu_types: - total_gpus = sum( - m.gpus[idx(gpu_type, model_name, instance_id)] - for model_name in model_names - for instance_id in instance_ids - ) - if force_num_gpus: - 
m.constraints.add(total_gpus == num_gpus[gpu_type]) - else: - m.constraints.add(total_gpus <= num_gpus[gpu_type]) - - # GPUs used must be a multiple of NUM_GPUS_PER_SERVER - if not skip_server_constraint: - m.constraints.add(total_gpus == m.num_servers[gpu_type.value] * NUM_GPUS_PER_SERVER[gpu_type]) - - # Cost calculation - # running_cost=True: cost based only on active model running time - if running_cost: - cost_expr = sum( - self._get_latency_per_work( - gpu_type, - model_name, - num_devices, - ) - * num_devices - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR - for gpu_type in gpu_types - for model_name in model_names - for instance_id in instance_ids - for num_devices in DEVICE_OPTIONS[model_name] - ) - # running_cost=False: cost = makespan × total_GPUs_used (GPUs allocated for full job duration) - else: - cost_expr = m.makespan * sum( - m.gpus[idx(gpu_type, model_name, instance_id)] - * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR - for gpu_type in gpu_types - for model_name in model_names - for instance_id in instance_ids - ) - m.constraints.add(m.cost == cost_expr) - - # Energy: model-specific power * active time + idle power * (makespan - active time) - if self.power_data is None: - energy_expr = 0.0 - else: - # Active energy: Use model-specific power values (not TDP) - energy_expr = sum( - self._get_latency_per_work( - gpu_type, - model_name, - num_devices, - ) - * num_devices - * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] - * ( - self._get_power_per_work( - gpu_type, - model_name, - num_devices, - ) - self.power_data[gpu_type]["idle"] - ) - for gpu_type in gpu_types - for model_name in model_names - for instance_id in instance_ids - for num_devices in DEVICE_OPTIONS[model_name] - ) - # Idle energy: idle power * num_gpus * makespan - energy_expr += sum( - self.power_data[gpu_type]["idle"] * num_gpus[gpu_type] * m.makespan - for gpu_type in 
gpu_types - ) - m.constraints.add(m.energy == energy_expr) - - # Bounds - if max_cost is not None: - m.constraints.add(m.cost <= max_cost) - if max_ttff is not None: - m.constraints.add(m.ttff_user <= max_ttff) - if max_makespan is not None: - m.constraints.add(m.makespan <= max_makespan) - - # Objective functions - obj = get_objective( - m=m, - allocator=self.policy.objective, - solver_name=solver_name, - ) - if obj is not None: - m.objective = obj - - # Solve - solver = SolverFactory(solver_name) - if solver_name == "gurobi" and time_limit: - solver.options["TimeLimit"] = time_limit - if solver_name == "highs" and time_limit: - solver.options["time_limit"] = time_limit - if self.policy.objective in QUADRATIC_OBJECTIVES and solver_name == "gurobi": - solver.options['NonConvex'] = 2 # Option for bilinear objectives - if solver_name == "highs": - solver.options["time_limit"] = 50 # seconds - - if warm_start_path is not None: - _load_warm_start(m, warm_start_path) - - if solver_name == "gurobi": - opt_result = solver.solve( - m, - tee=verbose, - warmstart=warm_start_path is not None, - ) - else: - opt_result = solver.solve(m, tee=verbose) - - if opt_result.solver.status != "ok": - logging.error(f"Solver failed with status: {opt_result.solver.status}") - - if save_solution_path is not None: - _save_solution(m, save_solution_path) - - models = milp_to_models_dict( - m=m, - gpu_types=gpu_types, - model_names=model_names, - instance_ids=instance_ids, - idx=idx, - workflow=self.workflow, - power_data=self.power_data, - policy=self.policy, - ) - - if not self._is_valid_result(m): - return Result() - - tbf_s = 0.0 - if m.makespan.value and self.workflow.num_frames > 0: - tbf_s = m.makespan.value / self.workflow.num_frames - return Result( - models=models, - gpus_used=self._get_num_gpus(m, gpu_types, model_names, instance_ids), - total_time_s=m.makespan.value, - ttff_s=m.ttff_user.value, - tbf_s=tbf_s, - cost=m.cost.value, - total_energy=m.energy.value, - ) - - def 
_is_valid_result(self, m: ConcreteModel) -> bool: - for gpu_type in m.GPU_TYPES: - for model_name in m.MODEL_NAMES: - for instance_id in m.INSTANCES: - if m.gpus[gpu_type, model_name, instance_id].value is None: - return False - return True - - def _get_num_gpus( - self, - m: ConcreteModel, - gpu_types: list[GPUType], - model_names: list[Model], - instance_ids: list[int], - ) -> dict[GPUType, int]: - if not self._is_valid_result(m): - return {} - return { - gpu_type: sum( - # round() snaps solver float to nearest int (e.g. 1.9999 -> 2) - int(round(m.gpus[idx(gpu_type, model_name, instance_id)].value)) - for model_name in model_names - for instance_id in instance_ids - if m.gpus[idx(gpu_type, model_name, instance_id)].value is not None - ) - for gpu_type in gpu_types - } - - def _get_latency_per_work( - self, - gpu_type: GPUType, - model_name: Model, - num_devices: int, - ) -> float: - """ - Cost per unit of work for a given model and GPU type, based on latency data. - Cost: Linearized - sum of (latency * work_device * num_devices * ratio) - This replaces the bilinear makespan * GPUs. 
- """ - # Resolution scaling factor for HF/VAE/FT - latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler) - - if model_name == Model.GEMMA: - return ( - self.latency_data[gpu_type].gemma_first_scene[num_devices] - + self.latency_data[gpu_type].gemma_per_scene[num_devices] * (self.workflow.total_scenes - 1) - ) - - if model_name == Model.FLUX: - return ( - self.latency_data[gpu_type][model_name, num_devices] - * self.workflow.num_steps[Model.FLUX] - ) - - if model_name == Model.HF: - time_per_work = ( - self.workflow.per_subscene_frames[Model.HF] - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - * self.workflow.num_steps[Model.HF] - ) - if not self.policy.is_disaggregated(Model.HF): - time_per_work += self._get_latency_per_work( - gpu_type, - Model.HF_VAE, - 1, # VAE is single-device only in current policy - ) - return time_per_work - - if model_name == Model.HF_VAE: - return ( - self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - / self.workflow.hf_frames[self.workflow.frames_per_step_idx] - ) - - if model_name == Model.FT: - time_per_work = ( - self.workflow.per_subscene_frames[Model.FT] - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] - * self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - * self.workflow.num_steps[Model.FT] - ) - if not self.policy.is_disaggregated(Model.FT): - time_per_work += self._get_latency_per_work( - gpu_type, - Model.FT_VAE, - 1, # VAE is single-device only in current policy - ) - return time_per_work - - if model_name == Model.FT_VAE: - return ( - self.latency_data[gpu_type][model_name, num_devices] - * latency_ratio - / self.workflow.ft_frames[self.workflow.frames_per_step_idx] - ) - - if model_name == Model.UPSCALER: - return self.latency_data[gpu_type][model_name, num_devices] - - if model_name == Model.OTHERS: - return self.latency_data[gpu_type][model_name, 
num_devices] * self.workflow.total_scenes - - raise ValueError(f"Unknown model_name {model_name}") - - def _get_power_per_work( - self, - gpu_type: GPUType, - model_name: Model, - num_devices: int, - ) -> float: - """ - Average power per unit of work for a given model and GPU type. - Returns the time-weighted average power consumption in watts. - For energy calculation: - energy = _get_latency_per_work(...) * _get_power_per_work(...) * num_devices * work - """ - if self.power_data is None: - return 0.0 - - if model_name == Model.GEMMA: - # For Gemma, power varies between first scene and subsequent scenes - # Compute energy then divide by total time to get average power - power_first = self.power_data[gpu_type].gemma_first_scene[num_devices] - power_per_scene = self.power_data[gpu_type].gemma_per_scene[num_devices] - latency_first = self.latency_data[gpu_type].gemma_first_scene[num_devices] - latency_per_scene = self.latency_data[gpu_type].gemma_per_scene[num_devices] - - total_energy = ( - power_first * latency_first - + power_per_scene * latency_per_scene * (self.workflow.total_scenes - 1) - ) - total_time = latency_first + latency_per_scene * (self.workflow.total_scenes - 1) - - return total_energy / total_time if total_time > 0 else power_first - - if model_name == Model.FLUX: - return self.power_data[gpu_type][model_name, num_devices] - - if model_name == Model.HF: - return self.power_data[gpu_type][model_name, num_devices] - - if model_name == Model.HF_VAE: - return self.power_data[gpu_type][model_name, num_devices] - - if model_name == Model.FT: - return self.power_data[gpu_type][model_name, num_devices] - - if model_name == Model.FT_VAE: - return self.power_data[gpu_type][model_name, num_devices] - - if model_name == Model.UPSCALER: - return self.power_data[gpu_type][model_name, num_devices] - - if model_name == Model.OTHERS: - # OTHERS model uses minimal GPU power (mostly idle) - # See models.py OthersModelAllocation.calculate_energy - only uses idle power 
- return self.power_data[gpu_type]["idle"] - - raise ValueError(f"Unknown model_name {model_name}") - - -def milp_to_models_dict( - m: ConcreteModel, - gpu_types: list[GPUType], - model_names: list[Model], - instance_ids: list[int], - idx: Callable[[GPUType, Model, int], tuple[str, str, int]], - workflow: WorkflowConfig, - power_data: Optional[PowerData], - policy: Policy, -) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - MILP result to models dictionary. - """ - if m is None: - return {} - - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {} - for gpu_type in gpu_types: - models[gpu_type] = {} - for model_name in model_names: - models[gpu_type][model_name] = [] - for instance_id in instance_ids: - key = idx(gpu_type, model_name, instance_id) - gpus_val = m.gpus[key].value - work_val = m.work[key].value - if gpus_val is None or work_val is None: - continue - # round() snaps solver floats to nearest int (e.g. 1.9999 -> 2); - # banker's rounding is irrelevant here since MILP values can be - # near-integer, like 1.999 and 2.001 - gpus = int(round(gpus_val)) - work = int(round(work_val)) - if gpus > 0 and work > 0: - model_allocation = get_model_allocation( - model=model_name, - gpu_type=gpu_type, - devices=gpus, - replicas=1, - ) - model_allocation.work = work - model_allocation.time = m.time[key].value - model_allocation.time_first = m.ttff[key].value - model_allocation.calculate_energy( - workflow=workflow, - power_data=power_data, - total_time_s=m.makespan.value - ) - model_allocation.calculate_cost( - policy, - total_time_s=m.makespan.value - ) - models[gpu_type][model_name].append(model_allocation) - merged_models = models # coalesce_models(models) - return merged_models - - -def get_objective( - m: ConcreteModel, - allocator: Objective, - solver_name: str, -) -> Optional[OptObjective]: - if allocator == Objective.TIME: - return OptObjective(expr=m.makespan, sense=minimize) - - if allocator == Objective.TTFF: - return 
OptObjective(expr=m.ttff_user, sense=minimize) - - if allocator == Objective.TTFF_COST: - # Note: This creates a bilinear (nonconvex) objective - requires Gurobi - if solver_name == "gurobi": - return OptObjective(expr=m.ttff_user * m.cost, sense=minimize) - logging.warning("TTFF_COST using linear utility function.") - a = 1.0 - b = 1.0 - return OptObjective(expr=a * m.ttff_user + b * m.cost, sense=minimize) - - if allocator == Objective.COST: - return OptObjective(expr=m.cost, sense=minimize) - - if allocator == Objective.ENERGY: - return OptObjective(expr=m.energy, sense=minimize) - - if allocator == Objective.TIME_ENERGY: - # Note: This creates a bilinear objective - requires Gurobi - if solver_name == "gurobi": - return OptObjective(expr=m.makespan * m.energy, sense=minimize) - logging.warning("TIME_ENERGY using linear utility function.") - a = 1.0 - b = 1.0 - return OptObjective(expr=a * m.makespan + b * m.energy, sense=minimize) - - if allocator == Objective.ENERGY_COST: - if solver_name == "gurobi": - return OptObjective(expr=m.energy * m.cost, sense=minimize) - logging.warning("ENERGY_COST using linear utility function.") - a = 1.0 - b = 1.0 - return OptObjective(expr=a * m.energy + b * m.cost, sense=minimize) - - if allocator == Objective.FIFO: - logging.error("FIFO not implemented in MILP") - - if allocator == Objective.RANDOM: - return None # No objective, just find a feasible solution - - if allocator == Objective.NONE: - return None - - return OptObjective(expr=m.makespan, sense=minimize) - - -def _save_solution( - m: ConcreteModel, - save_solution_path: str, -) -> None: - solution = { - var.name: var.value - for var in m.component_data_objects(Var, active=True) - if var.value is not None - } - with open(save_solution_path, "w", encoding="utf-8") as output_file: - json.dump(solution, output_file, indent=2) - - -def _load_warm_start( - m: ConcreteModel, - warm_start_path: str, -) -> None: - """Load warm start values from a JSON file and apply them to 
the model variables.""" - with open(warm_start_path, "r", encoding="utf-8") as input_file: - warm_start_values = json.load(input_file) - - warm_start_applied = 0 - for var in m.component_data_objects(Var, active=True): - if var.name in warm_start_values: - var.set_value(warm_start_values[var.name]) - warm_start_applied += 1 - - logging.info( - f"Warm start loaded from {warm_start_path}. " - f"Applied values to {warm_start_applied} variables." - ) diff --git a/simulator/model_allocator.py b/simulator/model_allocator.py deleted file mode 100644 index ab1c7e39..00000000 --- a/simulator/model_allocator.py +++ /dev/null @@ -1,282 +0,0 @@ -""" -Defines the ModelAllocator abstract base class and its interface for model allocation strategies. -""" - -from __future__ import annotations - -from typing import Optional - -from abc import ABC -from abc import abstractmethod - -from sim_types import GPUType -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import Policy -from sim_types import WorkflowConfig -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import Result - -from models import FluxModelAllocation -from models import GemmaModelAllocation -from models import HFModelAllocation -from models import HFVAEModelAllocation -from models import FTModelAllocation -from models import FTVAEModelAllocation -from models import UpscalerModelAllocation -from models import OthersModelAllocation - -from policies import NAIVE_POLICY - - -class ModelAllocator(ABC): - """ - Abstract base class for model allocators. 
- """ - - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = NAIVE_POLICY, - ) -> None: - self.workflow = workflow - self.latency_data = latency_data - self.power_data = power_data - self.policy = policy - - @abstractmethod - def allocate( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - ) -> Result: - """Allocate models to GPUs and return the provisioning result.""" - ... - - def _init_single_server_models( - self, - gpu_type: GPUType, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Initialize model allocations for a single server (8 GPUs or fewer). - Each model gets a single allocation entry. - """ - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = { - gpu_type: { - Model.GEMMA: [ - GemmaModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1) - ], - Model.FLUX: [ - FluxModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1) - ], - Model.HF: [ - HFModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=2) - ], - Model.HF_VAE: [ - HFVAEModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1) - ], - Model.FT: [ - FTModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1) - ], - Model.FT_VAE: [ - FTVAEModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1) - ], - Model.UPSCALER: [ - UpscalerModelAllocation( - gpu_type=gpu_type) - ], - Model.OTHERS: [ - OthersModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1) # + 1 for Kokoro/YOLO - ], - }, - } - - if self.policy.use_upscaler: - # HF -> UPSCALER - models[gpu_type][Model.HF][0].replicas -= 1 - models[gpu_type][Model.UPSCALER][0].replicas += 1 - - if not self.policy.is_disaggregated(Model.HF): - # HF_VAE -> HF - models[gpu_type][Model.HF_VAE][0].replicas -= 1 - models[gpu_type][Model.HF][0].replicas += 1 - if not self.policy.is_disaggregated(Model.FT): - # FT_VAE -> FT - models[gpu_type][Model.FT_VAE][0].replicas -= 1 - 
models[gpu_type][Model.FT][0].replicas += 1 - - self._zero_out_unused_models(models) - return models - - def _init_single_device_models( - self, - gpu_type: GPUType, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Initialize model allocations for a single GPU type with >8 GPUs. - Each model gets two allocation entries (active and inactive). - """ - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = { - gpu_type: { - Model.GEMMA: [ - GemmaModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1), - GemmaModelAllocation( - gpu_type=gpu_type), - ], - Model.FLUX: [ - FluxModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1), - FluxModelAllocation( - gpu_type=gpu_type), - ], - Model.HF: [ - HFModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1), - HFModelAllocation( - gpu_type=gpu_type), - ], - Model.HF_VAE: [ - HFVAEModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1), - HFVAEModelAllocation( - gpu_type=gpu_type), - ], - Model.FT: [ - FTModelAllocation( - gpu_type=gpu_type, - devices=2, replicas=1), - FTModelAllocation( - gpu_type=gpu_type), - ], - Model.FT_VAE: [ - FTVAEModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1), - FTVAEModelAllocation( - gpu_type=gpu_type), - ], - Model.UPSCALER: [ - UpscalerModelAllocation( - gpu_type=gpu_type), - UpscalerModelAllocation( - gpu_type=gpu_type), - ], - Model.OTHERS: [ - OthersModelAllocation( - gpu_type=gpu_type, - devices=1, replicas=1), - OthersModelAllocation( - gpu_type=gpu_type), - ], - }, - } - - if self.policy.use_upscaler: - models[gpu_type][Model.UPSCALER][0].replicas = 1 - - if not self.policy.is_disaggregated(Model.HF): - # HF_VAE -> HF - models[gpu_type][Model.HF_VAE][0].replicas -= 1 - models[gpu_type][Model.HF][0].replicas += 1 - if not self.policy.is_disaggregated(Model.FT): - # FT_VAE -> FT - models[gpu_type][Model.FT_VAE][0].replicas -= 1 - models[gpu_type][Model.FT][0].replicas += 1 - - self._zero_out_unused_models(models) - return 
models - - def _init_both_devices_models( - self, - gpu_type1: GPUType, - gpu_type2: GPUType, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Initialize model allocations for two GPU types. - gpu_type1 gets GEMMA, FLUX, OTHERS; gpu_type2 gets HF, VAE, FT, UPSCALER. - """ - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = { - gpu_type1: { - Model.GEMMA: [GemmaModelAllocation( - gpu_type=gpu_type1, - devices=1, replicas=1)], - Model.FLUX: [FluxModelAllocation( - gpu_type=gpu_type1, - devices=1, replicas=1)], - Model.HF: [], - Model.HF_VAE: [], - Model.FT: [], - Model.FT_VAE: [], - Model.UPSCALER: [], - Model.OTHERS: [OthersModelAllocation( - gpu_type=gpu_type1, - devices=1, replicas=1)], # + 1 for Kokoro/YOLO - }, - gpu_type2: { - Model.GEMMA: [], - Model.FLUX: [], - Model.HF: [HFModelAllocation( - gpu_type=gpu_type2, - devices=1, replicas=1)], - Model.HF_VAE: [HFVAEModelAllocation( - gpu_type=gpu_type2, - devices=1, replicas=1)], - Model.FT: [FTModelAllocation( - gpu_type=gpu_type2, - devices=2, replicas=1)], - Model.FT_VAE: [FTVAEModelAllocation( - gpu_type=gpu_type2, - devices=1, replicas=1)], - Model.UPSCALER: [UpscalerModelAllocation( - gpu_type=gpu_type2)], - Model.OTHERS: [], - }, - } - - if not self.policy.is_disaggregated(Model.HF): - # HF_VAE -> HF - models[gpu_type2][Model.HF_VAE][0].replicas -= 1 - models[gpu_type2][Model.HF][0].replicas += 1 - if not self.policy.is_disaggregated(Model.FT): - # FT_VAE -> FT - models[gpu_type2][Model.FT_VAE][0].replicas -= 1 - models[gpu_type2][Model.FT][0].replicas += 1 - - if self.policy.use_upscaler: - models[gpu_type2][Model.UPSCALER][0].replicas = 1 - - self._zero_out_unused_models(models) - return models - - def _zero_out_unused_models( - self, - models: dict[GPUType, dict[Model, list[ModelAllocation]]], - ) -> None: - """Zero out replicas for models not in the workflow.""" - for gpu_type in models: - for model in Model: - if model not in self.workflow.models: - for alloc in 
models[gpu_type][model]: - alloc.replicas = 0 diff --git a/simulator/models.py b/simulator/models.py deleted file mode 100644 index 9a56ab79..00000000 --- a/simulator/models.py +++ /dev/null @@ -1,811 +0,0 @@ -""" -Contains the definition for each model. -It includes the calculations for time, energy, and cost. -""" -from __future__ import annotations - -import math - -from typing import override -from typing import Callable -from typing import Optional -from typing import Type -from typing import ClassVar - -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import ModelAllocation -from sim_types import Model -from sim_types import Policy -from sim_types import QualityLevel -from sim_types import WorkflowConfig -from sim_types import GPUType - -from constants import TOTAL_INPUT_TOKENS - - -# ModelAllocation Factory -ModelAllocationCls = Type[ModelAllocation] - -_MODEL_ALLOCATION_REGISTRY: dict[Model, ModelAllocationCls] = {} - - -def register_model( - model: Model -) -> Callable[[ModelAllocationCls], ModelAllocationCls]: - """Register a ModelAllocation class for the factory.""" - def decorator(cls: ModelAllocationCls) -> ModelAllocationCls: - _MODEL_ALLOCATION_REGISTRY[model] = cls - return cls - return decorator - - -def get_model_allocation( - *, - model: Model, - gpu_type: GPUType, - devices: int = 1, - replicas: int = 0, -) -> ModelAllocation: - """Factory to get the ModelAllocation instance for a specific model.""" - if model not in _MODEL_ALLOCATION_REGISTRY: - raise ValueError(f"No ModelAllocation for model {model}") - cls = _MODEL_ALLOCATION_REGISTRY[model] - return cls( - gpu_type=gpu_type, - devices=devices, - replicas=replicas, - ) - - -def _calculate_total_time( - total_work: float, - num_replicas: int, - time_per_work: float, -) -> float: - """Calculate total time given work, replicas, and time per work unit.""" - if num_replicas <= 0: - return 0.0 - total_time = (total_work / num_replicas) * time_per_work - if 
total_time < time_per_work: # We cannot go faster than single work unit time - total_time = time_per_work - return total_time - - -def assert_pixel_config( - workflow: WorkflowConfig -) -> None: - """Verify that the workflow's pixel configuration is valid for upscaling.""" - from sim_types import RESOLUTION_PIXELS - assert 0 < RESOLUTION_PIXELS[QualityLevel.MEDIUM] < RESOLUTION_PIXELS[QualityLevel.HIGH] - - -@register_model(Model.GEMMA) -class GemmaModelAllocation(ModelAllocation): - """Gemma model allocation.""" - model: ClassVar[Model] = Model.GEMMA - - @override - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.GEMMA, 1) - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices] - latency_per_scene = latency_data[self.gpu_type].gemma_per_scene[self.devices] - latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS - latency_per_scene *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS - total_work = workflow.model_work.get(Model.GEMMA, 1) - if total_work > 1: - num_scenes = math.ceil(work_pct * total_work) - total_time_per_scene = latency_first + latency_per_scene * (num_scenes - 1) - self.time = _calculate_total_time( - num_scenes, - self.replicas, - total_time_per_scene / num_scenes) - else: - self.time = latency_first + latency_per_scene * (workflow.total_scenes - 1) - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices] - latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS - 
self.time_first = latency_first - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - # Gemma energy - latency_first = self.time_first - latency_per_scene = max(0.0, self.time - latency_first) - power_first = power_data[self.gpu_type].gemma_first_scene[self.devices] - power_per_scene = power_data[self.gpu_type].gemma_per_scene[self.devices] - self.energy = \ - power_first * latency_first + \ - power_per_scene * latency_per_scene * (workflow.total_scenes - 1) - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += power_idle * time_idle - return self.energy - - -@register_model(Model.FLUX) -class FluxModelAllocation(ModelAllocation): - """Flux model allocation.""" - model: ClassVar[Model] = Model.FLUX - - def _calc_time_per_scene( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - return ( - latency_data[self.gpu_type][self.model, self.devices] - * workflow.num_steps[Model.FLUX] - ) - - @override - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.FLUX, 1) - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - time_per_scene = self._calc_time_per_scene( - policy, - workflow, - latency_data, - ) - total_work = workflow.model_work.get(Model.FLUX, 1) - if total_work > 1: - num_scenes = math.ceil(work_pct * total_work) - self.time = _calculate_total_time( - num_scenes, - self.replicas, - time_per_scene) - else: - self.time = time_per_scene - return self.time - - 
@override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - time_per_scene = self._calc_time_per_scene( - policy, - workflow, - latency_data, - ) - self.time_first = time_per_scene - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - power_flux = power_data[self.gpu_type][Model.FLUX, self.devices] - self.energy = power_flux * self.time * self.replicas - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += power_idle * time_idle - return self.energy - - -@register_model(Model.HF) -class HFModelAllocation(ModelAllocation): - """HunyuanFramePack model allocation.""" - model: ClassVar[Model] = Model.HF - - def _calc_time_per_frame( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - return ( - latency_data[self.gpu_type][self.model, self.devices] - * workflow.get_resolution_scale(policy.use_upscaler) - * workflow.num_steps[Model.HF] - ) - - def _calc_time_per_subscene( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - return ( - workflow.per_subscene_frames[Model.HF] - / workflow.hf_frames[workflow.frames_per_step_idx] - * latency_data[self.gpu_type][self.model, self.devices] - * workflow.get_resolution_scale(policy.use_upscaler) # latency_ratio - * workflow.num_steps[Model.HF] - ) - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if self.get_num_gpus() == 0: - self.time = 0.0 - 
return self.time - - hf_time_per_subscene = self._calc_time_per_subscene( - policy, - workflow, - latency_data, - ) - self.time = _calculate_total_time( - math.ceil(work_pct * workflow.total_subscenes), - self.replicas, - hf_time_per_subscene) - - if not policy.is_disaggregated(Model.HF): - # Include VAE time in the same GPU when disaggregation is disabled - hf_vae_time_per_frame = ( - latency_data[self.gpu_type][Model.HF_VAE, 1] # VAE is single-device only in current policy - * workflow.get_resolution_scale(policy.use_upscaler) - / workflow.hf_frames[workflow.frames_per_step_idx] - ) - self.time += _calculate_total_time( - math.ceil(work_pct * workflow.total_frames[Model.HF]), - self.replicas, - hf_vae_time_per_frame) - - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - - if policy.is_disaggregated(Model.HF): - # HF for the first chunk - self.time_first = min( - # Option 1: the first few frames until the first chunk is done - workflow.hf_frames[0] - / workflow.hf_frames[workflow.frames_per_step_idx] - * self._calc_time_per_frame( - policy, - workflow, - latency_data - ), - # Option 2: the full subscene - self._calc_time_per_subscene( - policy, - workflow, - latency_data - ), - ) - else: - # HF + VAE for the full subscene - hf_time_per_subscene = self._calc_time_per_subscene( - policy, - workflow, - latency_data) - hf_vae_time_per_subscene = ( - workflow.per_subscene_frames[Model.HF] - / workflow.hf_frames[workflow.frames_per_step_idx] - * latency_data[self.gpu_type][Model.HF_VAE, 1] # VAE is single-device only in current policy - * workflow.get_resolution_scale(policy.use_upscaler) - ) - self.time_first = hf_time_per_subscene + hf_vae_time_per_subscene - - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: 
Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - power_hf = power_data[self.gpu_type][Model.HF, self.devices] - self.energy = power_hf * self.time * self.replicas - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += power_idle * time_idle - return self.energy - - @override - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.HF, 1) - - -@register_model(Model.HF_VAE) -class HFVAEModelAllocation(ModelAllocation): - """HunyuanFramePack VAE model allocation.""" - model: ClassVar[Model] = Model.HF_VAE - - def _calc_time_per_frame( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - return ( - latency_data[self.gpu_type][Model.HF_VAE, self.devices] - * workflow.get_resolution_scale(policy.use_upscaler) - / workflow.hf_frames[workflow.frames_per_step_idx] - ) - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if not policy.is_disaggregated(Model.HF): - assert self.get_num_gpus() == 0 - self.time = 0.0 - return self.time - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - - vae_time_per_frame = self._calc_time_per_frame( - policy, - workflow, - latency_data - ) - self.time = _calculate_total_time( - math.ceil(workflow.total_frames[Model.HF] * work_pct), - self.replicas, - vae_time_per_frame) - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if not policy.is_disaggregated(Model.HF): - assert self.get_num_gpus() == 0 - self.time_first = 0.0 - return self.time_first - if self.get_num_gpus() == 0: - self.time_first = 
0.0 - return self.time_first - - vae_time_per_frame = self._calc_time_per_frame( - policy, - workflow, - latency_data, - ) - num_frames = workflow.per_subscene_frames[Model.HF] - self.time_first = num_frames * vae_time_per_frame - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - self.energy = power_data[self.gpu_type][Model.HF_VAE, self.devices] * self.time * self.replicas - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += power_idle * time_idle - return self.energy - - @override - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.HF_VAE, 1) - - -@register_model(Model.FT) -class FTModelAllocation(ModelAllocation): - """FantasyTalking model allocation.""" - model: ClassVar[Model] = Model.FT - - def _calc_time_per_subscene( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - return ( - workflow.per_subscene_frames[Model.FT] - / workflow.ft_frames[workflow.frames_per_step_idx] - * latency_data[self.gpu_type][Model.FT, self.devices] - * workflow.get_resolution_scale(policy.use_upscaler) - * workflow.num_steps[Model.FT] - ) - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - - ft_time_per_subscene = self._calc_time_per_subscene( - policy, - workflow, - latency_data, - ) - self.time = _calculate_total_time( - math.ceil(work_pct * workflow.total_subscenes), - self.replicas, - ft_time_per_subscene) - - if not policy.is_disaggregated(Model.FT): - # Include VAE 
time in the same GPU when disaggregation is disabled - # Note: VAE latency uses devices=1 as VAE processing is not parallelized - # across multiple devices in the same way as the main FT diffusion - ft_vae_time_per_frame = ( - latency_data[self.gpu_type][Model.FT_VAE, 1] - * workflow.get_resolution_scale(policy.use_upscaler) - / workflow.ft_frames[workflow.frames_per_step_idx] - ) - self.time += _calculate_total_time( - math.ceil(work_pct * workflow.total_frames[Model.FT]), - self.replicas, - ft_vae_time_per_frame) - - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - - ft_time_per_subscene = self._calc_time_per_subscene( - policy, - workflow, - latency_data, - ) - self.time_first = ft_time_per_subscene - - if not policy.is_disaggregated(Model.FT): - # Include VAE time_first when FT-VAE is not disaggregated - # Note: VAE latency uses devices=1 (see note in calculate_time) - ft_vae_time_per_subscene = ( - workflow.per_subscene_frames[Model.FT] - / workflow.ft_frames[workflow.frames_per_step_idx] - * latency_data[self.gpu_type][Model.FT_VAE, 1] - * workflow.get_resolution_scale(policy.use_upscaler) - ) - self.time_first += ft_vae_time_per_subscene - - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - power_ft = power_data[self.gpu_type][Model.FT, self.devices] - self.energy = power_ft * self.time * self.replicas - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += power_idle * time_idle - return self.energy - - @override - def get_max_replicas( - 
self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.FT, 1) - - -@register_model(Model.FT_VAE) -class FTVAEModelAllocation(ModelAllocation): - """FantasyTalking VAE model allocation.""" - model: ClassVar[Model] = Model.FT_VAE - - def _calc_time_per_frame( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - return ( - latency_data[self.gpu_type][Model.FT_VAE, self.devices] - * workflow.get_resolution_scale(policy.use_upscaler) - / workflow.ft_frames[workflow.frames_per_step_idx] - ) - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if not policy.is_disaggregated(Model.FT): - assert self.get_num_gpus() == 0 - self.time = 0.0 - return self.time - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - - vae_time_per_frame = self._calc_time_per_frame( - policy, - workflow, - latency_data, - ) - self.time = _calculate_total_time( - math.ceil(workflow.total_frames[Model.FT] * work_pct), - self.replicas, - vae_time_per_frame) - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if not policy.is_disaggregated(Model.FT): - assert self.get_num_gpus() == 0 - self.time_first = 0.0 - return self.time_first - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - - vae_time_per_frame = self._calc_time_per_frame( - policy, - workflow, - latency_data, - ) - num_frames = workflow.per_subscene_frames[Model.FT] - self.time_first = num_frames * vae_time_per_frame - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - self.energy = 
power_data[self.gpu_type][Model.FT_VAE, self.devices] * self.time * self.replicas - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += power_idle * time_idle - return self.energy - - @override - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.FT_VAE, 1) - - -@register_model(Model.UPSCALER) -class UpscalerModelAllocation(ModelAllocation): - """Upscaler model allocation.""" - model: ClassVar[Model] = Model.UPSCALER - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - self.time = _calculate_total_time( - math.ceil(work_pct * workflow.total_frames[Model.FT]), - self.replicas, - latency_data[self.gpu_type][self.model, self.devices]) - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if not policy.use_upscaler: - assert self.get_num_gpus() == 0 - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - - self.time_first = ( - workflow.per_subscene_frames[Model.FT] - * latency_data[self.gpu_type][self.model, self.devices] - ) - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - # Assumes a single device and multiple replicas - self.energy = power_data[self.gpu_type][self.model, self.devices] * self.time * self.replicas - # Idle energy - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - time_idle = total_time_s - self.time - if time_idle > 0: - self.energy += 
power_idle * time_idle - return self.energy - - @override - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - return workflow.model_work.get(Model.UPSCALER, 1) - - -@register_model(Model.OTHERS) -class OthersModelAllocation(ModelAllocation): - """Others: Kokoro + YOLO.""" - model: ClassVar[Model] = Model.OTHERS - - @override - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - if self.get_num_gpus() == 0: - self.time = 0.0 - return self.time - - self.time = ( - workflow.total_scenes - * latency_data[self.gpu_type][self.model, self.devices] - ) - return self.time - - @override - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - if self.get_num_gpus() == 0: - self.time_first = 0.0 - return self.time_first - - self.time_first = latency_data[self.gpu_type][self.model, self.devices] - return self.time_first - - @override - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - if self.get_num_gpus() == 0 or power_data is None: - self.energy = 0.0 - return self.energy - # Idle energy; not much GPU usage - power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() - self.energy = power_idle * self.time - return self.energy diff --git a/simulator/multirequests.py b/simulator/multirequests.py index 4fee5d55..82957c8f 100644 --- a/simulator/multirequests.py +++ b/simulator/multirequests.py @@ -4,23 +4,23 @@ import os from dataclasses import replace -from sim_types import GPUType -from sim_types import Model -from sim_types import QualityLevel -from sim_types import RESOLUTION_PIXELS -from sim_types import Result -from sim_types import WorkflowConfig -from sim_types import LatencyData +from model_provisioner.sim_types import GPUType +from model_provisioner.sim_types import Model +from 
model_provisioner.sim_types import QualityLevel +from model_provisioner.sim_types import RESOLUTION_PIXELS +from model_provisioner.sim_types import Result +from model_provisioner.sim_types import WorkflowConfig +from model_provisioner.sim_types import LatencyData -from data_loading import load_latency_data -from data_loading import load_power_data -from data_loading import load_adaptive_quality_data +from model_provisioner.data_loading import load_latency_data +from model_provisioner.data_loading import load_power_data +from model_provisioner.data_loading import load_adaptive_quality_data -from workflows import PODCAST_WORKFLOW +from model_provisioner.workflows import PODCAST_WORKFLOW -from policies import STREAMWISE_POLICY +from model_provisioner.policies import STREAMWISE_POLICY -from auto_model_allocator import AutoModelAllocator +from model_provisioner.auto_model_allocator import AutoModelAllocator # Queries per minute diff --git a/simulator/naive_baseline.py b/simulator/naive_baseline.py deleted file mode 100644 index 9f9c550c..00000000 --- a/simulator/naive_baseline.py +++ /dev/null @@ -1,484 +0,0 @@ -""" -Naive baseline for the StreamWise workflow allocation problem. 
-""" - -from __future__ import annotations - -from typing import Optional - -from constants import NUM_GPUS_PER_SERVER -from constants import DEVICE_OPTIONS - -from sim_types import Result -from sim_types import GPUType -from sim_types import WorkflowConfig -from sim_types import LatencyData -from sim_types import PowerData -from sim_types import Policy -from sim_types import Solver -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import Objective - -from models import FluxModelAllocation -from models import GemmaModelAllocation -from models import HFModelAllocation -from models import HFVAEModelAllocation -from models import FTModelAllocation -from models import FTVAEModelAllocation -from models import UpscalerModelAllocation -from models import OthersModelAllocation - -from evaluator import evaluate_model_allocation - -from policies import NAIVE_POLICY -from policies import MAX_DEVICES - -from model_allocator import ModelAllocator - - -class NaiveAllocator(ModelAllocator): - """ - Naive allocator that implements a simple heuristic. 
- """ - def __init__( - self, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - policy: Policy = NAIVE_POLICY, - ) -> None: - super().__init__( - workflow, - latency_data, - power_data, - policy, - ) - assert self.policy.solver == Solver.NAIVE - assert self.policy.objective == Objective.TTFF - - def allocate( - self, - num_gpus: dict[GPUType, int], - verbose: bool = False, - ) -> Result: - total_gpus = sum(num_gpus.values()) - assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})" - - gpu_types = [ - gpu_type - for gpu_type, count in num_gpus.items() - if count > 0 - ] - assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})" - gpu_type1 = gpu_types[0] - - if len(gpu_types) == 1: - models = self._naive_single( - num_gpus.get(gpu_type1, 0), - gpu_type=gpu_type1, - ) - else: - # Mixed setup of GPU types (e.g., A100 and H100) - models = self._naive_two(num_gpus) - - result = evaluate_model_allocation( - models=models, - num_gpus=num_gpus, - workflow=self.workflow, - latency_data=self.latency_data, - power_data=self.power_data, - policy=self.policy, - round_up_cost_to_server=True, - ) - return result - - def _naive_single( - self, - num_gpus: int, - gpu_type: GPUType, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """Naive allocation for single GPU type.""" - return self._naive_parallelism_allocation(gpu_type, num_gpus) - - def _naive_two( - self, - num_gpus: dict[GPUType, int], - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """Naive allocation for two GPU types.""" - gpu_types = list(num_gpus.keys()) - assert len(gpu_types) == 2 - assert len(num_gpus) == 2 - gpu_type1 = gpu_types[0] - gpu_type2 = gpu_types[1] - assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1] - assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2] - - # Initialize allocations with minimal setup - models: dict[GPUType, dict[Model, 
list[ModelAllocation]]] = { - gpu_type1: { # 3 x A100s (type1) - Model.GEMMA: [GemmaModelAllocation( - gpu_type=gpu_type1, - devices=1, replicas=1)], - Model.FLUX: [FluxModelAllocation( - gpu_type=gpu_type1, - devices=1, replicas=1)], - Model.HF: [], - Model.HF_VAE: [], - Model.FT: [], - Model.FT_VAE: [], - Model.UPSCALER: [], - Model.OTHERS: [OthersModelAllocation( - gpu_type=gpu_type1, - devices=1, replicas=1)], # + 1 for Kokoro/YOLO - }, - gpu_type2: { # 4 (+1) X H100 GPUs (type2) - Model.GEMMA: [], - Model.FLUX: [], - Model.HF: [HFModelAllocation( - gpu_type=gpu_type2, - devices=1, replicas=1)], - Model.HF_VAE: [HFVAEModelAllocation( - gpu_type=gpu_type2, - devices=1, replicas=1)], - Model.FT: [FTModelAllocation( - gpu_type=gpu_type2, - devices=2, replicas=1)], - Model.FT_VAE: [FTVAEModelAllocation( - gpu_type=gpu_type2, - devices=1, replicas=1)], - Model.UPSCALER: [UpscalerModelAllocation( - gpu_type=gpu_type2)], - Model.OTHERS: [], - }, - } - - # Calculate remaining: starting - assigned - if not self.policy.is_disaggregated(Model.HF): - models[gpu_type2][Model.HF][0].replicas = 2 - models[gpu_type2][Model.HF_VAE][0].replicas = 0 - if not self.policy.is_disaggregated(Model.FT): - models[gpu_type2][Model.FT_VAE][0].replicas = 0 - - if self.policy.use_upscaler: - models[gpu_type2][Model.UPSCALER][0].replicas = 1 - - models_gpu_type1 = self._naive_parallelism_allocation( - gpu_type1, - num_gpus.get(gpu_type1, 0), - ) - models_gpu_type2 = self._naive_parallelism_allocation( - gpu_type2, - num_gpus.get(gpu_type2, 0), - # Already allocated in first GPU type - skip_non_paralelizable_models=True, - ) - models[gpu_type1] = models_gpu_type1[gpu_type1] - models[gpu_type2] = models_gpu_type2[gpu_type2] - - # Apply per-GPU-type overrides after allocation - if self.policy.use_upscaler: - models[gpu_type2][Model.UPSCALER][0].replicas = 1 - - return models - - def _naive_parallelism_allocation( - self, - gpu_type: GPUType, - num_devices: int, - skip_non_paralelizable_models: 
bool = False, - ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Device allocation for naive parallelism. - Max devices for each model. - Allocate devices to each model proportional to their max devices. - """ - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = { - gpu_type: { - Model.GEMMA: [GemmaModelAllocation( - gpu_type=gpu_type, - replicas=1)], - Model.FLUX: [FluxModelAllocation( - gpu_type=gpu_type, - replicas=1)], - Model.HF: [HFModelAllocation( - gpu_type=gpu_type, - replicas=1)], - Model.HF_VAE: [HFVAEModelAllocation( - gpu_type=gpu_type, - replicas=1 if self.policy.is_disaggregated(Model.HF) else 0)], - Model.FT: [FTModelAllocation( - gpu_type=gpu_type, - replicas=4)], - Model.FT_VAE: [FTVAEModelAllocation( - gpu_type=gpu_type, - replicas=1 if self.policy.is_disaggregated(Model.FT) else 0)], - Model.OTHERS: [OthersModelAllocation( - gpu_type=gpu_type, - replicas=1)], # + 1 for Kokoro/YOLO - Model.UPSCALER: [UpscalerModelAllocation( - gpu_type=gpu_type, - replicas=1 if self.policy.use_upscaler else 0)], - }, - } - - # Zero out replicas for models not in workflow - for model in Model: - if model not in self.workflow.models: - for alloc in models[gpu_type][model]: - alloc.replicas = 0 - - # Zero out replicas for models that are not parallelizable when skip_non_paralelizable_models is True - if skip_non_paralelizable_models: - for model in Model: - if not self.workflow.is_parallelizable(model): - for alloc in models[gpu_type][model]: - alloc.replicas = 0 - - # Assert only 1 allocation instance per model for naive parallelism - for model in Model: - assert len(models[gpu_type][model]) == 1, \ - f"Expected only 1 allocation instance for {model}, got {len(models[gpu_type][model])}" - - alloc_id = 0 - model_gemma = models[gpu_type][Model.GEMMA][alloc_id] - model_flux = models[gpu_type][Model.FLUX][alloc_id] - model_hf = models[gpu_type][Model.HF][alloc_id] - model_vae = models[gpu_type][Model.HF_VAE][alloc_id] - model_ft = 
models[gpu_type][Model.FT][alloc_id] - model_ft_vae = models[gpu_type][Model.FT_VAE][alloc_id] - model_upscaler = models[gpu_type][Model.UPSCALER][alloc_id] - - # TODO do we need to do something for Model.OTHERS - - if num_devices == 8: - # single server case, use fixed allocation - if Model.FT in self.workflow.models: - model_ft.replicas = 4 - if self.policy.use_upscaler and Model.UPSCALER in self.workflow.models: - model_upscaler.replicas = 1 - if Model.FT in self.workflow.models: - model_ft.replicas -= 1 - if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models: - model_vae.replicas = 1 - if Model.FT in self.workflow.models: - model_ft.replicas -= 1 - if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models: - model_ft_vae.replicas = 1 - if Model.FT in self.workflow.models: - model_ft.replicas -= 1 - return models - - init_num_devices = sum([ - model[0].devices * model[0].replicas - for model in models[gpu_type].values() - ]) - - # Allocate devices proportional to each model's max devices - max_devices = MAX_DEVICES - models_in_workflow = [ - model - for model in max_devices.keys() - if model in self.workflow.models - ] - if skip_non_paralelizable_models: - for model in max_devices.keys(): - if not self.workflow.is_parallelizable(model): - models_in_workflow.remove(model) - - total_max_devices = sum([ - max_devices[model] - for model in models_in_workflow - ]) - for model in models_in_workflow: - # Calculate the number of devices to allocate for the model, proportional to its max devices among others - alloc_devices = int((num_devices - init_num_devices) * max_devices[model] / total_max_devices) - if model == Model.GEMMA: - max_devices_gemma = max_devices[Model.GEMMA] - if self.latency_data: - max_devices_gemma = min(max_devices_gemma, self.latency_data[gpu_type].get_max_parallelism(model)) - model_gemma.devices += min(alloc_devices, max_devices_gemma) - # Round down nearest in DEVICE_OPTIONS_GEMMA - 
num_gemma_devices = max([ - d - for d in DEVICE_OPTIONS[Model.GEMMA] - if d <= model_gemma.devices - ]) - model_gemma.devices = num_gemma_devices - elif model == Model.FLUX: - max_devices_flux = max_devices[Model.FLUX] - if self.latency_data: - max_devices_flux = min(max_devices_flux, self.latency_data[gpu_type].get_max_parallelism(model)) - model_flux.devices += min(alloc_devices, max_devices_flux) - # Round down nearest in DEVICE_OPTIONS_FLUX - model_flux.devices = max([ - d - for d in DEVICE_OPTIONS[Model.FLUX] - if d <= model_flux.devices - ]) - elif model == Model.HF: - max_devices_hf = max_devices[Model.HF] - if self.latency_data: - max_devices_hf = min(max_devices_hf, self.latency_data[gpu_type].get_max_parallelism(model)) - model_hf.replicas += min(alloc_devices, max_devices_hf) - elif model == Model.HF_VAE: - if self.policy.is_disaggregated(Model.HF): - max_devices_vae = max_devices[Model.HF_VAE] - if self.latency_data: - max_devices_vae = min(max_devices_vae, self.latency_data[gpu_type].get_max_parallelism(model)) - model_vae.replicas += min(alloc_devices, max_devices_vae) - elif model == Model.FT: - max_devices_ft = max_devices[Model.FT] - if self.latency_data: - max_devices_ft = min(max_devices_ft, self.latency_data[gpu_type].get_max_parallelism(model)) - model_ft.replicas += min(alloc_devices, max_devices_ft) - elif model == Model.FT_VAE: - if self.policy.is_disaggregated(Model.FT): - max_devices_ft_vae = max_devices[Model.FT_VAE] - if self.latency_data: - max_devices_ft_vae = min( - max_devices_ft_vae, self.latency_data[gpu_type].get_max_parallelism(model) - ) - model_ft_vae.replicas += min(alloc_devices, max_devices_ft_vae) - else: - raise ValueError(f"Unrecognized model {model}") - - remaining_devices = num_devices - for model_name in models[gpu_type].keys(): - for model_alloc in models[gpu_type][model_name]: - remaining_devices -= model_alloc.get_num_gpus() - - # Distribute remaining devices to parallelizable models - distribute_models = 
self.workflow.filter_parallelizable_models( - models_in_workflow, - disaggregation=self.policy.disaggregation, - ) - # Prioritise models that already hold more GPUs - distribute_models.sort( - key=lambda m: models[gpu_type][m][alloc_id].get_num_gpus(), - reverse=True, - ) - num_distribute = len(distribute_models) - if num_distribute > 0 and remaining_devices > 0: - made_progress = True - while remaining_devices > 0 and made_progress: - made_progress = False - for model_name in distribute_models: - gpus_per_replica = models[gpu_type][model_name][alloc_id].devices - if gpus_per_replica <= 0 or remaining_devices < gpus_per_replica: - continue - models[gpu_type][model_name][alloc_id].replicas += 1 - remaining_devices -= gpus_per_replica - made_progress = True - if remaining_devices <= 0: - break - - remaining_devices = num_devices - for model_name in models[gpu_type].keys(): - for model_alloc in models[gpu_type][model_name]: - remaining_devices -= model_alloc.get_num_gpus() - - # TODO we should try to assign all resources - # assert remaining_devices == 0, \ - assert remaining_devices >= 0, \ - f"remaining={remaining_devices} != 0: " \ - f"gpu={gpu_type.value} total={num_devices} remaining={remaining_devices}" - - # Update replicas based on total devices - # Gemma (when parallelizable) - if self.workflow.is_parallelizable(Model.GEMMA) and Model.GEMMA in models_in_workflow: - model_gemma.devices, model_gemma.replicas, remaining_devices = _calculate_naive_num_devices( - model_gemma.devices, - model_gemma.replicas, - remaining_devices, - device_options=DEVICE_OPTIONS[Model.GEMMA], - replica_upper_bound=self.workflow.total_scenes) - - # Flux (when parallelizable) - if self.workflow.is_parallelizable(Model.FLUX) and Model.FLUX in models_in_workflow: - model_flux.devices, model_flux.replicas, remaining_devices = _calculate_naive_num_devices( - model_flux.devices, - model_flux.replicas, - remaining_devices, - device_options=DEVICE_OPTIONS[Model.FLUX], - 
replica_upper_bound=self.workflow.total_scenes) - - # Hunyuan FramePack - if Model.HF in self.workflow.models: - model_hf.devices, model_hf.replicas, remaining_devices = _calculate_naive_num_devices( - model_hf.devices, - model_hf.replicas, - remaining_devices, - device_options=DEVICE_OPTIONS[Model.HF], - replica_upper_bound=self.workflow.total_scenes) - - # Hunyuan FramePack VAE - if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models: - model_vae.devices, model_vae.replicas, remaining_devices = _calculate_naive_num_devices( - model_vae.devices, - model_vae.replicas, - remaining_devices, - device_options=None, - replica_upper_bound=self.workflow.total_frames[Model.HF], - ) - - # Fantasy Talking - if Model.FT in self.workflow.models: - model_ft.devices, model_ft.replicas, remaining_devices = _calculate_naive_num_devices( - model_ft.devices, - model_ft.replicas, - remaining_devices, - device_options=DEVICE_OPTIONS[Model.FT], - replica_upper_bound=self.workflow.total_subscenes, - ) - - # Fantasy Talking VAE - if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models: - model_ft_vae.devices, model_ft_vae.replicas, remaining_devices = _calculate_naive_num_devices( - model_ft_vae.devices, - model_ft_vae.replicas, - remaining_devices, - device_options=None, - replica_upper_bound=self.workflow.total_frames[Model.FT], - ) - - return models - - -def _calculate_naive_num_devices( - num_devices: int, - num_replicas: int, - remaining_devices: int, - device_options: Optional[list[int]] = [1], - replica_upper_bound: Optional[int] = None, -) -> tuple[int, int, int]: - """Find the parallelism that maximizes the device usage.""" - assert remaining_devices >= 0 - - model_quota = num_devices * num_replicas - - if device_options: - best_product = 0 - best_devices_per_replica = 1 - best_replicas = 1 - for devices_per_replica in device_options: - if devices_per_replica > model_quota: - continue - max_replicas = model_quota // 
devices_per_replica - if replica_upper_bound and max_replicas > replica_upper_bound: - max_replicas = replica_upper_bound - product = devices_per_replica * max_replicas - if product > best_product: - best_product = product - best_devices_per_replica = devices_per_replica - best_replicas = max_replicas - else: - # start with parallelism=1 instead - best_devices_per_replica = 1 - best_replicas = model_quota - - num_devices = best_devices_per_replica - num_replicas = best_replicas - remaining_devices += model_quota - num_replicas * num_devices - - return num_devices, num_replicas, remaining_devices diff --git a/simulator/plot_utils.py b/simulator/plot_utils.py index 4b0d5849..2ec13de9 100644 --- a/simulator/plot_utils.py +++ b/simulator/plot_utils.py @@ -10,12 +10,12 @@ from typing import Optional -from utils import get_pareto_frontier +from model_provisioner.utils import get_pareto_frontier -from sim_types import ProvisioningResult -from sim_types import GPUType -from sim_types import Model -from sim_types import QualityLevel +from model_provisioner.sim_types import ProvisioningResult +from model_provisioner.sim_types import GPUType +from model_provisioner.sim_types import Model +from model_provisioner.sim_types import QualityLevel FIG_SIZE = (7, 5) diff --git a/simulator/policies.py b/simulator/policies.py deleted file mode 100644 index 3f670f93..00000000 --- a/simulator/policies.py +++ /dev/null @@ -1,252 +0,0 @@ -from __future__ import annotations - -from sim_types import Objective -from sim_types import Policy -from sim_types import GPUType -from sim_types import Model -from sim_types import Solver - -from constants import GPU_RESERVED_COST -from constants import GPU_SPOT_COST - - -# Max devices for each model -# the logic is to allocate devices to each model proportional to their max devices -MAX_DEVICES = { - Model.GEMMA: 8, - Model.FLUX: 16, - Model.HF: 40, - Model.HF_VAE: 1, - Model.FT: 40, - Model.FT_VAE: 1, -} - -# Max iterations for the optimization loop 
to prevent infinite loops in case of non-monotonic allocators or other issues -MAX_ITERATIONS = 100 - -# Set to True if we want to use up all GPUs if there's no further improvements in the greedy optimization loop -USE_ALL_GPUS = True - -# Default StreamWise policy configuration -# TODO: Add a meta policy that picks the best among disaggregation options for HF/FT -STREAMWISE_POLICY = Policy( - name="streamwise", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TTFF_COST, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=True, - hardware=list(GPUType), -) - -STREAMWISE_MILP_POLICY = Policy( - name="streamwise", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TTFF_COST, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=True, - hardware=list(GPUType), - solver=Solver.GUROBI, -) - - -""" -HexGen policy configuration. -""" -HEXGEN_POLICY = Policy( - name="hexgen", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF, # Does not account for cost - disaggregation={ - Model.HF: True, - Model.FT: False, - }, # Dissagregation - use_upscaler=False, - hardware=[ # Multiple hardware - GPUType.A100, - GPUType.H100, - GPUType.H200, - GPUType.GB200, - ], - solver=Solver.HEXGEN, -) - - -""" -Helix policy configuration. -Reference: https://github.com/Thesys-lab/Helix-ASPLOS25 -Optimizes models one-by-one following MODEL_ORDER using MILP. -""" -HELIX_POLICY = Policy( - name="helix", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF, # Does not account for cost - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=False, - hardware=list(GPUType), - solver=Solver.HELIX, -) - - -""" -DDiT policy configuration. 
-Reference: https://arxiv.org/html/2506.13497v1 -""" -DDIT_POLICY = Policy( - name="ddit", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=False, - hardware=list(GPUType), - solver=Solver.NAIVE, -) - - -STREAMWISE_ENERGY_POLICY = Policy( - name="streamwise energy", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TIME_ENERGY, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=True, - hardware=list(GPUType), -) - -NAIVE_POLICY = Policy( - name="naive", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF, - disaggregation={}, - use_upscaler=False, - hardware=[GPUType.A100], - solver=Solver.NAIVE, -) - - -BASELINE_POLICIES = { - "naive": NAIVE_POLICY, - "naive disag": Policy( - "naive disag", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF, - disaggregation={ - Model.HF: True, - Model.FT: True, - }, - use_upscaler=False, - hardware=[GPUType.A100], - solver=Solver.NAIVE, - ), - "naive upscaler": Policy( - "naive upscaler", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF, - disaggregation={}, - use_upscaler=True, # Changed to True - hardware=[GPUType.A100], - solver=Solver.NAIVE, - ), - "naive spot": Policy( - "naive spot", - gpu_cost=GPU_SPOT_COST, # Changed to SPOT_COST - objective=Objective.TTFF, - disaggregation={}, - use_upscaler=False, - hardware=[GPUType.A100], - solver=Solver.NAIVE, - ), - "naive ttff*cost allocator": Policy( - "naive ttff*cost allocator", - GPU_RESERVED_COST, - objective=Objective.TTFF_COST, # Changed to TTFF_COST - disaggregation={}, - use_upscaler=False, - hardware=[GPUType.A100], - solver=Solver.GREEDY, - ), - "naive hardware": Policy( - "naive hardware", - GPU_RESERVED_COST, - objective=Objective.TTFF, - disaggregation={}, - use_upscaler=False, - hardware=list(GPUType), # Changed hardware - solver=Solver.NAIVE, - ), -} - - -STREAMWISE_POLICIES = { - "streamwise": STREAMWISE_POLICY, - "streamwise no disag": 
Policy( - name="streamwise no disag", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TTFF_COST, - disaggregation={}, - use_upscaler=True, - hardware=list(GPUType), - solver=Solver.GREEDY, - ), - "streamwise no upscaler": Policy( - name="streamwise no upscaler", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TTFF_COST, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=False, - hardware=list(GPUType), - solver=Solver.GREEDY, - ), - "streamwise no spot": Policy( - name="streamwise no spot", - gpu_cost=GPU_RESERVED_COST, - objective=Objective.TTFF_COST, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=True, - hardware=list(GPUType), - solver=Solver.GREEDY, - ), - "streamwise naive allocator": Policy( - name="streamwise naive allocator", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TTFF, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=True, - hardware=list(GPUType), - solver=Solver.NAIVE, - ), - "streamwise A100": Policy( - name="streamwise single hardware", - gpu_cost=GPU_SPOT_COST, - objective=Objective.TTFF_COST, - disaggregation={ - Model.HF: True, - Model.FT: False, - }, - use_upscaler=True, - hardware=[GPUType.A100], - solver=Solver.NAIVE, - ), -} diff --git a/simulator/provisioning.py b/simulator/provisioning.py index 43612b53..26e9c8a9 100644 --- a/simulator/provisioning.py +++ b/simulator/provisioning.py @@ -3,6 +3,18 @@ """ from __future__ import annotations +import os +import sys + +# Ensure streamwise/ and simulator/ are on sys.path so model_provisioner +# imports work in child processes spawned by ProcessPoolExecutor. 
+_REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) +_STREAMWISE_DIR = os.path.join(_REPO_ROOT, "streamwise") +_SIMULATOR_DIR = os.path.dirname(os.path.abspath(__file__)) +for _p in (_REPO_ROOT, _STREAMWISE_DIR, _SIMULATOR_DIR): + if _p not in sys.path: + sys.path.insert(0, _p) + from tqdm.auto import tqdm import logging @@ -18,24 +30,24 @@ from concurrent.futures import TimeoutError from concurrent.futures import as_completed -from sim_types import WorkflowConfig -from sim_types import GPUType -from sim_types import LatencyData -from sim_types import Provision -from sim_types import ProvisioningResult -from sim_types import Model -from sim_types import ModelAllocation -from sim_types import PowerData -from sim_types import QualityLevel -from sim_types import Policy -from sim_types import Result -from sim_types import num_gpus_to_str +from model_provisioner.sim_types import WorkflowConfig +from model_provisioner.sim_types import GPUType +from model_provisioner.sim_types import LatencyData +from model_provisioner.sim_types import Provision +from model_provisioner.sim_types import ProvisioningResult +from model_provisioner.sim_types import Model +from model_provisioner.sim_types import ModelAllocation +from model_provisioner.sim_types import PowerData +from model_provisioner.sim_types import QualityLevel +from model_provisioner.sim_types import Policy +from model_provisioner.sim_types import Result +from model_provisioner.sim_types import num_gpus_to_str -from auto_model_allocator import AutoModelAllocator +from model_provisioner.auto_model_allocator import AutoModelAllocator -from policies import STREAMWISE_POLICY +from model_provisioner.policies import STREAMWISE_POLICY -from constants import SECONDS_IN_HOUR +from model_provisioner.constants import SECONDS_IN_HOUR GPU_PROVISIONS: list[int] = [ diff --git a/simulator/sim_types.py b/simulator/sim_types.py deleted file mode 100644 index a83cec22..00000000 --- 
a/simulator/sim_types.py +++ /dev/null @@ -1,796 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import numpy as np - -from typing import Optional -from typing import ClassVar - -from abc import ABC -from abc import abstractmethod - -from dataclasses import dataclass -from dataclasses import field - -from enum import Enum - - -class GPUType(Enum): - A100 = "A100" - H100 = "H100" - H200 = "H200" - GB200 = "GB200" - - def __lt__(self, other: object) -> bool: - if not isinstance(other, GPUType): - return NotImplemented - order = [GPUType.A100, GPUType.H100, GPUType.H200, GPUType.GB200] - return order.index(self) < order.index(other) - - -class QualityLevel(Enum): - ORIGINAL = "original" - HIGH = "high" - MEDIUM = "medium" - LOW = "low" - - -# Pixel counts per quality level (16:10 aspect ratio). -# Latency data is profiled at MEDIUM resolution. -RESOLUTION_PIXELS: dict[QualityLevel, int] = { - QualityLevel.HIGH: 1280 * 800, - QualityLevel.MEDIUM: 640 * 400, - QualityLevel.LOW: 320 * 200, -} - - -class Model(Enum): - GEMMA = "gemma" - FLUX = "flux" - HF = "hf" # HunyuanFramePack - HF_VAE = "hf_vae" # HunyuanFramePack VAE - FT = "ft" # FantasyTalking - FT_VAE = "ft_vae" # FantasyTalking VAE - UPSCALER = "upscaler" - OTHERS = "others" # YOLO + Kokoro - - -# Used for FIFO -MODEL_ORDER: dict[Model, int] = { - Model.GEMMA: 0, - Model.FLUX: 1, - Model.OTHERS: 2, - Model.HF: 3, - Model.HF_VAE: 4, - Model.FT: 5, - Model.FT_VAE: 6, - Model.UPSCALER: 7, -} - - -@dataclass -class ModelAllocation(ABC): - model: ClassVar[Model] - - # policy TODO - # workflow TODO - gpu_type: GPUType - devices: int = 1 - replicas: int = 0 # No replicas by default - work: int = 0 - time: float = 0.0 - time_first: float = 0.0 - energy: float = 0.0 - cost: float = 0.0 - - def __str__(self) -> str: - if self.replicas <= 0: - assert self.time == 0.0, f"time must be 0 when no replicas, got {self.time:.2f}" - assert self.energy == 0.0, f"energy must be 0 when no replicas, got 
{self.energy:.2f}" - return "--" - return \ - f"devices={self.devices:2d}, " \ - f"replicas={self.replicas}, " \ - f"work={self.work}, " \ - f"time={self.time:.2f} secs, " \ - f"time_first={self.time_first:.2f} secs, " \ - f"energy={self.energy / 60.0 / 60.0:.2f} Wh, " \ - f"cost=${self.cost:.2f}" - - def __repr__(self) -> str: - return self.__str__() - - def __post_init__(self) -> None: - if self.replicas > 0: - return - if self.time != 0.0 or self.energy != 0.0: - raise ValueError( - f"time and energy must be 0.0 when no replicas, got time={self.time:.2f}, energy={self.energy:.2f}") - - def get_num_gpus(self) -> int: - if self.replicas <= 0: - return 0 - return self.devices * self.replicas - - def disable(self) -> None: - self.devices = 0 - self.replicas = 0 - self.time = 0.0 - self.time_first = 0.0 - self.energy = 0.0 - - @abstractmethod - def calculate_time( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - work_pct: float = 1.0, - ) -> float: - ... - - @abstractmethod - def calculate_time_first( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - ) -> float: - ... - - @abstractmethod - def calculate_energy( - self, - workflow: WorkflowConfig, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - ) -> float: - ... 
- - def calculate_cost( - self, - policy: Policy, - total_time_s: float = 0.0, - ) -> float: - """Calculate the cost for this model allocation.""" - SECONDS_IN_HOUR = 60 * 60 - gpu_cost = policy.gpu_cost[self.gpu_type] - self.cost = total_time_s * (self.get_num_gpus() * gpu_cost) / SECONDS_IN_HOUR - return self.cost - - def calculate( - self, - policy: Policy, - workflow: WorkflowConfig, - latency_data: LatencyData, - power_data: Optional[PowerData] = None, - total_time_s: float = 0.0, - work_pct: float = 1.0, - ) -> None: - """Calculate all the values for this model allocation.""" - self.calculate_time(policy, workflow, latency_data, work_pct) - self.calculate_time_first(policy, workflow, latency_data) - self.calculate_cost(policy, total_time_s) - self.calculate_energy(workflow, power_data, total_time_s) - - def get_max_replicas( - self, - workflow: WorkflowConfig, - ) -> int: - """Get the maximum number of replicas that can leverage parallelism.""" - return 1 - - -class Objective(Enum): - FIFO = "fifo" - TIME = "time" - TTFF = "ttff" - COST = "cost" - ENERGY = "energy" - TIME_COST = "time_cost" - TTFF_COST = "ttff_cost" - ENERGY_COST = "energy_cost" - TIME_ENERGY = "time_energy" - RANDOM = "random" - NONE = "none" - - TTFF_THEN_TIME = "ttff_then_time" # first minimize ttff, then minimize time - - def is_monotonic(self) -> bool: - return self not in {Objective.RANDOM, Objective.FIFO} - - -@dataclass -class WorkflowConfig: - total_video_seconds: int - total_scenes: int - total_frames: dict[Model, int] - total_subscenes: int - per_subscene_frames: dict[Model, int] - # default per-frame number of denoising steps - num_steps: dict[Model, int] - # supported number of generation frames - hf_frames: list[int] - ft_frames: list[int] - frames_per_step_idx: int - # target output resolution (default: HIGH) - target_resolution: QualityLevel = QualityLevel.HIGH - - # total input tokens - total_input_tokens: int = 0 - - # work per model (determines parallelism; work > 1 means 
parallelizable across replicas) - # models included in the workflow are derived from the keys of this dict - model_work: dict[Model, int] = field(default_factory=dict) - - @property - def models(self) -> list[Model]: - """Models included in the workflow (derived from model_work keys).""" - return list(self.model_work.keys()) - - @property - def work(self) -> dict[Model, int]: - """Units of work per model (0 for models not in the workflow).""" - return { - model_name: self.model_work.get(model_name, 0) - for model_name in Model - } - - def get_model_order(self) -> list[Model]: - """Get ordered list of models in the workflow, sorted by MODEL_ORDER.""" - return sorted( - [m for m in self.models if m in MODEL_ORDER], - key=lambda m: MODEL_ORDER[m], - ) - - def get_resolution_scale(self, use_upscaler: bool) -> float: - """Compute latency scaling factor based on target resolution. - - Latency data is profiled at MEDIUM resolution. The scale factor - adjusts for the actual generation resolution: - - 1. Upscaler used, HIGH → 1.0 (models generate at MEDIUM) - 2. Upscaler used, MEDIUM → LOW / MEDIUM (models generate at LOW) - 3. No upscaler, HIGH → HIGH / MEDIUM (scale up) - 4. No upscaler, MEDIUM → 1.0 - 5. 
No upscaler, LOW → LOW / MEDIUM (scale down) - """ - if use_upscaler: - assert self.target_resolution in (QualityLevel.HIGH, QualityLevel.MEDIUM), \ - "Upscaler can only be used when target resolution is HIGH or MEDIUM" - if self.target_resolution == QualityLevel.HIGH: - return 1.0 - # MEDIUM target with upscaler: generate at LOW, upscale to MEDIUM - return RESOLUTION_PIXELS[QualityLevel.LOW] / RESOLUTION_PIXELS[QualityLevel.MEDIUM] - if self.target_resolution == QualityLevel.MEDIUM: - return 1.0 - return RESOLUTION_PIXELS[self.target_resolution] / RESOLUTION_PIXELS[QualityLevel.MEDIUM] - - def is_parallelizable(self, model: Model) -> bool: - """Whether the given model can be parallelized across multiple replicas.""" - return self.model_work.get(model, 0) > 1 - - def filter_parallelizable_models( - self, - models: list[Model], - disaggregation: dict[Model, bool], - ) -> list[Model]: - filtered_models = [ - model - for model in models - if self.is_parallelizable(model) - ] - # Remove VAE models when their parent model disaggregation is disabled - if not disaggregation.get(Model.HF, False): - filtered_models = [m for m in filtered_models if m != Model.HF_VAE] - if not disaggregation.get(Model.FT, False): - filtered_models = [m for m in filtered_models if m != Model.FT_VAE] - return filtered_models - - def __post_init__(self) -> None: - assert self.total_frames[Model.HF] > self.per_subscene_frames[Model.HF] - assert self.total_frames[Model.FT] > self.per_subscene_frames[Model.FT] - - # If no models specified, populate defaults for all models - if not self.model_work: - defaults: dict[Model, int] = { - Model.GEMMA: 1, - Model.FLUX: 1, - Model.HF: self.total_subscenes, - Model.HF_VAE: self.total_frames[Model.HF], - Model.FT: self.total_subscenes, - Model.FT_VAE: self.total_frames[Model.FT], - Model.UPSCALER: self.total_frames[Model.FT], - Model.OTHERS: 1, - } - for model, work in defaults.items(): - self.model_work[model] = work - if self.target_resolution != 
QualityLevel.HIGH: - if Model.UPSCALER in self.model_work: - del self.model_work[Model.UPSCALER] - - @property - def num_frames(self) -> int: - """Number of frames generated by the workflow.""" - if Model.FT in self.total_frames: - return self.total_frames[Model.FT] - return 0 - - -class ActionName(Enum): - MERGE = "merge" - ADD_DEVICE = "add device" - ADD_REPLICA = "add replica" - ADD_DEVICE_REPLICA = "add device replica" - ADD_INSTANCE = "add instance" - REMOVE_DEVICE = "remove device" - REMOVE_REPLICA = "remove replica" - - -@dataclass -class Action: - """ - Optimization action to take. - """ - name: ActionName - model: Model - gpu_type: GPUType - models: dict[GPUType, dict[Model, list[ModelAllocation]]] - - action_result: Result = field(repr=False) - - arrival_time_s: float = 0.0 # For FIFO scheduling - - # Derived fields from action_result (not passed by caller) - time: float = field(init=False) # Total execution time - ttff: float = field(init=False) # Time to first frame - cost: float = field(init=False) # Cost in $ - energy: float = field(init=False) # Energy in W*s - - def __post_init__(self) -> None: - # ---- type checks ---- - if not isinstance(self.model, Model): - raise ValueError(f"Model {self.model} [{type(self.model)}] not supported") - if not isinstance(self.name, ActionName): - raise ValueError(f"Action name {self.name} [{type(self.name)}] not supported") - if not isinstance(self.models, dict): - raise ValueError(f"models must be a dict, got {type(self.models)}") - if not isinstance(self.gpu_type, GPUType): - raise ValueError(f"Device type {self.gpu_type} [{type(self.gpu_type)}] not supported") - """ - if not isinstance(self.allocation_id, int) or self.allocation_id < 0: - raise ValueError(f"Allocation ID {self.allocation_id} must be a non-negative integer") - if self.num_replicas <= 0: - raise ValueError(f"num_replicas {self.num_replicas} must be > 0") - if self.num_devices <= 0: - raise ValueError(f"num_devices {self.num_devices} must be > 0") - 
""" - # ---- derive values ---- - self.time = self.action_result.total_time_s - self.ttff = self.action_result.ttff_s - self.cost = self.action_result.cost - self.energy = self.action_result.total_energy - if self.cost < 0.0: - raise ValueError("cost must be >= 0") - - def __str__(self) -> str: - return ( - f"Action(" - f"{self.name.value}, " - f"model={self.model.value}, " - f"gpu={self.gpu_type.value}, " - f"time={self.time:.2f} s, " - f"ttff={self.ttff:.2f} s, " - f"cost=${self.cost:.2f}, " - f"time*cost={self.time_cost():.2f}, " - f"ttff*cost={self.ttff_cost():.2f}, " - f"energy*cost={self.energy_cost():.2f}, " - f"time*energy={self.time_energy():.2f}, " - f"energy={self.energy:.2f} Ws, " - f"models={self.models}" - f")" - ) - - def time_cost(self) -> float: - """We use improvement in time * $.""" - if self.time <= 0: - return self.cost - if self.cost <= 0: - return self.time - return self.time * self.cost - - def ttff_cost(self) -> float: - """We use improvement in TTFF * $.""" - if self.ttff <= 0: - return self.cost - if self.cost <= 0: - return self.ttff - return self.ttff * self.cost - - def energy_cost(self) -> float: - """We use improvement in Wh * $.""" - if self.cost <= 0: - return self.energy - if self.energy <= 0: - return self.cost - return self.energy * self.cost - - def time_energy(self) -> float: - """We use improvement in TTFF * Wh.""" - if self.energy <= 0: - return self.time - if self.time <= 0: - return self.energy - return self.time * self.energy - - def get_order(self) -> int: - " ""For FIFO scheduling."" " - return MODEL_ORDER[self.model] - - def get_metric( - self, - obj: Objective, - switch_objective: bool = False, - ) -> float: - if obj == Objective.RANDOM: - return 0.0 - if obj == Objective.TIME: - return self.time - if obj == Objective.TTFF: - return self.ttff - if obj == Objective.COST: - return self.cost - if obj == Objective.ENERGY: - return self.energy - if obj == Objective.TIME_COST: - return self.time_cost() - if obj == 
Objective.TTFF_COST: - return self.ttff_cost() - if obj == Objective.ENERGY_COST: - return self.energy_cost() - if obj == Objective.TIME_ENERGY: - return self.time_energy() - if obj == Objective.FIFO: - # return self.get_order() - return 0 # TODO - if obj == Objective.TTFF_THEN_TIME: - if switch_objective: - return self.time - else: - return self.ttff - raise ValueError(f"Unknown objective {obj}") - - -@dataclass -class Result: - total_time_s: float = 0.0 - first_chunk_time: float = 0.0 # Time to first chunk - ttff_s: float = 0.0 # Time to first frame (accounts for total time and workflow length) - tbf_s: float = 0.0 # Time between frames - total_energy: float = 0.0 # Watts x second - cost: float = 0.0 # Total $ cost - gpus_used: dict[GPUType, int] = field(default_factory=dict) - gpus_total: dict[GPUType, int] = field(default_factory=dict) - models: dict[GPUType, dict[Model, list[ModelAllocation]]] = field(default_factory=dict) - - def __post_init__(self) -> None: - assert self.total_time_s >= 0.0, f"total_time_s={self.total_time_s} must be >= 0.0" - assert self.first_chunk_time >= 0.0, f"first_chunk_time={self.first_chunk_time} must be >= 0.0" - assert self.ttff_s >= 0.0, f"ttff_s={self.ttff_s} must be >= 0.0" - assert self.tbf_s >= 0.0, f"tbf_s={self.tbf_s} must be >= 0.0" - assert self.total_energy >= 0.0, f"total_energy={self.total_energy} must be >= 0.0" - assert self.cost >= 0.0, f"cost={self.cost} must be >= 0.0" - assert len(self.gpus_used) >= 0, f"gpus_used cannot be empty: {self.gpus_used}" - for gpu_used in self.gpus_used.values(): - assert gpu_used >= 0, f"all gpus_used value {self.gpus_used} must be >= 0" - - def to_csv(self) -> str: - num_a100 = self.gpus_used.get(GPUType.A100, 0) - num_h100 = self.gpus_used.get(GPUType.H100, 0) - num_h200 = self.gpus_used.get(GPUType.H200, 0) - num_gb200 = self.gpus_used.get(GPUType.GB200, 0) - return ( - f"{num_a100},{num_h100},{num_h200},{num_gb200}," - f"{self.ttff_s:.2f},{self.tbf_s:.2f},{self.cost:.2f}," - 
f"{self.total_time_s:.2f},{self.total_energy:.2f}" - ) - - def __str__(self) -> str: - SECONDS_IN_HOUR = 60 * 60 - return ( - f"Time:{self.total_time_s:.2f} s TTFF:{self.ttff_s:.2f} s " - f"Cost:${self.cost:.2f} TTFF*Cost:{self.ttff_s * self.cost:.2f} " - f"Energy:{self.total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh " - f"GPUS: {num_gpus_to_str(self.gpus_used)}" - ) - - def __repr__(self) -> str: - return self.__str__() - - -@dataclass -class LatencyGPUTypeData: - gpu_type: GPUType - # TP -> latency mappings - flux: dict[int, float] = field(default_factory=dict) - hf: dict[int, float] = field(default_factory=dict) - hf_high: dict[int, float] = field(default_factory=dict) - hf_vae: dict[int, float] = field(default_factory=dict) - hf_vae_high: dict[int, float] = field(default_factory=dict) - ft: dict[int, float] = field(default_factory=dict) - ft_high: dict[int, float] = field(default_factory=dict) - ft_vae: dict[int, float] = field(default_factory=dict) - ft_vae_high: dict[int, float] = field(default_factory=dict) - upscaler: dict[int, float] = field(default_factory=dict) - gemma_first_scene: dict[int, float] = field(default_factory=dict) - gemma_per_scene: dict[int, float] = field(default_factory=dict) - others: dict[int, float] = field(default_factory=dict) - - def __getitem__( - self, - key: Model | tuple[Model, int] - ) -> float: - if isinstance(key, tuple): - assert isinstance(key[0], Model) - assert isinstance(key[1], int) - model, num_devices = key - if model == Model.FLUX: - return self.flux[num_devices] - if model == Model.HF: - return self.hf[num_devices] - if model == Model.HF_VAE: - return self.hf_vae[num_devices] - if model == Model.FT: - return self.ft[num_devices] - if model == Model.FT_VAE: - return self.ft_vae[num_devices] - if model == Model.GEMMA: - return self.gemma_first_scene[num_devices] - if model == Model.UPSCALER: - return self.upscaler[num_devices] - if model == Model.OTHERS: - return self.others[num_devices] - raise KeyError(f"Latency for 
model {key} not found") - - def __contains__(self, key: Model | tuple[Model, int]) -> bool: - if isinstance(key, tuple): - assert isinstance(key[0], Model) - assert isinstance(key[1], int) - model, num_devices = key - if model == Model.GEMMA: - return num_devices in self.gemma_first_scene - if model == Model.FLUX: - return num_devices in self.flux - if model == Model.HF: - return num_devices in self.hf - if model == Model.HF_VAE: - return num_devices in self.hf_vae - if model == Model.FT: - return num_devices in self.ft - if model == Model.FT_VAE: - return num_devices in self.ft_vae - if model == Model.UPSCALER: - return num_devices in self.upscaler - if model == Model.HF_VAE: - return num_devices in self.hf_vae - if model == Model.OTHERS: - return num_devices in self.others - return False - - def get_max_parallelism(self, model: Model) -> int: - """Max number of devices supported for the given model.""" - if model == Model.FLUX: - return max(self.flux.keys()) - if model == Model.HF: - return max(self.hf.keys()) - if model == Model.FT: - return max(self.ft.keys()) - if model == Model.FT_VAE: - return max(self.ft_vae.keys()) - if model == Model.GEMMA: - return max(self.gemma_first_scene.keys()) - if model == Model.UPSCALER: - return max(self.upscaler.keys()) - if model == Model.HF_VAE: - return max(self.hf_vae.keys()) - if model == Model.OTHERS: - return max(self.others.keys()) - raise KeyError(f"Model {model} not found in latency data") - - -@dataclass -class PowerGPUTypeData: - gpu_type: GPUType - # TP -> power mappings - flux: dict[int, float] = field(default_factory=dict) - hf: dict[int, float] = field(default_factory=dict) - hf_high: dict[int, float] = field(default_factory=dict) - hf_vae: dict[int, float] = field(default_factory=dict) - hf_vae_high: dict[int, float] = field(default_factory=dict) - ft: dict[int, float] = field(default_factory=dict) - ft_high: dict[int, float] = field(default_factory=dict) - ft_vae: dict[int, float] = field(default_factory=dict) 
- ft_vae_high: dict[int, float] = field(default_factory=dict) - upscaler: dict[int, float] = field(default_factory=dict) - gemma_first_scene: dict[int, float] = field(default_factory=dict) - gemma_per_scene: dict[int, float] = field(default_factory=dict) - # Other values - idle: float = 0.0 # Idle power in Watts - tdp: float = 0.0 # TDP power in Watts - - def __getitem__( - self, - key: Model | tuple[Model, int] | str - ) -> float: - if isinstance(key, tuple): - assert isinstance(key[0], Model) - assert isinstance(key[1], int) - model, devices = key - if model == Model.FLUX: - return self.flux[devices] - if model == Model.HF: - return self.hf[devices] - if model == Model.HF_VAE: - return self.hf_vae[devices] - if model == Model.FT: - return self.ft[devices] - if model == Model.FT_VAE: - return self.ft_vae[devices] - if model == Model.UPSCALER: - return self.upscaler[devices] - if isinstance(key, str): - if key == "idle": - return self.idle - if key == "tdp": - return self.tdp - raise KeyError(f"Power for {key} not found") - - -@dataclass -class LatencyData: - gpus: dict[GPUType, LatencyGPUTypeData] - - def __getitem__(self, gpu_type: GPUType) -> LatencyGPUTypeData: - return self.gpus[gpu_type] - - def __setitem__( - self, - gpu_type: GPUType, - latency_data: LatencyGPUTypeData - ) -> None: - self.gpus[gpu_type] = latency_data - - -@dataclass -class PowerData: - gpus: dict[GPUType, PowerGPUTypeData] - - def __getitem__(self, gpu_type: GPUType) -> PowerGPUTypeData: - return self.gpus[gpu_type] - - def __setitem__( - self, - gpu_type: GPUType, - power_data: PowerGPUTypeData - ) -> None: - self.gpus[gpu_type] = power_data - - -def num_gpus_to_str( - provision: dict[GPUType, int] -) -> str: - return "+".join([ - f"{num_gpus}x{gpu_type.name}" - for gpu_type, num_gpus in provision.items() - if num_gpus > 0 - ]) - - -@dataclass -class Provision: - num_gpus: dict[GPUType, int] = field(default_factory=dict) - - def __getitem__(self, gpu_type: GPUType) -> int: - return 
self.num_gpus[gpu_type] - - def __str__(self) -> str: - return num_gpus_to_str(self.num_gpus) - - -@dataclass -class ProvisioningResult: - latencies: list[float] - costs: list[float] - ttffs: list[float] - tbfs: list[float] - actual_provision: list[dict[GPUType, int]] - config_provision: list[dict[GPUType, int]] - model_provision: list[dict[GPUType, dict[Model, list[ModelAllocation]]]] - qualities: list[float] = field(default_factory=list) - energies: list[float] = field(default_factory=list) - - def save( - self, - policy_name: str, - results_dir: str, - ) -> None: - """Save the provisioning results to a CSV file.""" - num_a100: list[int] = [] - num_h100: list[int] = [] - num_h200: list[int] = [] - num_gb200: list[int] = [] - for provision in self.actual_provision: - num_a100.append(provision.get(GPUType.A100, 0)) - num_h100.append(provision.get(GPUType.H100, 0)) - num_h200.append(provision.get(GPUType.H200, 0)) - num_gb200.append(provision.get(GPUType.GB200, 0)) - df_latency = pd.DataFrame({ - 'num_a100': num_a100, - 'num_h100': num_h100, - 'num_h200': num_h200, - 'num_gb200': num_gb200, - 'ttff_s': self.ttffs, - 'tbf_s': self.tbfs, - 'cost': self.costs, - 'total_time': self.latencies, - 'energy': self.energies, - }) - df_latency[['ttff_s', 'tbf_s', 'cost', 'total_time', 'energy']] = ( - df_latency[['ttff_s', 'tbf_s', 'cost', 'total_time', 'energy']].round(2) - ) - policy_name_clean = policy_name.replace(" ", "_").replace("*", "x").replace("/", "_").lower() - file_name = results_dir + f"provisioning_{policy_name_clean}.csv" - df_latency.to_csv(file_name, index=False) - - def get_pareto_frontier( - self, - max_x: Optional[float] = None, - max_y: Optional[float] = None, - ) -> np.ndarray: - from utils import get_pareto_frontier # TODO this is a lazy fix, we need to reset - # points = np.array(list(zip(self.ttffs, self.costs))) - return get_pareto_frontier( - self.ttffs, - self.costs, - max_x=max_x, - max_y=max_y, - ) - - -class Solver(Enum): - GUROBI = "gurobi" - 
HIGHS = "highs" - GREEDY = "greedy" - NAIVE = "naive" - HEXGEN = "hexgen" - HELIX = "helix" - - -@dataclass -class Policy: - name: str - gpu_cost: dict[GPUType, float] - objective: Objective - disaggregation: dict[Model, bool] - use_upscaler: bool - hardware: list[GPUType] = field(default_factory=lambda: [GPUType.A100, GPUType.H100, GPUType.H200, GPUType.GB200]) - solver: Solver = Solver.GREEDY - - def is_disaggregated(self, model: Model) -> bool: - """Check if a model has disaggregation enabled.""" - return self.disaggregation.get(model, False) - - def __str__(self) -> str: - disag_str = { - model.value: disaggregated - for model, disaggregated in self.disaggregation.items() - if disaggregated - } - return ( - f"Policy({self.name}, " - f"objective={self.objective}, " - f"disag={disag_str}, " - f"upscaler={self.use_upscaler}, " - f"cost={self.gpu_cost}, " - f"solver={self.solver})" - ) diff --git a/simulator/sim_types_json.py b/simulator/sim_types_json.py deleted file mode 100644 index 9f5451ea..00000000 --- a/simulator/sim_types_json.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import annotations - -import json - -from dataclasses import asdict - -from sim_types import Model -from sim_types import Policy -from sim_types import GPUType -from sim_types import ModelAllocation -from sim_types import WorkflowConfig - - -def models_to_json( - models: dict[GPUType, dict[Model, list[ModelAllocation]]] -) -> str: - result = {} - for gpu_type, model_dict in models.items(): - inner_result = {} - for model, allocation_list in model_dict.items(): - for allocation in allocation_list: - alloc_dict = { - 'devices': allocation.devices, - 'replicas': allocation.replicas, - } - inner_result[model.value] = alloc_dict - result[gpu_type.name] = inner_result - return str(result).replace("}}, '", "}},'") - - -def workflow_to_json(workflow: WorkflowConfig) -> str: - d = asdict(workflow) - # Convert Model enum keys in dict fields to string values - for dict_field in ('total_frames', 
'per_subscene_frames', 'num_steps', 'model_work'): - if dict_field in d: - d[dict_field] = { - (k.value if hasattr(k, 'value') else k): v - for k, v in d[dict_field].items() - } - # Convert QualityLevel enum to string value - if 'target_resolution' in d and hasattr(d['target_resolution'], 'value'): - d['target_resolution'] = d['target_resolution'].value - return json.dumps(d) - - -def policy_to_json(policy: Policy) -> str: - result = { - 'name': policy.name, - 'objective': str(policy.objective), - 'disaggregation': {model.value: enabled for model, enabled in policy.disaggregation.items()}, - 'use_upscaler': policy.use_upscaler, - 'hardware': [gpu.name for gpu in policy.hardware], - } - return json.dumps(result) - - -def model_list_to_json(models: list[Model]) -> str: - return json.dumps(models, default=lambda o: o.value) diff --git a/simulator/utils.py b/simulator/utils.py deleted file mode 100644 index 29ffe7ab..00000000 --- a/simulator/utils.py +++ /dev/null @@ -1,297 +0,0 @@ -""" -Utilities for the simulator. -""" - -from __future__ import annotations - -from copy import deepcopy - -import pandas as pd -import numpy as np - -from scipy.interpolate import interp1d - -from sim_types import ProvisioningResult -from sim_types import GPUType -from sim_types import Model -from sim_types import ModelAllocation - -from typing import Optional - - -def to_models_df( - models: dict[GPUType, dict[Model, list[ModelAllocation]]] -) -> pd.DataFrame: - """ - Convert the models dictionary to a pandas DataFrame for easier analysis and visualization. 
- """ - records = [] - for gpu_type, model_allocations in models.items(): - for model, allocations in model_allocations.items(): - for allocation in allocations: - if allocation is None or allocation.get_num_gpus() == 0: - continue # Ignoring empty allocations - record = { - "GPU": gpu_type.value, - "Model": model.value, - "Devices": allocation.devices, - "Replicas": allocation.replicas, - "Work": allocation.work, - "#GPUs": allocation.get_num_gpus(), - "Time (s)": allocation.time, - "TTFF (s)": allocation.time_first, - "Energy (kWh)": allocation.energy / (60 * 60) / 1000.0, # Convert to kWh - "Cost ($)": allocation.cost, - } - records.append(record) - df = pd.DataFrame(records) - df = df.set_index(["GPU", "Model"]) - df = df.round(2) - - total = df.sum(numeric_only=True) - total["Time (s)"] = df["Time (s)"].groupby(level="Model").max().sum() - total["TTFF (s)"] = df["TTFF (s)"].groupby(level="Model").min().sum() - total.name = ("TOTAL", "") - df = pd.concat([df, total.to_frame().T]) - - df[["Devices", "Replicas", "#GPUs", "Work"]] = df[["Devices", "Replicas", "#GPUs", "Work"]].astype(int) - - return df - - -def coalesce_models( - models: dict[GPUType, dict[Model, list[ModelAllocation]]] -) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """The models with the same parallelism and same work, should be accounted as replicas.""" - merged: dict[GPUType, dict[Model, list[ModelAllocation]]] = {} - for gpu_type, model_dict in models.items(): - merged[gpu_type] = {} - for model_name, allocations in model_dict.items(): - merged_allocations: list[ModelAllocation] = [] - for alloc in allocations: - # Check if there's an existing allocation with the same devices and work - match = next(( - model_alloc - for model_alloc in merged_allocations - if model_alloc.devices == alloc.devices and model_alloc.work == alloc.work - ), None) - if match: - # If found, increment replicas and aggregate energy/cost - match.replicas += 1 - match.energy += alloc.energy - match.cost += 
alloc.cost - else: - # Otherwise, add as new allocation - merged_allocations.append(deepcopy(alloc)) - merged[gpu_type][model_name] = merged_allocations - return merged - - -def simplify_model_allocations( - models: dict[GPUType, dict[Model, list[ModelAllocation]]], -) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: - """ - Simplify model allocations by merging replicas with the same number of devices. - This is to reduce the search space for the optimization loop. - """ - new_models = deepcopy(models) - for gpu_type in new_models.keys(): - for model in new_models[gpu_type].keys(): - model_instances = new_models[gpu_type][model] - alloc_map: dict[int, ModelAllocation] = {} - for model_instance in model_instances: - if model_instance.get_num_gpus() == 0: - continue - if model_instance.devices not in alloc_map: - alloc_map[model_instance.devices] = deepcopy(model_instance) - else: - alloc_map[model_instance.devices].replicas += model_instance.replicas - new_models[gpu_type][model] = list(alloc_map.values()) - return new_models - - -def find_fastest_provisioning( - provisioning: ProvisioningResult, -) -> int: - """Find the fastest provisioning option.""" - min_latency = min(provisioning.latencies) - min_latency_index = provisioning.latencies.index(min_latency) - return min_latency_index - - -def find_fastest_ttff_provisioning( - provisioning: ProvisioningResult, -) -> int: - """Find the fastest provisioning option.""" - min_ttff = min(provisioning.ttffs) - min_ttff_index = provisioning.ttffs.index(min_ttff) - return min_ttff_index - - -def find_cheapest_provisioning( - provisioning: ProvisioningResult, -) -> int: - """Find the cheapest provisioning option.""" - min_cost = min(provisioning.costs) - min_cost_index = provisioning.costs.index(min_cost) - return min_cost_index - - -def find_most_cost_effective_provisioning( - provisioning: ProvisioningResult, -) -> int: - """Find the most cost-effective provisioning option.""" - min_cost = min(provisioning.costs) - 
min_latency = min(provisioning.latencies) - min_cost_index = provisioning.costs.index(min_cost) - min_latency_index = provisioning.latencies.index(min_latency) - if min_cost_index == min_latency_index: - return min_cost_index - - # if the indices are different, return the provisioning option with the minimum cost*latency - cost_latency_list = [ - cost * latency - for cost, latency in zip(provisioning.costs, provisioning.latencies) - ] - min_cost_latency = min(cost_latency_list) - min_cost_latency_index = cost_latency_list.index(min_cost_latency) - return min_cost_latency_index - - -def find_most_energy_efficient_provisioning( - provisioning: ProvisioningResult, -) -> int: - """Find the most energy-efficient provisioning option.""" - min_energy = min(provisioning.energies) - min_latency = min(provisioning.latencies) - min_energy_index = provisioning.energies.index(min_energy) - min_latency_index = provisioning.latencies.index(min_latency) - if min_energy_index == min_latency_index: - return min_energy_index - - # if the indices are different, return the provisioning option with the minimum energy*latency - energy_latency_list = [ - energy * latency - for energy, latency in zip(provisioning.energies, provisioning.latencies) - ] - min_energy_latency = min(energy_latency_list) - min_energy_latency_index = energy_latency_list.index(min_energy_latency) - return min_energy_latency_index - - -def find_pareto_frontier( - latency_list: list[float], - energy_list: list[float], - provision: list[float] -) -> tuple[list[float], list[float], list[float]]: - pareto_provision = [] - pareto_latency = [] - pareto_energy = [] - for i in range(len(latency_list)): - dominated = False - for j in range(len(latency_list)): - if i != j: - if latency_list[j] <= latency_list[i] and energy_list[j] <= energy_list[i]: - if latency_list[j] < latency_list[i] or energy_list[j] < energy_list[i]: - dominated = True - break - if not dominated: - pareto_provision.append(provision[i]) - 
pareto_latency.append(latency_list[i]) - pareto_energy.append(energy_list[i]) - return pareto_provision, pareto_latency, pareto_energy - - -def get_pareto_frontier_paper( - points: np.ndarray, - max_y: Optional[float] = None, - max_x: Optional[float] = None, -) -> np.ndarray: - """ - Calculate the Pareto frontier from a set of data points - """ - if points.size == 0: - return points.copy() - - # points = points[np.argsort(points[:, 0])] - points = points[np.lexsort((points[:, 1], points[:, 0]))] - - pareto_front = [points[0]] - for point in points[1:]: - if point[1] < pareto_front[-1][1]: - pareto_front.append(point) - - # Add extreme points to the Pareto frontier - extreme_point_0 = [pareto_front[0][0], max(points[:, 1])] - extreme_point_1 = [max(points[:, 0]), pareto_front[-1][1]] - pareto_front.append(extreme_point_0) - pareto_front.append(extreme_point_1) - - if max_x is not None: - candidate = np.array([max_x, min(points[:, 1])]) - if candidate[0] > pareto_front[-1][0] and candidate[1] <= pareto_front[-1][1]: - pareto_front.append(candidate) - if max_y is not None: - candidate = np.array([min(points[:, 0]), max_y]) - if candidate[1] > pareto_front[0][1] and candidate[0] <= pareto_front[0][0]: - pareto_front.append(candidate) - - pareto_front_np = np.array(pareto_front) - pareto_front_np = pareto_front_np[np.lexsort(( - -pareto_front_np[:, 1], - pareto_front_np[:, 0]))] - - # Avoid repeated points - _, idx = np.unique(pareto_front_np, axis=0, return_index=True) - pareto_front_np = pareto_front_np[np.sort(idx)] - - return pareto_front_np - - -def get_pareto_frontier( - ttff_list: list[float], - costs: list[float], - max_y: Optional[float] = None, - max_x: Optional[float] = None, -) -> np.ndarray: - points = np.array(list(zip(ttff_list, costs))) - return get_pareto_frontier_paper( - points, - max_x, - max_y, - ) - - -def clean_frontier( - frontier: np.ndarray -) -> np.ndarray: - F = frontier[np.argsort(frontier[:, 0])] - xs = [] - ys = [] - i = 0 - while i < 
len(F): - x = F[i, 0] - same_x = F[F[:, 0] == x] - xs.append(x) - ys.append(same_x[:, 1].min()) - i += len(same_x) - return np.column_stack([xs, ys]) - - -def area_between_frontiers( - A: np.ndarray, - B: np.ndarray, - n: int = 5000 -) -> np.ndarray: - A = clean_frontier(A) - B = clean_frontier(B) - xmin = max(A[:, 0].min(), B[:, 0].min()) - xmax = min(A[:, 0].max(), B[:, 0].max()) - xs = np.linspace(xmin, xmax, n) - fA = interp1d(A[:, 0], A[:, 1], kind="linear") - fB = interp1d(B[:, 0], B[:, 1], kind="linear") - yA = fA(xs) - yB = fB(xs) - # return np.trapezoid(yB - yA, xs) - delta = yB - yA - return 100.0 * delta / yB diff --git a/simulator/workflows.py b/simulator/workflows.py deleted file mode 100644 index ba0caa46..00000000 --- a/simulator/workflows.py +++ /dev/null @@ -1,253 +0,0 @@ -from __future__ import annotations - -import math - -from typing import Optional - -from sim_types import WorkflowConfig -from sim_types import Model -from sim_types import QualityLevel - -from constants import FPS -from constants import FRAMES_OPTIONS -from constants import FRAMES_PER_STEP_IDX -from constants import NUM_STEPS -from constants import SECONDS_IN_HOUR, SECONDS_IN_MINUTE -from constants import TOTAL_INPUT_TOKENS - - -# Shared physical constants -MAX_FT_FRAMES: int = 1 + 80 -SUBSCENE_SECONDS: float = MAX_FT_FRAMES / FPS[Model.FT] # 81 frames @ 23 FPS → ~3.52 s -SUBSCENES_PER_SCENE: int = 4 # default subscene grouping -TOKENS_PER_FRAME = 500 # 1 frame generates around 500 tokens - - -def _get_num_subscenes(total_video_seconds: int) -> int: - """Return the number of subscenes needed to cover the given video duration.""" - return math.ceil(total_video_seconds / SUBSCENE_SECONDS) - - -def _get_num_scenes(total_video_seconds: int) -> int: - """Return the number of scenes needed to cover the given video duration.""" - return math.ceil(_get_num_subscenes(total_video_seconds) / SUBSCENES_PER_SCENE) - - -def _get_num_frames(total_video_seconds: int, model: Model) -> int: - 
"""Return the number of frames needed for the given video duration and model.""" - return math.ceil(total_video_seconds * FPS[model]) - - -def _video_gen_work( - total_video_seconds: int, - num_scenes: int, - num_subscenes: int, - model_work_overrides: Optional[dict[Model, int | str | None]] = None, -) -> dict[Model, int]: - """Standard model work for video-generation workflows (Podcast, Movie, etc.).""" - ret = { - Model.GEMMA: 1, - Model.FLUX: 1, - Model.HF: num_subscenes, - Model.HF_VAE: _get_num_frames(total_video_seconds, Model.HF), - Model.FT: num_subscenes, - Model.FT_VAE: _get_num_frames(total_video_seconds, Model.FT), - Model.UPSCALER: _get_num_frames(total_video_seconds, Model.FT), - Model.OTHERS: 1, - } - if model_work_overrides: - for model, value in model_work_overrides.items(): - if value == "num_scenes": - ret[model] = num_scenes - elif value == "num_subscenes": - ret[model] = num_subscenes - elif isinstance(value, str): - raise ValueError(f"Invalid model_work override value: {value}") - elif value == 0 or value is None: - del ret[model] - else: - ret[model] = value - return ret - - -class WorkOverrideType: - def __init__(self, value: int | str | None = None): - self.value = value - - -def build_workflow_config( - total_video_seconds: int, - input_tokens: int, - model_work: dict[Model, int] | None = None, - *, - model_work_overrides: dict[Model, int | str | None] | None = None, - num_scenes_override: int | None = None, - num_steps_override: dict[Model, int] | None = None, - target_resolution: QualityLevel = QualityLevel.HIGH, -) -> WorkflowConfig: - """Build a ``WorkflowConfig`` from base parameters, computing all derived values. - - Parameters - ---------- - model_work: - Explicit model-work dictionary. When ``None`` (default), standard - video-generation work is auto-generated from the other parameters. - exclude_models: - Models to remove from auto-generated ``model_work``. 
- model_work_overrides: - Key-value overrides applied on top of auto-generated ``model_work``. - If a value is set to "num_scenes", it will be replaced with the number of scenes (i.e. per-scene work). - target_resolution: - The target output resolution for the workflow (default HIGH). - When not HIGH, UPSCALER is automatically removed from model_work. - """ - num_subscenes = _get_num_subscenes(total_video_seconds) - - num_scenes = _get_num_scenes(total_video_seconds) - if num_scenes_override is not None: - num_scenes = num_scenes_override - - num_steps = dict(NUM_STEPS) - if num_steps_override: - num_steps.update(num_steps_override) - - if model_work is None: - model_work = _video_gen_work( - total_video_seconds, - num_scenes, - num_subscenes, - model_work_overrides, - ) - - return WorkflowConfig( - total_video_seconds=total_video_seconds, - total_scenes=num_scenes, - total_subscenes=num_subscenes, - total_frames={ - Model.HF: _get_num_frames(total_video_seconds, Model.HF), - Model.FT: _get_num_frames(total_video_seconds, Model.FT), - }, - per_subscene_frames={ - Model.HF: math.ceil(_get_num_frames(total_video_seconds, Model.HF) / num_subscenes), - Model.FT: math.ceil(_get_num_frames(total_video_seconds, Model.FT) / num_subscenes), - }, - num_steps=num_steps, - hf_frames=FRAMES_OPTIONS[Model.HF], - ft_frames=FRAMES_OPTIONS[Model.FT], - frames_per_step_idx=FRAMES_PER_STEP_IDX, - target_resolution=target_resolution, - total_input_tokens=input_tokens, - model_work=model_work, - ) - - -WORKFLOW_DURATIONS = { # in seconds - "podcast": int(10 * SECONDS_IN_MINUTE), - # TODO The input is two hours but the output should be shorter something like 1 or 2 minutes - "short": int(2 * SECONDS_IN_HOUR), - "movie": int(2 * SECONDS_IN_HOUR), - "story": int(10 * SECONDS_IN_MINUTE), - "lecture": int(5 * SECONDS_IN_MINUTE), - "slide": int(10 * SECONDS_IN_MINUTE), - "dubbing": int(10 * SECONDS_IN_MINUTE), - "editing": int(10 * SECONDS_IN_MINUTE), - "chat": 5, -} - - -# Podcast: 
10-minute video from text/PDF input -PODCAST_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["podcast"], - input_tokens=TOTAL_INPUT_TOKENS, -) - -# Shorts: short clips from a 2-hour input video -_SHORTS_SECONDS = WORKFLOW_DURATIONS["short"] -_SHORTS_SCENES = _SHORTS_SECONDS // 10 # 10-second scene segmentation → 720 -SHORTS_WORKFLOW = build_workflow_config( - total_video_seconds=_SHORTS_SECONDS, - input_tokens=int(_SHORTS_SECONDS * TOKENS_PER_FRAME), # 1 fps × 500 tokens/frame - model_work={ - Model.GEMMA: _SHORTS_SCENES, - Model.OTHERS: 1, # TODO isn't this 1 by default? - }, - num_scenes_override=_SHORTS_SCENES, -) - -# Movie: 2-hour movie -MOVIE_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["movie"], - input_tokens=TOTAL_INPUT_TOKENS, - model_work_overrides={ - Model.FLUX: "num_scenes", - }, -) - -# Animated Story: Podcast + 5% more HF denoising steps (LoRA overhead) -OVERHEAD_PCT = 5 -ANIMATED_STORY_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["story"], - input_tokens=TOTAL_INPUT_TOKENS, - num_steps_override={ - Model.HF: int(NUM_STEPS[Model.HF] * 1 + (OVERHEAD_PCT / 100.0)) - }, -) - -# Lecture: 5-minute video, Flux generates per-scene images -LECTURE_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["lecture"], - input_tokens=TOTAL_INPUT_TOKENS, - model_work_overrides={ - Model.FLUX: "num_scenes", - }, -) - -# Slide Persona: same as Podcast but at low resolution, no upscaler -SLIDE_PERSONA_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["slide"], - input_tokens=TOTAL_INPUT_TOKENS, - target_resolution=QualityLevel.LOW, - model_work_overrides={ - Model.UPSCALER: None, - }, -) - -# Dubbing: like Podcast but without Flux, and double the audio work -DUBBING_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["dubbing"], - input_tokens=TOTAL_INPUT_TOKENS, - model_work_overrides={ - Model.FLUX: None, - 
Model.OTHERS: 2, # Double audio work - }, -) - -# Editing: like Podcast but without GEMMA, FLUX, or OTHERS -EDITING_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["editing"], - input_tokens=TOTAL_INPUT_TOKENS, - model_work_overrides={ - Model.GEMMA: None, - Model.FLUX: None, - Model.OTHERS: None, - } -) - -# Video Chat: like Podcast but only 5 seconds of output video -VIDEO_CHAT_WORKFLOW = build_workflow_config( - total_video_seconds=WORKFLOW_DURATIONS["chat"], - input_tokens=TOTAL_INPUT_TOKENS, -) - - -WORKFLOWS = { - "podcast": PODCAST_WORKFLOW, - "chat": VIDEO_CHAT_WORKFLOW, - "dubbing": DUBBING_WORKFLOW, - "editing": EDITING_WORKFLOW, - "lecture": LECTURE_WORKFLOW, - "movie": MOVIE_WORKFLOW, - "short": SHORTS_WORKFLOW, - "slide": SLIDE_PERSONA_WORKFLOW, - "story": ANIMATED_STORY_WORKFLOW, -} diff --git a/tests/simulator/test_auto_model_allocator.py b/tests/simulator/test_auto_model_allocator.py index a9aa17d6..18ff1871 100644 --- a/tests/simulator/test_auto_model_allocator.py +++ b/tests/simulator/test_auto_model_allocator.py @@ -23,30 +23,30 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import GPUType - from sim_types import Model - from sim_types import QualityLevel - from sim_types import Solver +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import QualityLevel + from model_provisioner.sim_types import Solver - from constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from data_loading import load_latency_data + from model_provisioner.data_loading import load_latency_data - from policies import STREAMWISE_POLICY - from policies import NAIVE_POLICY - from policies import HEXGEN_POLICY - from policies import HELIX_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from 
model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import HEXGEN_POLICY + from model_provisioner.policies import HELIX_POLICY - from auto_model_allocator import AutoModelAllocator + from model_provisioner.auto_model_allocator import AutoModelAllocator - from greedy import GreedyAllocator - from naive_baseline import NaiveAllocator - from hexgen import HexGenAllocator - from helix import HelixAllocator - from milp import MILPAllocator + from model_provisioner.greedy import GreedyAllocator + from model_provisioner.naive_baseline import NaiveAllocator + from model_provisioner.hexgen import HexGenAllocator + from model_provisioner.helix import HelixAllocator + from model_provisioner.milp import MILPAllocator - from workflows import PODCAST_WORKFLOW + from model_provisioner.workflows import PODCAST_WORKFLOW # --------------------------------------------------------------------------- diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py index 129a2f3b..de883d35 100644 --- a/tests/simulator/test_data_loading.py +++ b/tests/simulator/test_data_loading.py @@ -11,12 +11,12 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import QualityLevel +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import QualityLevel - from data_loading import load_latency_data - from data_loading import load_power_data - from data_loading import load_adaptive_quality_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data + from model_provisioner.data_loading import load_adaptive_quality_data def test_latency() -> None: diff --git a/tests/simulator/test_evaluator.py b/tests/simulator/test_evaluator.py index a162e99b..6f3a5aa7 100644 --- a/tests/simulator/test_evaluator.py +++ b/tests/simulator/test_evaluator.py @@ -8,29 +8,29 @@ from tests.test_utils import assert_equals_approx from 
tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from constants import DEFAULT_WORKFLOW_CONFIG - from constants import SECONDS_IN_HOUR +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.constants import SECONDS_IN_HOUR - from sim_types import GPUType - from sim_types import Model + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model - from data_loading import load_latency_data - from data_loading import load_power_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data - from evaluator import evaluate_model_allocation + from model_provisioner.evaluator import evaluate_model_allocation - from policies import STREAMWISE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY - from models import FluxModelAllocation - from models import GemmaModelAllocation - from models import HFModelAllocation - from models import HFVAEModelAllocation - from models import FTModelAllocation - from models import UpscalerModelAllocation - from models import OthersModelAllocation + from model_provisioner.models import FluxModelAllocation + from model_provisioner.models import GemmaModelAllocation + from model_provisioner.models import HFModelAllocation + from model_provisioner.models import HFVAEModelAllocation + from model_provisioner.models import FTModelAllocation + from model_provisioner.models import UpscalerModelAllocation + from model_provisioner.models import OthersModelAllocation - from utils import to_models_df + from model_provisioner.utils import to_models_df def test_empty() -> None: diff --git a/tests/simulator/test_greedy.py b/tests/simulator/test_greedy.py index c33d6991..786cc2c2 100644 --- a/tests/simulator/test_greedy.py +++ b/tests/simulator/test_greedy.py @@ -8,22 +8,22 @@ from tests.test_utils import temp_sys_path -with 
temp_sys_path("simulator"): - from constants import DEFAULT_WORKFLOW_CONFIG - from constants import SECONDS_IN_HOUR +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.constants import SECONDS_IN_HOUR - from workflows import WORKFLOWS + from model_provisioner.workflows import WORKFLOWS - from sim_types import GPUType - from sim_types import QualityLevel - from sim_types import WorkflowConfig + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import QualityLevel + from model_provisioner.sim_types import WorkflowConfig - from data_loading import load_latency_data - from data_loading import load_power_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data - from greedy import GreedyAllocator + from model_provisioner.greedy import GreedyAllocator - from policies import STREAMWISE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY def test_allocate_8A_8H() -> None: diff --git a/tests/simulator/test_helix.py b/tests/simulator/test_helix.py index a336595d..06ec8f3a 100644 --- a/tests/simulator/test_helix.py +++ b/tests/simulator/test_helix.py @@ -12,16 +12,16 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from constants import DEFAULT_WORKFLOW_CONFIG - from sim_types import GPUType - from sim_types import Model - from sim_types import MODEL_ORDER - from sim_types import Solver - from data_loading import load_latency_data - from data_loading import load_power_data - from helix import HelixAllocator - from policies import HELIX_POLICY +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import MODEL_ORDER + from model_provisioner.sim_types 
import Solver + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data + from model_provisioner.helix import HelixAllocator + from model_provisioner.policies import HELIX_POLICY def test_get_model_order() -> None: diff --git a/tests/simulator/test_hexgen.py b/tests/simulator/test_hexgen.py index 99e7eef5..3317a82e 100644 --- a/tests/simulator/test_hexgen.py +++ b/tests/simulator/test_hexgen.py @@ -7,13 +7,13 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from constants import DEFAULT_WORKFLOW_CONFIG - from sim_types import GPUType - from data_loading import load_latency_data - from hexgen import HexGenAllocator - from hexgen import _get_model_order - from sim_types import MODEL_ORDER +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.sim_types import GPUType + from model_provisioner.data_loading import load_latency_data + from model_provisioner.hexgen import HexGenAllocator + from model_provisioner.hexgen import _get_model_order + from model_provisioner.sim_types import MODEL_ORDER def test_get_model_order() -> None: @@ -154,7 +154,7 @@ def test_no_gpus_error() -> None: def test_is_subclass_of_greedy() -> None: """HexGenAllocator should extend GreedyAllocator.""" - from greedy import GreedyAllocator + from model_provisioner.greedy import GreedyAllocator latency_data = load_latency_data("simulator/data/") allocator = HexGenAllocator( workflow=DEFAULT_WORKFLOW_CONFIG, diff --git a/tests/simulator/test_milp.py b/tests/simulator/test_milp.py index 70c4bfa8..9b0e909e 100644 --- a/tests/simulator/test_milp.py +++ b/tests/simulator/test_milp.py @@ -13,29 +13,29 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import LatencyData - from sim_types import PowerData - from sim_types import GPUType - from sim_types import Objective - from 
sim_types import Solver - from sim_types import QualityLevel +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import LatencyData + from model_provisioner.sim_types import PowerData + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Objective + from model_provisioner.sim_types import Solver + from model_provisioner.sim_types import QualityLevel - from data_loading import load_latency_data - from data_loading import load_power_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data - from constants import DEFAULT_WORKFLOW_CONFIG - from constants import SECONDS_IN_HOUR + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.constants import SECONDS_IN_HOUR - from policies import STREAMWISE_MILP_POLICY + from model_provisioner.policies import STREAMWISE_MILP_POLICY - from workflows import WORKFLOWS + from model_provisioner.workflows import WORKFLOWS - from milp import MILPAllocator + from model_provisioner.milp import MILPAllocator - from evaluator import evaluate_model_allocation + from model_provisioner.evaluator import evaluate_model_allocation - from utils import to_models_df + from model_provisioner.utils import to_models_df def test_base() -> None: diff --git a/tests/simulator/test_models.py b/tests/simulator/test_models.py index 57e00a0a..c0171d99 100644 --- a/tests/simulator/test_models.py +++ b/tests/simulator/test_models.py @@ -16,34 +16,34 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import GPUType - from sim_types import Model - from sim_types import ModelAllocation - from sim_types import QualityLevel - from sim_types import LatencyData - from sim_types import PowerData - - from constants import DEFAULT_WORKFLOW_CONFIG - - from data_loading import load_latency_data - from data_loading import load_power_data - - from policies 
import STREAMWISE_POLICY - from policies import NAIVE_POLICY - - from models import get_model_allocation - from models import _calculate_total_time - from models import assert_pixel_config - from models import _MODEL_ALLOCATION_REGISTRY - from models import GemmaModelAllocation - from models import FluxModelAllocation - from models import HFModelAllocation - from models import HFVAEModelAllocation - from models import FTModelAllocation - from models import FTVAEModelAllocation - from models import UpscalerModelAllocation - from models import OthersModelAllocation +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import ModelAllocation + from model_provisioner.sim_types import QualityLevel + from model_provisioner.sim_types import LatencyData + from model_provisioner.sim_types import PowerData + + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data + + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY + + from model_provisioner.models import get_model_allocation + from model_provisioner.models import _calculate_total_time + from model_provisioner.models import assert_pixel_config + from model_provisioner.models import _MODEL_ALLOCATION_REGISTRY + from model_provisioner.models import GemmaModelAllocation + from model_provisioner.models import FluxModelAllocation + from model_provisioner.models import HFModelAllocation + from model_provisioner.models import HFVAEModelAllocation + from model_provisioner.models import FTModelAllocation + from model_provisioner.models import FTVAEModelAllocation + from model_provisioner.models import UpscalerModelAllocation + from model_provisioner.models import OthersModelAllocation # 
--------------------------------------------------------------------------- @@ -152,7 +152,7 @@ def test_assert_pixel_config() -> None: assert_pixel_config(DEFAULT_WORKFLOW_CONFIG) # Patching MEDIUM > HIGH violates the ordering constraint → AssertionError. - with patch.dict("sim_types.RESOLUTION_PIXELS", + with patch.dict("model_provisioner.sim_types.RESOLUTION_PIXELS", {QualityLevel.MEDIUM: 1000, QualityLevel.HIGH: 500}): with pytest.raises(AssertionError): assert_pixel_config(DEFAULT_WORKFLOW_CONFIG) diff --git a/tests/simulator/test_multirequests_derive.py b/tests/simulator/test_multirequests_derive.py index 8e7ed798..d5286121 100644 --- a/tests/simulator/test_multirequests_derive.py +++ b/tests/simulator/test_multirequests_derive.py @@ -7,10 +7,10 @@ from tests.test_utils import assert_equal_dict from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import GPUType - from sim_types import Model - from sim_types import QualityLevel +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import QualityLevel from multirequests import TIME_PER_REQ from multirequests import INIT_REPLICAS diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py index fc791151..d621cd33 100644 --- a/tests/simulator/test_simulator.py +++ b/tests/simulator/test_simulator.py @@ -13,23 +13,23 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import WorkflowConfig - from sim_types import Model - from sim_types import Objective - from sim_types import GPUType +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import WorkflowConfig + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import Objective + from model_provisioner.sim_types import GPUType - from constants import SECONDS_IN_HOUR - from 
constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.constants import SECONDS_IN_HOUR + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from data_loading import load_latency_data - from data_loading import load_power_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data - from auto_model_allocator import AutoModelAllocator - from greedy import GreedyAllocator + from model_provisioner.auto_model_allocator import AutoModelAllocator + from model_provisioner.greedy import GreedyAllocator - from policies import STREAMWISE_POLICY - from policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY def test_estimate_total_time() -> None: diff --git a/tests/simulator/test_simulator_actions.py b/tests/simulator/test_simulator_actions.py index dd3bf4fd..11efd7b2 100644 --- a/tests/simulator/test_simulator_actions.py +++ b/tests/simulator/test_simulator_actions.py @@ -7,12 +7,12 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import Action - from sim_types import ActionName - from sim_types import GPUType - from sim_types import Model - from sim_types import Result +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import Action + from model_provisioner.sim_types import ActionName + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import Result def test_action() -> None: diff --git a/tests/simulator/test_simulator_baseline.py b/tests/simulator/test_simulator_baseline.py index 64282777..24749ffb 100644 --- a/tests/simulator/test_simulator_baseline.py +++ b/tests/simulator/test_simulator_baseline.py @@ -11,28 +11,28 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import GPUType - from 
sim_types import Model +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model - from constants import DEFAULT_WORKFLOW_CONFIG - from constants import SECONDS_IN_HOUR - from constants import POWER_GPU_IDLE - from constants import POWER_GPU_TDP + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from model_provisioner.constants import SECONDS_IN_HOUR + from model_provisioner.constants import POWER_GPU_IDLE + from model_provisioner.constants import POWER_GPU_TDP - from data_loading import load_latency_data - from data_loading import load_power_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data - from auto_model_allocator import AutoModelAllocator - from naive_baseline import NaiveAllocator - from greedy import GreedyAllocator + from model_provisioner.auto_model_allocator import AutoModelAllocator + from model_provisioner.naive_baseline import NaiveAllocator + from model_provisioner.greedy import GreedyAllocator - from policies import NAIVE_POLICY - from policies import BASELINE_POLICIES - from policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import BASELINE_POLICIES + from model_provisioner.policies import STREAMWISE_POLICY - from workflows import SHORTS_WORKFLOW - from workflows import WORKFLOWS + from model_provisioner.workflows import SHORTS_WORKFLOW + from model_provisioner.workflows import WORKFLOWS def test_baseline() -> None: diff --git a/tests/simulator/test_simulator_energy.py b/tests/simulator/test_simulator_energy.py index 16b6e8bf..a739f698 100644 --- a/tests/simulator/test_simulator_energy.py +++ b/tests/simulator/test_simulator_energy.py @@ -9,23 +9,23 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from constants import DEFAULT_WORKFLOW_CONFIG +with 
temp_sys_path("simulator", "streamwise"): + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from sim_types import GPUType - from sim_types import Model - from sim_types import Objective - from sim_types import Solver + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import Objective + from model_provisioner.sim_types import Solver - from data_loading import load_latency_data - from data_loading import load_power_data + from model_provisioner.data_loading import load_latency_data + from model_provisioner.data_loading import load_power_data - from auto_model_allocator import AutoModelAllocator - from greedy import GreedyAllocator - from naive_baseline import NaiveAllocator + from model_provisioner.auto_model_allocator import AutoModelAllocator + from model_provisioner.greedy import GreedyAllocator + from model_provisioner.naive_baseline import NaiveAllocator - from policies import NAIVE_POLICY - from policies import STREAMWISE_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY def test_energy() -> None: diff --git a/tests/simulator/test_simulator_multirequests.py b/tests/simulator/test_simulator_multirequests.py index 972596ec..3d3e350a 100644 --- a/tests/simulator/test_simulator_multirequests.py +++ b/tests/simulator/test_simulator_multirequests.py @@ -7,7 +7,7 @@ from tests.test_utils import assert_equals_approx from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from multirequests import QPM_LIST from multirequests import get_replicas from multirequests import get_costs @@ -21,12 +21,12 @@ from multirequests import TIME_PER_REQ_ADAPTIVE from multirequests import get_time_per_request_baseline - from data_loading import load_latency_data - from workflows import PODCAST_WORKFLOW + from model_provisioner.data_loading import 
load_latency_data + from model_provisioner.workflows import PODCAST_WORKFLOW - from constants import GPU_SPOT_COST - from sim_types import GPUType - from sim_types import Model + from model_provisioner.constants import GPU_SPOT_COST + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import Model def test_multirequests() -> None: diff --git a/tests/simulator/test_simulator_plotutils.py b/tests/simulator/test_simulator_plotutils.py index cee69368..2d3b35e2 100644 --- a/tests/simulator/test_simulator_plotutils.py +++ b/tests/simulator/test_simulator_plotutils.py @@ -6,7 +6,7 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): +with temp_sys_path("simulator", "streamwise"): from plot_utils import plot_ttff_vs_cost from plot_utils import plot_ttff_vs_energy from plot_utils import plot_adaptive_quality @@ -14,10 +14,10 @@ from plot_utils import plot_cost_vs_qpm from plot_utils import _get_time_ticklabels - from sim_types import ProvisioningResult - from sim_types import GPUType - from sim_types import QualityLevel - from sim_types import Model + from model_provisioner.sim_types import ProvisioningResult + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import QualityLevel + from model_provisioner.sim_types import Model def test_plot_ttff_vs_cost() -> None: diff --git a/tests/simulator/test_simulator_policies.py b/tests/simulator/test_simulator_policies.py index ffab5ba0..42bf69db 100644 --- a/tests/simulator/test_simulator_policies.py +++ b/tests/simulator/test_simulator_policies.py @@ -11,11 +11,11 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from policies import STREAMWISE_POLICY - from policies import BASELINE_POLICIES +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import BASELINE_POLICIES - from sim_types import Objective + from 
model_provisioner.sim_types import Objective def test_streamwise_policies() -> None: diff --git a/tests/simulator/test_simulator_provisioning.py b/tests/simulator/test_simulator_provisioning.py index 6bd142ae..d781bc2e 100644 --- a/tests/simulator/test_simulator_provisioning.py +++ b/tests/simulator/test_simulator_provisioning.py @@ -7,8 +7,8 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from constants import DEFAULT_WORKFLOW_CONFIG +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG from provisioning import get_provisioning_results from provisioning import get_provisioning_adaptive_results @@ -17,15 +17,15 @@ from provisioning import GPU_PROVISIONS from provisioning import GPU_PROVISIONS_SHORT - from sim_types import GPUType - from sim_types import QualityLevel - from sim_types import Solver + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import QualityLevel + from model_provisioner.sim_types import Solver - from data_loading import load_latency_data + from model_provisioner.data_loading import load_latency_data - from policies import NAIVE_POLICY - from policies import STREAMWISE_POLICY - from policies import HEXGEN_POLICY + from model_provisioner.policies import NAIVE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY + from model_provisioner.policies import HEXGEN_POLICY @pytest.mark.parametrize("gpu_type", [gpu_type for gpu_type in GPUType]) diff --git a/tests/simulator/test_simulator_types.py b/tests/simulator/test_simulator_types.py index 8bfc292f..9e2384ed 100644 --- a/tests/simulator/test_simulator_types.py +++ b/tests/simulator/test_simulator_types.py @@ -8,21 +8,21 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import Model - from sim_types import GPUType +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import Model + from 
model_provisioner.sim_types import GPUType - from sim_types_json import models_to_json - from sim_types_json import workflow_to_json - from sim_types_json import policy_to_json - from sim_types_json import model_list_to_json + from model_provisioner.sim_types_json import models_to_json + from model_provisioner.sim_types_json import workflow_to_json + from model_provisioner.sim_types_json import policy_to_json + from model_provisioner.sim_types_json import model_list_to_json - from models import GemmaModelAllocation - from models import FluxModelAllocation + from model_provisioner.models import GemmaModelAllocation + from model_provisioner.models import FluxModelAllocation - from policies import STREAMWISE_POLICY + from model_provisioner.policies import STREAMWISE_POLICY - from workflows import PODCAST_WORKFLOW + from model_provisioner.workflows import PODCAST_WORKFLOW def test_serialize_models() -> None: diff --git a/tests/simulator/test_simulator_utils.py b/tests/simulator/test_simulator_utils.py index 9711a696..e1575d9a 100644 --- a/tests/simulator/test_simulator_utils.py +++ b/tests/simulator/test_simulator_utils.py @@ -6,19 +6,19 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import Model - from sim_types import GPUType - from sim_types import ModelAllocation - from sim_types import ProvisioningResult - - from utils import get_pareto_frontier - from utils import find_most_cost_effective_provisioning - from utils import find_most_energy_efficient_provisioning - from utils import find_pareto_frontier - from utils import coalesce_models - - from models import FTModelAllocation +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import Model + from model_provisioner.sim_types import GPUType + from model_provisioner.sim_types import ModelAllocation + from model_provisioner.sim_types import ProvisioningResult + + from model_provisioner.utils import get_pareto_frontier + from 
model_provisioner.utils import find_most_cost_effective_provisioning + from model_provisioner.utils import find_most_energy_efficient_provisioning + from model_provisioner.utils import find_pareto_frontier + from model_provisioner.utils import coalesce_models + + from model_provisioner.models import FTModelAllocation def test_get_pareto_frontier() -> None: diff --git a/tests/simulator/test_workflows.py b/tests/simulator/test_workflows.py index bff7ed56..19a7ff0c 100644 --- a/tests/simulator/test_workflows.py +++ b/tests/simulator/test_workflows.py @@ -15,9 +15,9 @@ from tests.test_utils import temp_sys_path -with temp_sys_path("simulator"): - from sim_types import WorkflowConfig, Model, QualityLevel, GPUType - from constants import ( +with temp_sys_path("simulator", "streamwise"): + from model_provisioner.sim_types import WorkflowConfig, Model, QualityLevel, GPUType + from model_provisioner.constants import ( FPS, FRAMES_OPTIONS, FRAMES_PER_STEP_IDX, @@ -26,10 +26,10 @@ SECONDS_IN_MINUTE, TOTAL_INPUT_TOKENS, ) - from data_loading import load_latency_data - from auto_model_allocator import AutoModelAllocator - from policies import STREAMWISE_POLICY, NAIVE_POLICY - from workflows import ( + from model_provisioner.data_loading import load_latency_data + from model_provisioner.auto_model_allocator import AutoModelAllocator + from model_provisioner.policies import STREAMWISE_POLICY, NAIVE_POLICY + from model_provisioner.workflows import ( MAX_FT_FRAMES, SUBSCENE_SECONDS, SUBSCENES_PER_SCENE, From fc7f5e7d6013aff10f5fdf3eba17e7a67bcb8e98 Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Fri, 15 May 2026 15:21:38 -0700 Subject: [PATCH 2/9] Move 11 non-policy files from model_provisioner back to simulator Move actions, auto_model_allocator, constants, data_loading, evaluator, model_allocator, models, sim_types, sim_types_json, utils, and workflows from streamwise/model_provisioner/ back to simulator/. 
Only 6 policy files remain in model_provisioner: greedy, helix, hexgen, milp, naive_baseline, and policies. Import changes: - Moved files use bare imports (from sim_types import ...) instead of relative imports (from .sim_types import ...) - Policy files use bare imports for moved modules and keep relative imports for sibling policy modules - simulator/ and streamwise/allocator_bridge.py updated accordingly - All test files updated to match new import paths - Added tests/simulator/conftest.py to set PYTHONPATH for child processes spawned by ProcessPoolExecutor Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- simulator/__init__.py | 14 +- simulator/actions.py | 737 ++++++++++++ simulator/auto_model_allocator.py | 109 ++ simulator/constants.py | 142 +++ simulator/data_loading.py | 300 +++++ simulator/evaluator.py | 414 +++++++ simulator/model_allocator.py | 282 +++++ simulator/models.py | 811 +++++++++++++ simulator/multirequests.py | 24 +- simulator/plot_utils.py | 10 +- simulator/provisioning.py | 39 +- simulator/sim_types.py | 796 ++++++++++++ simulator/sim_types_json.py | 58 + simulator/utils.py | 297 +++++ simulator/workflows.py | 253 ++++ streamwise/allocator_bridge.py | 256 ++++ streamwise/model_provisioner/__init__.py | 15 + streamwise/model_provisioner/greedy.py | 573 +++++++++ streamwise/model_provisioner/helix.py | 403 +++++++ streamwise/model_provisioner/hexgen.py | 629 ++++++++++ streamwise/model_provisioner/milp.py | 1070 +++++++++++++++++ .../model_provisioner/naive_baseline.py | 484 ++++++++ streamwise/model_provisioner/policies.py | 252 ++++ tests/simulator/conftest.py | 24 + tests/simulator/test_auto_model_allocator.py | 16 +- tests/simulator/test_data_loading.py | 8 +- tests/simulator/test_evaluator.py | 30 +- tests/simulator/test_greedy.py | 16 +- tests/simulator/test_helix.py | 14 +- tests/simulator/test_hexgen.py | 8 +- tests/simulator/test_milp.py | 26 +- tests/simulator/test_models.py | 44 +- 
tests/simulator/test_multirequests_derive.py | 6 +- tests/simulator/test_simulator.py | 18 +- tests/simulator/test_simulator_actions.py | 10 +- tests/simulator/test_simulator_baseline.py | 22 +- tests/simulator/test_simulator_energy.py | 16 +- .../simulator/test_simulator_multirequests.py | 10 +- tests/simulator/test_simulator_plotutils.py | 8 +- tests/simulator/test_simulator_policies.py | 2 +- .../simulator/test_simulator_provisioning.py | 10 +- tests/simulator/test_simulator_types.py | 18 +- tests/simulator/test_simulator_utils.py | 24 +- tests/simulator/test_workflows.py | 10 +- tests/streamwise/test_allocator_bridge.py | 280 +++++ 45 files changed, 8391 insertions(+), 197 deletions(-) create mode 100644 simulator/actions.py create mode 100644 simulator/auto_model_allocator.py create mode 100644 simulator/constants.py create mode 100644 simulator/data_loading.py create mode 100644 simulator/evaluator.py create mode 100644 simulator/model_allocator.py create mode 100644 simulator/models.py create mode 100644 simulator/sim_types.py create mode 100644 simulator/sim_types_json.py create mode 100644 simulator/utils.py create mode 100644 simulator/workflows.py create mode 100644 streamwise/allocator_bridge.py create mode 100644 streamwise/model_provisioner/__init__.py create mode 100644 streamwise/model_provisioner/greedy.py create mode 100644 streamwise/model_provisioner/helix.py create mode 100644 streamwise/model_provisioner/hexgen.py create mode 100644 streamwise/model_provisioner/milp.py create mode 100644 streamwise/model_provisioner/naive_baseline.py create mode 100644 streamwise/model_provisioner/policies.py create mode 100644 tests/simulator/conftest.py create mode 100644 tests/streamwise/test_allocator_bridge.py diff --git a/simulator/__init__.py b/simulator/__init__.py index 24058e01..263309ff 100644 --- a/simulator/__init__.py +++ b/simulator/__init__.py @@ -1,15 +1,15 @@ """ -Simulator package. 
+Simulator package — provisioning sweeps, multi-request analysis, and plotting +on top of the model_provisioner allocation policies. -The core allocation logic lives in ``streamwise.model_provisioner``. -This package adds provisioning sweeps, multi-request analysis, and plotting -on top of that shared foundation. +The allocation policy implementations live in ``streamwise/model_provisioner/``. """ import os import sys -# Make model_provisioner importable for simulator modules and child processes. -_STREAMWISE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise") -_STREAMWISE_DIR = os.path.normpath(_STREAMWISE_DIR) +# Make model_provisioner importable for simulator modules. +_STREAMWISE_DIR = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "streamwise") +) if _STREAMWISE_DIR not in sys.path: sys.path.insert(0, _STREAMWISE_DIR) diff --git a/simulator/actions.py b/simulator/actions.py new file mode 100644 index 00000000..69af1618 --- /dev/null +++ b/simulator/actions.py @@ -0,0 +1,737 @@ +""" +Actions for scaling models for the greedy allocator. 
+""" + +from __future__ import annotations + +import random + +from collections import Counter + +from copy import deepcopy + +from typing import Optional + +from constants import DEVICE_OPTIONS +from constants import SINGLE_INSTANCE_MODELS +from constants import SINGLE_DEVICE_MODELS + +from sim_types import Action +from sim_types import ActionName +from sim_types import Model +from sim_types import ModelAllocation +from sim_types import GPUType +from sim_types import WorkflowConfig +from sim_types import LatencyData +from sim_types import PowerData +from sim_types import Objective +from sim_types import Policy + +from model_provisioner.policies import STREAMWISE_POLICY + +from models import get_model_allocation + +from evaluator import evaluate_model_allocation +from evaluator import calc_used_gpus + + +def _is_single_instance( + model_name: Model, + workflow: Optional[WorkflowConfig] = None, +) -> bool: + """Check if a model is single-instance, considering workflow parallelism settings.""" + if model_name not in SINGLE_INSTANCE_MODELS: + return False + if workflow is not None and workflow.is_parallelizable(model_name): + return False + return True + + +def find_next_devices( + device_options: list[int], + num_devices: int, + num_replicas: int, + remaining_devices: int, + max_num_devices: Optional[int] = None, +) -> Optional[int]: + """ + Find the next device combination. + For example, with device options [2, 4, 8, 16, 40], current devices 8, 1 replica, we get 16. 
+ """ + if num_replicas == 0: + # means we haven't allocated any replicas yet so start from smallest device option + return device_options[0] if device_options[0] <= remaining_devices else None + + for device_option in device_options: + # if device_option > num_devices and device_option <= remaining_devices + num_devices: + if ( + device_option > num_devices + and (device_option - num_devices) * num_replicas <= remaining_devices + and (max_num_devices is None or device_option <= max_num_devices) + ): + return device_option + return None + + +def choose_action( + actions: list[Action], + objective: Objective, + switch_objective: bool = False, +) -> Optional[Action]: + """Schedule requests.""" + if not actions: + return None + + if objective == Objective.TIME_COST: + # return min(actions, key=lambda a: a.time) + return min( + actions, + key=lambda a: ( + a.time_cost(), + a.time, + ), + ) + if objective == Objective.TIME_COST: + return min( + actions, + key=lambda a: ( + a.time_cost(), + a.time, + ), + ) + if objective == Objective.TTFF_COST: + return min( + actions, + key=lambda a: ( + a.ttff_cost(), + a.ttff, + ), + ) + if objective == Objective.FIFO: + # return min(actions, key=lambda a: a.arrival_time_s) + return min(actions, key=lambda a: a.get_order()) + if objective == Objective.TIME: + return min(actions, key=lambda a: a.time) + if objective == Objective.TTFF: + return min(actions, key=lambda a: a.ttff) + if objective == Objective.COST: + return min(actions, key=lambda a: a.cost) + if objective == Objective.ENERGY: + return min(actions, key=lambda a: a.energy) + if objective == Objective.TIME_ENERGY: + return min(actions, key=lambda a: a.time_energy()) + if objective == Objective.ENERGY_COST: + return min(actions, key=lambda a: a.energy_cost()) + if objective == Objective.RANDOM: + # randomly pick an improvement to simulate naive allocation + return random.choice(actions) + if objective == Objective.TTFF_THEN_TIME: + if switch_objective: + return min(actions, 
key=lambda a: a.time) + else: + return min(actions, key=lambda a: a.ttff) + if objective == Objective.NONE: + return None + raise ValueError(f"Cannot recognize objective {objective}") + + +def apply_action( + action: Action, + models: dict[GPUType, dict[Model, list[ModelAllocation]]], +) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: + """Apply the chosen action to the models and update remaining devices.""" + + for gpu_type in action.models.keys(): + if gpu_type not in models: + raise ValueError(f"Cannot find gpu type {gpu_type} in {models.keys()}") + for model in action.models[gpu_type].keys(): + if model not in models[gpu_type]: + raise ValueError(f"Cannot find model {model} in {models[gpu_type].keys()}") + allocs_to_remove = [] + for alloc_id in range(len(action.models[gpu_type][model])): + # check if devices and replicas are non-negative + num_devices = action.models[gpu_type][model][alloc_id].devices + if num_devices < 0: + raise ValueError(f"Action devices {num_devices} must be >= 0") + if action.models[gpu_type][model][alloc_id].replicas <= 0: + # remove that instance if replicas is 0 or negative + allocs_to_remove.append(alloc_id) + for alloc_id in reversed(allocs_to_remove): + del action.models[gpu_type][model][alloc_id] + + return action.models + + +def gen_actions( + workflow: WorkflowConfig, + num_gpus: dict[GPUType, int], + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {}, + policy: Policy = STREAMWISE_POLICY, + allow_removal: bool = False, + allow_merging: bool = False, + look_ahead_replicas: int = 3, +) -> list[Action]: + actions: list[Action] = [] + + # Extract GPU types from models + gpu_types = list(models.keys()) + assert len(gpu_types) == len(num_gpus), \ + f"Number of GPU types in models {len(gpu_types)} must match num_gpus {len(num_gpus)}" + + remaining_gpus = {} + for gpu_type in num_gpus.keys(): + remaining_gpus[gpu_type] = num_gpus[gpu_type] - 
calc_used_gpus({gpu_type: models[gpu_type]}) + + # Option 1: Provision more by increasing for each model allocation + for model in Model: + if model not in workflow.models: + continue + for gpu_type in gpu_types: + for alloc_id in range(len(models[gpu_type][model])): + actions.extend(_gen_add_device_replica_actions( + models=models, + num_gpus=num_gpus, + remaining_gpus=remaining_gpus[gpu_type], + gpu_type=gpu_type, + model_name=model, + allocation_id=alloc_id, + workflow=workflow, + policy=policy, + latency_data=latency_data, + power_data=power_data, + look_ahead_replicas=look_ahead_replicas, + )) + + # Option 2: Add a model instance of + for model in Model: + if model not in workflow.models: + continue + for gpu_type in gpu_types: + actions.extend(_gen_add_instance( + models=models, + num_gpus=num_gpus, + remaining_gpus=remaining_gpus[gpu_type], + gpu_type=gpu_type, + model_name=model, + workflow=workflow, + policy=policy, + latency_data=latency_data, + power_data=power_data, + look_ahead_replicas=look_ahead_replicas, + )) + + if allow_removal: + # Option 3: Remove replicas for each model allocation + for model in Model: + if model not in workflow.models: + continue + for gpu_type in gpu_types: + model_instances = models[gpu_type][model] + for alloc_id in range(len(model_instances)): + action = _gen_remove_replica_action( + models=models, + num_gpus=num_gpus, + gpu_type=gpu_type, + model_name=model, + allocation_id=alloc_id, + workflow=workflow, + policy=policy, + latency_data=latency_data, + power_data=power_data, + ) + if action: + actions.append(action) + + if allow_merging: + # Option 4: Merge across model allocations + for model in Model: + if model not in workflow.models: + continue + for gpu_type in gpu_types: + actions.extend(_gen_merge_replicas_actions( + models=models, + num_gpus=num_gpus, + gpu_type=gpu_type, + model_name=model, + workflow=workflow, + policy=policy, + latency_data=latency_data, + power_data=power_data, + )) + + return actions + + +def 
_get_min_device_combinations( + num_gpus: int, + model: Model, +) -> list[tuple[int, int]]: + """ + Get the minimum device combinations for a given number of GPUs and model. + [(device_count, num_replicas), ...] + For example, for 64, it would return [(40, 1), (16, 1)]. + """ + remaining = num_gpus + result: list[int] = [] + for size in sorted(DEVICE_OPTIONS[model], reverse=True): + while remaining >= size: + result.append(size) + remaining -= size + if remaining > 0: + raise ValueError(f"Cannot exactly decompose {num_gpus} with DEVICE_OPTIONS") + counts = Counter(result) + return sorted(counts.items(), reverse=True) # Sort by device count descending + + +def _get_large_instance_many_small_combinations( + num_gpus: int, + model: Model, +) -> list[tuple[int, int]]: + """ + Get the largest instance possible and then split the rest into 1 GPU instances. + For example, for 64, it would return [(40, 1), (1, 16)]. + """ + assert num_gpus > 0 + assert model in DEVICE_OPTIONS + assert DEVICE_OPTIONS[model][0] == 1 # must have 1 GPU option to use this function + + remaining_gpus = num_gpus + result: list[tuple[int, int]] = [] + for size in sorted(DEVICE_OPTIONS[model], reverse=True): + if remaining_gpus >= size: + result = [(size, 1)] + remaining_gpus -= size + break + if remaining_gpus > 0: + result.append((1, remaining_gpus)) + return result + + +def _gen_add_device_replica_actions( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + num_gpus: dict[GPUType, int], + remaining_gpus: int, + gpu_type: GPUType, + model_name: Model, + allocation_id: int, + workflow: WorkflowConfig, + policy: Policy, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + look_ahead_replicas: int = 3, +) -> list[Action]: + """ + Generate actions that explore all valid (replicas, devices) provisioning + options for a given model allocation, using the remaining GPUs. + + From the current replicas * devices, find the next options by distributing the remaining devices. 
+ For example, if currently 2 replicas at parallelism 4 with 4 remaining devices, options include: + - 3 replicas, 4 devices (uses 12 total, 4 more than current 8) + - 1 replica, 10 devices (uses 10 total, 2 more than current 8) + - etc. + """ + actions: list[Action] = [] + + if model_name in SINGLE_DEVICE_MODELS and _is_single_instance(model_name, workflow): + return actions # No scaling possible + + alloc = models[gpu_type][model_name][allocation_id] + current_total = alloc.devices * max(alloc.replicas, 0) + current_replicas = alloc.replicas + total_available = current_total + remaining_gpus + + max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name) + max_replicas = alloc.get_max_replicas(workflow) + is_single_instance = _is_single_instance(model_name, workflow) + is_single_device = model_name in SINGLE_DEVICE_MODELS + + seen: set[tuple[int, int]] = set() + seen.add((max(alloc.replicas, 0), alloc.devices)) # skip current config + + for new_devices in DEVICE_OPTIONS[model_name]: + if new_devices > max_num_devices: + continue # Exceeds max parallelism from latency data + if is_single_device and new_devices > 1: + continue # Model only supports single device + if (model_name, new_devices) not in latency_data[gpu_type]: + continue # No latency data for this device count + + # Determine the range of replicas possible with this device count + if is_single_instance: + replica_candidates = [1] + else: + max_r = min(max_replicas, total_available // new_devices) if new_devices > 0 else 0 + # limit max replicas to original replicas + X to avoid too many combinations + max_r = min(max_r, current_replicas + look_ahead_replicas) + replica_candidates = list(range(1, max_r + 1)) + + for new_replicas in replica_candidates: + new_total = new_replicas * new_devices + if new_total <= current_total: + continue # Must be an increase + if new_total > total_available: + continue # Not enough GPUs + if (new_replicas, new_devices) in seen: + continue + 
seen.add((new_replicas, new_devices)) + + try: + new_models = deepcopy(models) + new_models[gpu_type][model_name][allocation_id] = get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=new_devices, + replicas=new_replicas, + ) + action_result = evaluate_model_allocation( + models=new_models, + num_gpus=num_gpus, + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + policy=policy, + include_models=[model_name], + ) + actions.append(Action( + name=ActionName.ADD_DEVICE_REPLICA, + model=model_name, + gpu_type=gpu_type, + models=new_models, + action_result=action_result, + arrival_time_s=alloc.time, + )) + except Exception: + pass # Invalid configuration, skip + + return actions + + +def _gen_add_device_action( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + num_gpus: dict[GPUType, int], + remaining_gpus: int, + gpu_type: GPUType, + model_name: Model, + allocation_id: int, + workflow: WorkflowConfig, + policy: Policy, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, +) -> Optional[Action]: + """ + Action to add devices (increase parallelism) for a specific model allocation. 
+ """ + action: Optional[Action] = None + + if model_name in SINGLE_DEVICE_MODELS: + return action # These models only run on a single GPU, so we don't add more devices + + alloc = models[gpu_type][model_name][allocation_id] + + max_num_devices = latency_data[gpu_type].get_max_parallelism(model_name) + next_num_devices = find_next_devices( + DEVICE_OPTIONS[model_name], + num_devices=alloc.devices, + num_replicas=alloc.replicas, + remaining_devices=remaining_gpus, + max_num_devices=max_num_devices) + + if not next_num_devices: + return action # No valid next device option, skip + if (model_name, next_num_devices) not in latency_data[gpu_type]: + return action # No latency data for this device option, skip + + new_models = deepcopy(models) + new_models[gpu_type][model_name][allocation_id] = get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=next_num_devices, + replicas=max(1, alloc.replicas), + ) + try: + action_result = evaluate_model_allocation( + models=new_models, + num_gpus=num_gpus, + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + policy=policy, + include_models=[model_name], + ) + action = Action( + name=ActionName.ADD_DEVICE, + model=model_name, + gpu_type=gpu_type, + models=new_models, + action_result=action_result, + arrival_time_s=alloc.time, + ) + except Exception: + pass # Invalid action + + return action + + +def _gen_merge_replicas_actions( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + gpu_type: GPUType, + model_name: Model, + num_gpus: dict[GPUType, int], + workflow: WorkflowConfig, + policy: Policy, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, +) -> list[Action]: + actions: list[Action] = [] + + if _is_single_instance(model_name, workflow): + return actions # These models only support a single instance, so no need to merge + + model_instances = models[gpu_type][model_name] + model_num_gpus = 0 + for model_instance in model_instances: + model_num_gpus += 
model_instance.get_num_gpus() + if model_num_gpus <= 1: + return actions # No replicas to merge for this model and GPU type + + for device_combos in [ + _get_min_device_combinations(model_num_gpus, model_name), + _get_large_instance_many_small_combinations(model_num_gpus, model_name) + ]: + new_models = deepcopy(models) + new_models[gpu_type][model_name] = [] + + for new_num_devices, new_num_replicas in device_combos: + new_models[gpu_type][model_name].append(get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=new_num_devices, + replicas=new_num_replicas, + )) + + try: + action_result = evaluate_model_allocation( + models=new_models, + num_gpus=num_gpus, + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + policy=policy, + include_models=[model_name], + ) + + instance_id = 0 + actions.append(Action( + name=ActionName.MERGE, + model=model_name, + gpu_type=gpu_type, + models=new_models, + action_result=action_result, + arrival_time_s=new_models[gpu_type][model_name][instance_id].time, + )) + except Exception: + pass # Invalid action + + return actions + + +def _gen_add_instance( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + num_gpus: dict[GPUType, int], + remaining_gpus: int, + gpu_type: GPUType, + model_name: Model, + workflow: WorkflowConfig, + policy: Policy, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + look_ahead_replicas: int = 3, +) -> list[Action]: + actions: list[Action] = [] + + if _is_single_instance(model_name, workflow): + return actions # These models only support a single instance, so we don't add more + + for new_num_devices in DEVICE_OPTIONS[model_name]: + for new_num_replicas in list(range(1, look_ahead_replicas + 1)): + new_instance = get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=new_num_devices, + replicas=new_num_replicas, + ) + if new_instance.get_num_gpus() > remaining_gpus: + continue # Not enough remaining GPUs for this new 
instance + + new_models = deepcopy(models) + new_models[gpu_type][model_name].append(new_instance) + + try: + action_result = evaluate_model_allocation( + models=new_models, + num_gpus=num_gpus, + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + policy=policy, + include_models=[model_name], + ) + action = Action( + name=ActionName.ADD_INSTANCE, + model=model_name, + gpu_type=gpu_type, + models=new_models, + action_result=action_result, + arrival_time_s=new_instance.time, + ) + actions.append(action) + except Exception: + pass # Invalid action + + return actions + + +def _gen_remove_replica_action( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + num_gpus: dict[GPUType, int], + gpu_type: GPUType, + model_name: Model, + allocation_id: int, + workflow: WorkflowConfig, + policy: Policy, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, +) -> Optional[Action]: + action: Optional[Action] = None + + model = models[gpu_type][model_name][allocation_id] + + if model.replicas == 0: + return action # No replicas to remove for this model and GPU type + + new_models = deepcopy(models) + new_models[gpu_type][model_name][allocation_id] = get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=model.devices, + replicas=model.replicas - 1, + ) + + if len(num_gpus) == 2: + # For dual GPU setting, initialize removed replica on the other GPU type to see if it improves performance + gpu_types = list(num_gpus.keys()) + other_gpu_type = gpu_types[0] if gpu_type == gpu_types[1] else gpu_types[1] + if _is_single_instance(model_name, workflow): + if new_models[gpu_type][model_name][allocation_id].replicas == 0: + # If this is a single instance model and we're removing the only replica, add it to the other GPU type + new_models[other_gpu_type][model_name].append(get_model_allocation( + model=model_name, + gpu_type=other_gpu_type, + devices=model.devices, + replicas=1, + )) + + try: + action_result = 
evaluate_model_allocation( + models=new_models, + num_gpus=num_gpus, + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + policy=policy, + include_models=[model_name], + ) + action = Action( + name=ActionName.REMOVE_REPLICA, + model=model_name, + gpu_type=gpu_type, + models=new_models, + action_result=action_result, + arrival_time_s=new_models[gpu_type][model_name][allocation_id].time, + ) + except Exception: + pass # Ignore not possible action + return action + + +def _gen_add_replica_action( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + num_gpus: dict[GPUType, int], + remaining_gpus: int, + gpu_type: GPUType, + model_name: Model, + allocation_id: int, + workflow: WorkflowConfig, + policy: Policy, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, +) -> Optional[Action]: + """ + Action to add replicas for a specific model allocation. + """ + action: Optional[Action] = None + + if _is_single_instance(model_name, workflow): + return action # These models don't support replication, so we skip + + model = models[gpu_type][model_name][allocation_id] + + if remaining_gpus < model.devices: + return action # Not enough remaining GPUs to add another replica + + max_replicas = model.get_max_replicas(workflow) + if model.replicas >= max_replicas: + return action # Already at max replicas, skip + + new_num_replicas = min( + model.replicas + 1, + max_replicas, # - models[other_gpu_type][Model.HF].replicas + model.replicas + remaining_gpus // model.devices + ) + if new_num_replicas == model.replicas: + return action # No changes, skip + + new_models = deepcopy(models) + new_models[gpu_type][model_name][allocation_id] = get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=model.devices, + replicas=new_num_replicas, + ) + + try: + action_result = evaluate_model_allocation( + models=new_models, + num_gpus=num_gpus, + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + 
policy=policy, + include_models=[model_name], + ) + action = Action( + name=ActionName.ADD_REPLICA, + model=model_name, + gpu_type=gpu_type, + models=new_models, + action_result=action_result, + arrival_time_s=model.time, + ) + except Exception: + pass # Invalid action + + return action + + +def max_time( + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + model_name: Model, +) -> float: + values = [] + for models_gpu in models.values(): + if model_name in models_gpu: + for alloc in models_gpu[model_name]: + values.append(alloc.time) + return max(values) diff --git a/simulator/auto_model_allocator.py b/simulator/auto_model_allocator.py new file mode 100644 index 00000000..3ca86cb7 --- /dev/null +++ b/simulator/auto_model_allocator.py @@ -0,0 +1,109 @@ +""" +Factory helpers for selecting the right model allocator implementation. +""" + +from __future__ import annotations + +import logging + +from dataclasses import replace +from typing import Optional + +from sim_types import Policy +from sim_types import WorkflowConfig +from sim_types import LatencyData +from sim_types import Model +from sim_types import PowerData +from sim_types import QualityLevel +from sim_types import Solver +from sim_types import GPUType +from sim_types import Result + +from model_provisioner.policies import STREAMWISE_POLICY + +from model_allocator import ModelAllocator + + +class AutoModelAllocator(ModelAllocator): + """Allocator wrapper that routes to a concrete allocator by solver.""" + + policy: Policy + + def __init__( + self, + workflow: WorkflowConfig, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + policy: Policy = STREAMWISE_POLICY, + ) -> None: + super().__init__( + workflow=workflow, + latency_data=latency_data, + power_data=power_data, + policy=policy, + ) + self._allocator = self._build_allocator() + + def _build_allocator(self) -> ModelAllocator: + """Create concrete allocator based on configured solver.""" + if self.policy.solver == 
Solver.GREEDY: + from model_provisioner.greedy import GreedyAllocator + return GreedyAllocator( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + ) + if self.policy.solver == Solver.NAIVE: + from model_provisioner.naive_baseline import NaiveAllocator + return NaiveAllocator( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + ) + if self.policy.solver in {Solver.GUROBI, Solver.HIGHS}: + from model_provisioner.milp import MILPAllocator + return MILPAllocator( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + ) + if self.policy.solver == Solver.HEXGEN: + from model_provisioner.hexgen import HexGenAllocator + return HexGenAllocator( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + ) + if self.policy.solver == Solver.HELIX: + from model_provisioner.helix import HelixAllocator + return HelixAllocator( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + ) + raise ValueError(f"Unsupported solver for allocator selection: {self.policy.solver}") + + def allocate( + self, + num_gpus: dict[GPUType, int], + verbose: bool = False, + ) -> Result: + if self.policy.use_upscaler and self.workflow.target_resolution == QualityLevel.LOW: + logging.warning( + f"Policy {self.policy.name} uses upscaler, but workflow target resolution is LOW. 
" + f"Disabling upscaler for this allocation.") + self.policy = replace(self.policy, use_upscaler=False) + self._allocator.policy = self.policy + # Remove upscaler from model work + self.workflow.model_work.pop(Model.UPSCALER, None) + self._allocator.workflow = self.workflow + + return self._allocator.allocate( + num_gpus=num_gpus, + verbose=verbose, + ) diff --git a/simulator/constants.py b/simulator/constants.py new file mode 100644 index 00000000..bb6f9034 --- /dev/null +++ b/simulator/constants.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import math + +from sim_types import WorkflowConfig +from sim_types import GPUType +from sim_types import Model + + +SECONDS_IN_MINUTE = 60.0 +SECONDS_IN_HOUR = 60.0 * 60.0 + +# Video resolution constants (16:10) +NUM_PIXELS_ORIGINAL = 1280 * 800 +NUM_PIXELS_ORIGINAL_FLUX = 1280 * 800 +NUM_PIXELS_ORIGINAL_HF = 512 * 320 +NUM_PIXELS_ORIGINAL_FT = 640 * 400 +NUM_PIXELS_ORIGINAL_UPSCALER = 1280 * 800 + +NUM_PIXELS_MEDIUM = 640 * 400 +NUM_PIXELS_MEDIUM_FLUX = 640 * 400 +NUM_PIXELS_MEDIUM_HF = 256 * 160 +NUM_PIXELS_MEDIUM_FT = 320 * 200 +NUM_PIXELS_MEDIUM_UPSCALER = 640 * 400 + +NUM_PIXELS_LOW = 320 * 200 +NUM_PIXELS_LOW_FLUX = 320 * 200 +NUM_PIXELS_LOW_HF = 128 * 80 +NUM_PIXELS_LOW_FT = 160 * 100 +NUM_PIXELS_LOW_UPSCALER = 320 * 200 + +# StreamCast constants +TOTAL_INPUT_TOKENS = 20 * 1024 # 20K tokens for instructions, PDFs, etc. 
+TOTAL_VIDEO_SECONDS = 10 * 60 # 10 minutes video +TOTAL_SUBSCENES = 172 # each subscene is 3.5 seconds -> limited by fantasytalking 81 frames at 23 FPS +TOTAL_SCENES = 43 # each scene is 4 subscenes +FPS: dict[Model, float] = { + Model.HF: 30, + Model.FT: 23, +} +NUM_STEPS: dict[Model, int] = { + Model.FLUX: 25, + Model.HF: 10, + Model.FT: 10, +} +FRAMES_OPTIONS: dict[Model, list[int]] = { + Model.HF: [36, 72, 108, 144, 324], + Model.FT: [9, 21, 41, 61, 77], +} +FRAMES_PER_STEP_IDX = 4 + +DEFAULT_WORKFLOW_CONFIG = WorkflowConfig( + total_video_seconds=TOTAL_VIDEO_SECONDS, + total_scenes=TOTAL_SCENES, + total_frames={ + Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF]), + Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT]), + }, + total_subscenes=TOTAL_SUBSCENES, + per_subscene_frames={ + Model.HF: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.HF] / TOTAL_SUBSCENES), + Model.FT: math.ceil(TOTAL_VIDEO_SECONDS * FPS[Model.FT] / TOTAL_SUBSCENES), + }, + # default per-frame number of denoising steps + num_steps=dict(NUM_STEPS), + # supported number of generation frames + hf_frames=FRAMES_OPTIONS[Model.HF], + ft_frames=FRAMES_OPTIONS[Model.FT], + frames_per_step_idx=FRAMES_PER_STEP_IDX, + total_input_tokens=TOTAL_INPUT_TOKENS, +) + +# Available device counts for scaling +# Tensor parallelism (TP) or sequence parallelism (SP) +DEVICE_OPTIONS = { + Model.GEMMA: [1, 2, 4, 8], + Model.FLUX: [1, 2, 4, 8, 16], + Model.OTHERS: [1], # Single GPU, no parallelism + Model.HF: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40], + Model.HF_VAE: [1], # Single GPU, no parallelism + Model.FT: [1, 2, 4, 8, 10, 16, 20, 24, 32, 40], + Model.FT_VAE: [1], # Single GPU, no parallelism + Model.UPSCALER: [1, 2, 4, 8], # Single GPU, no parallelism +} + +# Models that only have one instance in the system, so not scaling them across GPU types +SINGLE_INSTANCE_MODELS = [ + Model.GEMMA, + Model.FLUX, + Model.OTHERS, +] + +# Models that can only be run on a single GPU +SINGLE_DEVICE_MODELS = [ + 
Model.OTHERS, + Model.HF_VAE, + Model.FT_VAE, +] + + +NUM_GPUS_PER_SERVER = { + GPUType.A100: 8, + GPUType.H100: 8, + GPUType.H200: 8, + GPUType.GB200: 8, # This is technically 4 GPUs per server, but nothing fits +} + + +POWER_GPU_IDLE = { + GPUType.A100: 65.0, # Watts + GPUType.H100: 80.0, # Watts TODO placeholder value + GPUType.H200: 80.0, # Watts TODO placeholder value + GPUType.GB200: 170.0, # Watts +} + + +POWER_GPU_TDP = { + GPUType.A100: 400.0, # Watts + GPUType.H100: 700.0, # Watts + GPUType.H200: 700.0, # Watts + GPUType.GB200: 1200.0, # Watts +} + + +# Cost per GPU +GPU_SPOT_COST = { + # $ / hour (Spot prices) + GPUType.A100: 1.07, # $8.56 for 8 GPUs + GPUType.H100: 4.03, # $32.24 for 8 GPUs + GPUType.H200: 4.22, # $33.76 for 8 GPUs + GPUType.GB200: 10.76 # $43.04 for 4 GPUs +} + +GPU_RESERVED_COST = { + # $ / hour (Reserved prices) + GPUType.A100: 3.4, # $27.2 for 8 GPUs + GPUType.H100: 5.39, # $43.12 for 8 GPUs + GPUType.H200: 5.64, # $45.12 for 8 GPUs + GPUType.GB200: 14.42 # $57.68 for 4 GPUs +} + +GPU_COST = GPU_SPOT_COST diff --git a/simulator/data_loading.py b/simulator/data_loading.py new file mode 100644 index 00000000..bea78715 --- /dev/null +++ b/simulator/data_loading.py @@ -0,0 +1,300 @@ +""" +Module for loading latency and power consumption data from CSV files. 
+""" + +import pandas as pd + +from pathlib import Path + +from sim_types import LatencyData +from sim_types import PowerData +from sim_types import GPUType +from sim_types import LatencyGPUTypeData +from sim_types import PowerGPUTypeData +from sim_types import QualityLevel + +from constants import NUM_PIXELS_ORIGINAL_UPSCALER +from constants import NUM_PIXELS_ORIGINAL_FT +from constants import NUM_PIXELS_ORIGINAL_HF +from constants import NUM_PIXELS_ORIGINAL_FLUX +from constants import NUM_PIXELS_LOW_FT +from constants import NUM_PIXELS_LOW_HF +from constants import NUM_PIXELS_LOW_FLUX +from constants import NUM_PIXELS_LOW_UPSCALER +from constants import NUM_PIXELS_MEDIUM_FT +from constants import NUM_PIXELS_MEDIUM_HF +from constants import NUM_PIXELS_MEDIUM_UPSCALER +from constants import NUM_PIXELS_MEDIUM_FLUX +from constants import POWER_GPU_IDLE +from constants import POWER_GPU_TDP + +_DEFAULT_DATA_DIR = str(Path(__file__).resolve().parents[2] / "simulator" / "data") + + +def load_latency_data( + data_dir: str = _DEFAULT_DATA_DIR, +) -> LatencyData: + """ + Load latency and throughput mapping data from CSV files. + + Args: + data_dir (str): The directory where the CSV files are stored. + Returns: + LatencyData: An object containing all loaded latency data. 
+ """ + data_path = Path(data_dir) + + data = LatencyData(gpus={}) + for gpu_type in GPUType: + data.gpus[gpu_type] = LatencyGPUTypeData(gpu_type=gpu_type) + + # Flux time -> per image generation + csv_flux_path = data_path / f"latency_flux_mapping_{gpu_type.value.lower()}.csv" + df_flux = pd.read_csv(csv_flux_path, comment='#') + data[gpu_type].flux = dict(zip( + df_flux["world_size"], + df_flux["avg_steps_time"])) + + # Hunyuan Framepack per step time -> [36, 72, 108, 144, 324] frames generation + csv_hf_path = data_path / f"latency_hf_mapping_{gpu_type.value.lower()}.csv" + df_hf = pd.read_csv(csv_hf_path, comment='#') + data[gpu_type].hf = dict(zip( + df_hf["world_size"], + df_hf["avg_steps_time"])) + + # Hunyuan Framepack VAE time -> per inference iteration + # Derived: steps * avg_step_time * vae_pct(vae_time / total_time) + data[gpu_type].hf_vae = dict(zip( + df_hf["world_size"], + df_hf["vae_time"])) + + # Fantasy Talking per step time -> [9, 21, 41, 61, 77] frames generation + csv_ft_path = data_path / f"latency_ft_mapping_{gpu_type.value.lower()}.csv" + df_ft = pd.read_csv(csv_ft_path, comment='#') + data[gpu_type].ft = dict(zip( + df_ft["world_size"], + df_ft["avg_steps_time"])) + + # Fantasy Talking VAE time -> per inference iteration + # Derived: steps * avg_step_time * vae_pct(vae_time / total_time) + data[gpu_type].ft_vae = dict(zip( + df_ft["world_size"], + df_ft["vae_time"])) + + # Upscaler time -> per image frame + csv_upscaler_path = data_path / f"latency_upscaler_{gpu_type.value.lower()}.csv" + df_upscaler = pd.read_csv(csv_upscaler_path, comment='#') + data[gpu_type].upscaler = dict(zip( + df_upscaler['world_size'], + df_upscaler['avg_steps_time'])) + + # Gemma time -> first scene and per scene + csv_gemma_path = data_path / f"latency_gemma_{gpu_type.value.lower()}.csv" + df_gemma = pd.read_csv(csv_gemma_path, comment='#') + data[gpu_type].gemma_first_scene = dict(zip( + df_gemma['tp'], + df_gemma['first_scene_time'])) + 
data[gpu_type].gemma_per_scene = dict(zip( + df_gemma['tp'], + df_gemma['per_scene_time'])) + + # Others time -> kokoro and other overheads -> time per scene + csv_others_path = data_path / f"latency_others_{gpu_type.value.lower()}.csv" + df_others = pd.read_csv(csv_others_path, comment='#') + data[gpu_type].others = dict(zip( + df_others['world_size'], + df_others['time'])) + + return data + + +def load_power_data( + data_dir: str = _DEFAULT_DATA_DIR +) -> PowerData: + """ + Load power consumption data from CSV files. + + Args: + data_dir (str): The directory where the CSV files are stored. + Returns: + PowerData: An object containing all loaded power consumption data. + """ + data_path = Path(data_dir) + + data = PowerData(gpus={}) + for gpu_type in GPUType: + data.gpus[gpu_type] = PowerGPUTypeData(gpu_type=gpu_type) + + # Flux power profile + power_flux_file_name = data_path / f'power_flux_mapping_{gpu_type.value.lower()}.csv' + power_flux_df = pd.read_csv(power_flux_file_name, comment='#') + data[gpu_type].flux = dict(zip( + power_flux_df['world_size'], + power_flux_df['power_watts'])) + + # Hunyuan Framepack 640x400 power profile + power_hf_file_name = data_path / f'power_hf_mapping_{gpu_type.value.lower()}.csv' + power_hf_df = pd.read_csv(power_hf_file_name, comment='#') + data[gpu_type].hf = dict(zip( + power_hf_df['world_size'], + power_hf_df['power_watts'])) + + # Hunyuan Framepack 1280x800 power profile + power_hf_file_name_high = data_path / f'power_hf_mapping_{gpu_type.value.lower()}_high.csv' + power_hf_high_df = pd.read_csv(power_hf_file_name_high, comment='#') + data[gpu_type].hf_high = dict(zip( + power_hf_high_df['world_size'], + power_hf_high_df['power_watts'])) + + # Hunyuan Framepack VAE power profile + power_hf_vae_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}.csv' + power_hf_vae_df = pd.read_csv(power_hf_vae_file_name, comment='#') + data[gpu_type].hf_vae = dict(zip( + power_hf_vae_df['world_size'], + 
power_hf_vae_df['power_watts'])) + + # Hunyuan Framepack VAE high power profile + power_hf_vae_high_file_name = data_path / f'power_hf_vae_{gpu_type.value.lower()}_high.csv' + power_hf_vae_high_df = pd.read_csv(power_hf_vae_high_file_name, comment='#') + data[gpu_type].hf_vae_high = dict(zip( + power_hf_vae_high_df['world_size'], + power_hf_vae_high_df['power_watts'])) + + # Fantasy Talking 640x400 power profile + power_ft_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}.csv' + power_ft_df = pd.read_csv(power_ft_file_name, comment='#') + data[gpu_type].ft = dict(zip( + power_ft_df['world_size'], + power_ft_df['power_watts'])) + + # Fantasy Talking 1280x800 power profile + power_ft_high_file_name = data_path / f'power_ft_mapping_{gpu_type.value.lower()}_high.csv' + power_ft_high_df = pd.read_csv(power_ft_high_file_name, comment='#') + data[gpu_type].ft_high = dict(zip( + power_ft_high_df['world_size'], + power_ft_high_df['power_watts'])) + + # Fantasy Talking VAE mapping + power_ft_vae_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}.csv' + power_ft_vae_df = pd.read_csv(power_ft_vae_file_name, comment='#') + data[gpu_type].ft_vae = dict(zip( + power_ft_vae_df['world_size'], + power_ft_vae_df['power_watts'])) + + # Fantasy Talking VAE high mapping + power_ft_vae_high_file_name = data_path / f'power_ft_vae_mapping_{gpu_type.value.lower()}_high.csv' + power_ft_vae_high_df = pd.read_csv(power_ft_vae_high_file_name, comment='#') + data[gpu_type].ft_vae_high = dict(zip( + power_ft_vae_high_df['world_size'], + power_ft_vae_high_df['power_watts'])) + + # Upscaler power profile + power_upscaler_file_name = data_path / f'power_upscaler_{gpu_type.value.lower()}.csv' + power_upscaler_df = pd.read_csv(power_upscaler_file_name, comment='#') + data[gpu_type].upscaler = dict(zip( + power_upscaler_df['world_size'], + power_upscaler_df['power_watts'])) + + # Gemma power profile + power_gemma_first_scene_file_name = data_path / 
f'power_gemma_first_scene_{gpu_type.value.lower()}.csv' + power_gemma_per_scene_file_name = data_path / f'power_gemma_per_scene_{gpu_type.value.lower()}.csv' + power_gemma_first_scene_df = pd.read_csv(power_gemma_first_scene_file_name, comment='#') + power_gemma_per_scene_df = pd.read_csv(power_gemma_per_scene_file_name, comment='#') + data[gpu_type].gemma_first_scene = dict(zip( + power_gemma_first_scene_df['world_size'], + power_gemma_first_scene_df['power_watts'] + )) + data[gpu_type].gemma_per_scene = dict(zip( + power_gemma_per_scene_df['world_size'], + power_gemma_per_scene_df['power_watts'] + )) + + # Idle and TDP power profiles + for gpu_type in GPUType: + data[gpu_type].idle = POWER_GPU_IDLE[gpu_type] + data[gpu_type].tdp = POWER_GPU_TDP[gpu_type] + + return data + + +def load_adaptive_quality_data( + data_dir: str, + level: QualityLevel, +) -> LatencyData: + """Load latency data for adaptive quality.""" + assert isinstance(level, QualityLevel) + + latency_data = load_latency_data(data_dir=data_dir) + + if level == QualityLevel.ORIGINAL or level == QualityLevel.HIGH: + return latency_data + + if level == QualityLevel.MEDIUM: + ratio_flux = NUM_PIXELS_MEDIUM_FLUX / NUM_PIXELS_ORIGINAL_FLUX + ratio_hf = NUM_PIXELS_MEDIUM_HF / NUM_PIXELS_ORIGINAL_HF + ratio_hf_vae = NUM_PIXELS_MEDIUM_HF / NUM_PIXELS_ORIGINAL_HF + ratio_ft = NUM_PIXELS_MEDIUM_FT / NUM_PIXELS_ORIGINAL_FT + ratio_ft_vae = NUM_PIXELS_MEDIUM_FT / NUM_PIXELS_ORIGINAL_FT + ratio_upscaler = NUM_PIXELS_MEDIUM_UPSCALER / NUM_PIXELS_ORIGINAL_UPSCALER + for gpu_type in GPUType: + latency_data[gpu_type].flux = { + k: v * ratio_flux + for k, v in latency_data[gpu_type].flux.items() + } + latency_data[gpu_type].hf = { + k: v * ratio_hf + for k, v in latency_data[gpu_type].hf.items() + } + latency_data[gpu_type].hf_vae = { + k: v * ratio_hf_vae + for k, v in latency_data[gpu_type].hf_vae.items() + } + latency_data[gpu_type].ft = { + k: v * ratio_ft + for k, v in latency_data[gpu_type].ft.items() + } + 
latency_data[gpu_type].ft_vae = { + k: v * ratio_ft_vae + for k, v in latency_data[gpu_type].ft_vae.items() + } + latency_data[gpu_type].upscaler = { + k: v * ratio_upscaler + for k, v in latency_data[gpu_type].upscaler.items() + } + return latency_data + + if level == QualityLevel.LOW: + ratio_flux = NUM_PIXELS_LOW_FLUX / NUM_PIXELS_ORIGINAL_FLUX + ratio_hf = NUM_PIXELS_LOW_HF / NUM_PIXELS_ORIGINAL_HF + ratio_hf_vae = NUM_PIXELS_LOW_HF / NUM_PIXELS_ORIGINAL_HF + ratio_ft = NUM_PIXELS_LOW_FT / NUM_PIXELS_ORIGINAL_FT + ratio_ft_vae = NUM_PIXELS_LOW_FT / NUM_PIXELS_ORIGINAL_FT + ratio_upscaler = NUM_PIXELS_LOW_UPSCALER / NUM_PIXELS_ORIGINAL_UPSCALER + for gpu_type in GPUType: + latency_data[gpu_type].flux = { + k: v * ratio_flux + for k, v in latency_data[gpu_type].flux.items() + } + latency_data[gpu_type].hf = { + k: v * ratio_hf + for k, v in latency_data[gpu_type].hf.items() + } + latency_data[gpu_type].hf_vae = { + k: v * ratio_hf_vae + for k, v in latency_data[gpu_type].hf_vae.items() + } + latency_data[gpu_type].ft = { + k: v * ratio_ft + for k, v in latency_data[gpu_type].ft.items() + } + latency_data[gpu_type].ft_vae = { + k: v * ratio_ft_vae + for k, v in latency_data[gpu_type].ft_vae.items() + } + latency_data[gpu_type].upscaler = { + k: v * ratio_upscaler + for k, v in latency_data[gpu_type].upscaler.items() + } + return latency_data + + return latency_data diff --git a/simulator/evaluator.py b/simulator/evaluator.py new file mode 100644 index 00000000..a9730bb2 --- /dev/null +++ b/simulator/evaluator.py @@ -0,0 +1,414 @@ +""" +Evaluate the performance of a given model allocation in terms of time, energy, and cost. +It includes some assertions (e.g., only one instance of Gemma and Flux). 
def _count_instances(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    model: Model,
) -> int:
    """
    Count the allocations of "model" that hold at least one GPU.
    Allocations are counted across all GPU types; entries with zero GPUs
    are ignored.
    """
    return sum(
        1
        for allocations_by_model in models.values()
        for allocation in allocations_by_model.get(model, ())
        if allocation.get_num_gpus() > 0
    )


def _assert_single_instance(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    model: Model,
) -> None:
    """Assert that exactly one allocation of "model" holds GPUs."""
    num_instances = _count_instances(models, model)
    assert num_instances == 1, f"Expected exactly one instance of {model}, but found {num_instances}"


def _assert_at_least_one_instance(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    model: Model,
) -> None:
    """Assert that one or more allocations of "model" hold GPUs."""
    num_instances = _count_instances(models, model)
    assert num_instances > 0, f"Expected at least one instance of {model}, but found {num_instances}"


def _assert_no_instances(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    model: Model,
) -> None:
    """Assert that no allocation of "model" holds GPUs."""
    num_instances = _count_instances(models, model)
    assert num_instances == 0, f"Expected no instances of {model}, but found {num_instances}"


def _capacity_latency(
    gpu_type: GPUType,
    model_name: Model,
    model_alloc: ModelAllocation,
    latency_data: LatencyData,
    workflow: WorkflowConfig,
    policy: Policy,
) -> float:
    """Return the per-unit latency used to weight the work split for one allocation."""
    latency = latency_data[gpu_type][model_name, model_alloc.devices]
    # When not disaggregated, include VAE overhead in capacity
    if model_name == Model.FT and not policy.is_disaggregated(Model.FT):
        latency += latency_data[gpu_type][Model.FT_VAE, 1] / workflow.num_steps[Model.FT]
    if model_name == Model.HF and not policy.is_disaggregated(Model.HF):
        latency += latency_data[gpu_type][Model.HF_VAE, 1] / workflow.num_steps[Model.HF]
    if model_name in (Model.HF, Model.HF_VAE, Model.FT, Model.FT_VAE):
        latency *= workflow.get_resolution_scale(policy.use_upscaler)
    if model_name == Model.GEMMA:
        latency *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
    return latency


def evaluate_times(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    latency_data: LatencyData,
    workflow: WorkflowConfig,
    policy: Policy,
    include_models: Optional[list[Model]] = None,
) -> None:
    """
    Compute the time of every allocation for the given workflow, using the latency data.
    It only evaluates the models specified in "include_models" if provided.

    The results are stored on the allocations themselves (through their
    calculate_time() and calculate_time_first() methods); nothing is returned.
    Raises AssertionError if the allocation is inconsistent with the policy
    (e.g. GPUs assigned to a model the policy disables) or if a required
    model has no GPUs.
    """
    gpu_types = list(models.keys())

    if not policy.use_upscaler:
        upscaler_gpus = sum(
            model_alloc.get_num_gpus()
            for gpu_type in gpu_types
            for model_alloc in models.get(gpu_type, {}).get(Model.UPSCALER, [])
        )
        assert upscaler_gpus == 0, \
            f"Upscaler is disabled by the policy but has {upscaler_gpus} GPUs"

    for model_name in workflow.models:
        if include_models is not None and model_name not in include_models:
            continue

        # Special conditions: models that require a policy flag
        if model_name == Model.HF_VAE and not policy.is_disaggregated(Model.HF):
            _assert_no_instances(models, Model.HF_VAE)
            continue
        if model_name == Model.FT_VAE and not policy.is_disaggregated(Model.FT):
            _assert_no_instances(models, Model.FT_VAE)
            continue
        if model_name == Model.UPSCALER and not policy.use_upscaler:
            _assert_no_instances(models, Model.UPSCALER)
            continue

        _assert_at_least_one_instance(models, model_name)

        if not workflow.is_parallelizable(model_name):
            # Single-instance: no work splitting
            for gpu_type in gpu_types:
                for model_alloc in models[gpu_type].get(model_name, ()):
                    model_alloc.calculate_time(
                        policy, workflow, latency_data)
                    model_alloc.calculate_time_first(
                        policy, workflow, latency_data)
            continue

        # Parallel: capacity-based work splitting (throughput-weighted).
        # Only allocations that hold GPUs contribute capacity.
        capacities: dict[GPUType, list[float]] = {}
        for gpu_type in gpu_types:
            capacities[gpu_type] = []
            for model_alloc in models[gpu_type].get(model_name, ()):
                if model_alloc.get_num_gpus() <= 0:
                    continue
                latency = _capacity_latency(
                    gpu_type, model_name, model_alloc,
                    latency_data, workflow, policy)
                if latency == 0:
                    capacities[gpu_type].append(0.0)
                else:
                    capacities[gpu_type].append(model_alloc.replicas / latency)

        total_capacity = sum(sum(c) for c in capacities.values())
        for gpu_type in gpu_types:
            cap_idx = 0  # Index into capacities[gpu_type]; advances only for GPU-holding allocations
            for model_alloc in models[gpu_type].get(model_name, ()):
                if model_alloc.get_num_gpus() > 0:
                    work_pct = capacities[gpu_type][cap_idx] / total_capacity if total_capacity > 0 else 0.0
                    cap_idx += 1
                else:
                    # Re-evaluate GPU-less allocations too (as the single-instance
                    # branch does) so a time left over from a previous evaluation
                    # cannot leak into calc_total_time().
                    work_pct = 0.0
                model_alloc.calculate_time(
                    policy, workflow, latency_data,
                    work_pct=work_pct)
                model_alloc.calculate_time_first(
                    policy, workflow, latency_data)


def evaluate_energy(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    power_data: PowerData,
    workflow: WorkflowConfig,
    total_time_s: float = 0.0,
) -> None:
    """
    Calculate the energy of every allocation (stored on the allocation itself).
    Need to run after evaluate_times since energy calculation depends on time.
    """
    for allocations_by_model in models.values():
        for allocations in allocations_by_model.values():
            for allocation in allocations:
                allocation.calculate_energy(
                    workflow,
                    power_data,
                    total_time_s)
# Results memoized by evaluate_model_allocation() when cache_results=True
_EVALUATOR_CACHE: dict[str, Result] = {}


def evaluate_model_allocation(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    num_gpus: dict[GPUType, int],
    workflow: WorkflowConfig,
    latency_data: LatencyData,
    power_data: Optional[PowerData],
    policy: Policy,
    include_models: Optional[list[Model]] = None,
    cache_results: bool = False,
    round_up_cost_to_server: bool = False,
) -> Result:
    """
    Evaluate the metrics for a given allocation of models to GPUs.
    It only evaluates the models in "include_models" if specified.

    When "cache_results" is set, the result is memoized under a key built
    from every input that influences it. Energy is only computed when
    "power_data" is provided (otherwise it is reported as 0.0).
    Raises AssertionError if the allocation uses more GPUs than available,
    the GPU count is not a whole number of servers, or an allocation is
    filed under the wrong model or GPU type.
    """
    cache_key = None
    if cache_results:
        # num_gpus and round_up_cost_to_server are part of the key because
        # they change gpus_total and cost in the returned Result.
        cache_key = "".join((
            models_to_json(models),
            workflow_to_json(workflow),
            str(latency_data),
            str(power_data),
            policy_to_json(policy),
            str(include_models),
            str(num_gpus),
            str(round_up_cost_to_server),
        ))
        if cache_key in _EVALUATOR_CACHE:
            return _EVALUATOR_CACHE[cache_key]

    # Check if setup is possible
    gpus_used = {}
    for gpu_type, model_gpu in models.items():
        gpus_used[gpu_type] = calc_used_gpus({gpu_type: model_gpu})
        assert num_gpus[gpu_type] % NUM_GPUS_PER_SERVER[gpu_type] == 0, \
            f"{gpu_type.value}: {num_gpus[gpu_type]} % {NUM_GPUS_PER_SERVER[gpu_type]}"
        assert gpus_used[gpu_type] <= num_gpus[gpu_type], \
            f"{gpu_type.value}: {gpus_used[gpu_type]} > {num_gpus[gpu_type]}"

    # Assert input models are built correctly
    for gpu_type, allocations_by_model in models.items():
        for model_name, allocations in allocations_by_model.items():
            for allocation in allocations:
                assert allocation.model == model_name
                assert allocation.gpu_type == gpu_type

    # Actual evaluation
    evaluate_times(
        models, latency_data, workflow, policy,
        include_models=include_models,
    )
    time_s = calc_total_time(models)

    first_chunk_time = calc_ttff(models)
    ttff_s = max(
        first_chunk_time,
        time_s - workflow.total_video_seconds
    )

    # Time between frames: spread the time after the first chunk over the
    # remaining frames. With a single subscene there are no remaining frames.
    num_frames = (workflow.total_frames[Model.FT] - workflow.per_subscene_frames[Model.FT])
    tbf_s = (time_s - first_chunk_time) / num_frames if num_frames > 0 else 0.0
    if tbf_s < 0:
        logging.debug(
            "Negative TBF: %.2f = (%.2f - %.2f) / %s",
            tbf_s, time_s, first_chunk_time, num_frames)
        tbf_s = 0.0

    # Calculate total energy (power * time * replicas for each model)
    energy = 0.0
    if power_data is not None:
        evaluate_energy(models, power_data, workflow, time_s)
        energy = calc_energy(models=models)

    evaluate_cost(models, time_s, policy)
    cost = calc_cost(
        models, time_s, policy,
        round_up_to_server=round_up_cost_to_server)

    ret = Result(
        models=models,
        gpus_used=gpus_used,
        gpus_total=num_gpus,
        total_time_s=time_s,
        first_chunk_time=first_chunk_time,
        ttff_s=ttff_s,
        tbf_s=tbf_s,
        total_energy=energy,
        cost=cost,
    )

    if cache_key is not None:
        _EVALUATOR_CACHE[cache_key] = ret

    return ret


def calc_energy(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
) -> float:
    """
    Sum the energy stored on every allocation.
    Energy in Watt x seconds (Joules).
    This assumes that evaluate_energy() has been called already.
    """
    energy = 0.0  # Total energy in Watt-seconds (Joules = Watt x second)
    for allocations_by_model in models.values():
        for allocations in allocations_by_model.values():
            for allocation in allocations:
                energy += allocation.energy
    return energy
def calc_cost(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    time_s: float,
    policy: Policy,
    round_up_to_server: bool = True,
) -> float:
    """
    Calculate total cost based on GPU hours used.
    With "round_up_to_server", the GPUs of each type are rounded up to whole
    servers (packs of GPUs) since we pay for whole servers.
    """
    billed_gpus = calc_used_gpus_per_type(models)
    if round_up_to_server:
        billed_gpus = {
            gpu_type: math.ceil(used / NUM_GPUS_PER_SERVER[gpu_type]) * NUM_GPUS_PER_SERVER[gpu_type]
            for gpu_type, used in billed_gpus.items()
        }
    return calc_cost_total(billed_gpus, time_s, policy)


def calc_cost_total(
    num_gpus: dict[GPUType, int],
    time_s: float,
    policy: Policy,
) -> float:
    """
    Calculate total cost for the given number of GPUs per type over "time_s".
    Every GPU passed in is charged, whether or not it is assigned to a model.
    """
    total = 0.0
    for gpu_type, count in num_gpus.items():
        total += count * (time_s / SECONDS_IN_HOUR) * policy.gpu_cost[gpu_type]
    return total


def calc_used_gpus_per_type(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
) -> dict[GPUType, int]:
    """
    Calculate number of GPUs used per GPU type across all models.
    """
    return {
        gpu_type: sum(
            allocation.get_num_gpus()
            for allocations in allocations_by_model.values()
            for allocation in allocations
        )
        for gpu_type, allocations_by_model in models.items()
    }


def calc_used_gpus(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
) -> int:
    """
    Calculate total number of GPUs used across all models and GPU types.
    """
    return sum(calc_used_gpus_per_type(models).values())


def calc_total_time(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
) -> float:
    """
    Calculate total time: the sum, over models, of the slowest allocation of each model.
    This assumes that evaluate_times() has been called already.
    """
    total_time_secs = 0.0
    for model_name in Model:
        total_time_secs += max(
            (
                allocation.time
                for gpu_type in GPUType
                if gpu_type in models
                for allocation in models[gpu_type].get(model_name, ())
            ),
            default=0.0,
        )
    return total_time_secs


def calc_ttff(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
) -> float:
    """
    Calculate time to first frame (chunk).
    It adds up, over models, the smallest first-chunk time among the
    allocations of that model that hold GPUs.
    This assumes that evaluate_times() has been called already.
    """
    fastest_per_model = []
    for model_name in Model:
        candidates = [
            allocation.time_first
            for allocations_by_model in models.values()
            for allocation in allocations_by_model.get(model_name, ())
            if allocation.get_num_gpus() > 0
        ]
        if candidates:
            # The fastest allocation determines TTFF
            fastest_per_model.append(min(candidates))
    return sum(fastest_per_model)
class ModelAllocator(ABC):
    """
    Abstract base class for model allocators.

    It stores the workflow, latency/power data and policy shared by the
    allocation strategies, and provides helpers that build the initial
    allocation for the supported GPU layouts.
    """

    def __init__(
        self,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
        power_data: Optional[PowerData] = None,
        policy: Policy = NAIVE_POLICY,
    ) -> None:
        self.workflow = workflow
        self.latency_data = latency_data
        self.power_data = power_data
        self.policy = policy

    @abstractmethod
    def allocate(
        self,
        num_gpus: dict[GPUType, int],
        verbose: bool = False,
    ) -> Result:
        """Allocate models to GPUs and return the provisioning result."""
        ...

    def _fold_vae_into_parent(
        self,
        allocations: dict[Model, list[ModelAllocation]],
    ) -> None:
        """Move one replica from each VAE to its parent model when the policy does not disaggregate them."""
        for parent, vae in ((Model.HF, Model.HF_VAE), (Model.FT, Model.FT_VAE)):
            if not self.policy.is_disaggregated(parent):
                # VAE -> parent (HF_VAE -> HF, FT_VAE -> FT)
                allocations[vae][0].replicas -= 1
                allocations[parent][0].replicas += 1

    def _init_single_server_models(
        self,
        gpu_type: GPUType,
    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
        """
        Initialize model allocations for a single server (8 GPUs or fewer).
        Each model gets a single allocation entry.
        """
        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
            gpu_type: {
                Model.GEMMA: [GemmaModelAllocation(gpu_type=gpu_type, devices=1, replicas=1)],
                Model.FLUX: [FluxModelAllocation(gpu_type=gpu_type, devices=1, replicas=1)],
                Model.HF: [HFModelAllocation(gpu_type=gpu_type, devices=1, replicas=2)],
                Model.HF_VAE: [HFVAEModelAllocation(gpu_type=gpu_type, devices=1, replicas=1)],
                Model.FT: [FTModelAllocation(gpu_type=gpu_type, devices=1, replicas=1)],
                Model.FT_VAE: [FTVAEModelAllocation(gpu_type=gpu_type, devices=1, replicas=1)],
                Model.UPSCALER: [UpscalerModelAllocation(gpu_type=gpu_type)],
                # + 1 for Kokoro/YOLO
                Model.OTHERS: [OthersModelAllocation(gpu_type=gpu_type, devices=1, replicas=1)],
            },
        }
        allocations = models[gpu_type]

        if self.policy.use_upscaler:
            # HF -> UPSCALER
            allocations[Model.HF][0].replicas -= 1
            allocations[Model.UPSCALER][0].replicas += 1

        self._fold_vae_into_parent(allocations)
        self._zero_out_unused_models(models)
        return models

    def _init_single_device_models(
        self,
        gpu_type: GPUType,
    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
        """
        Initialize model allocations for a single GPU type with >8 GPUs.
        Each model gets two allocation entries (active and inactive).
        """
        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
            gpu_type: {
                Model.GEMMA: [
                    GemmaModelAllocation(gpu_type=gpu_type, devices=1, replicas=1),
                    GemmaModelAllocation(gpu_type=gpu_type),
                ],
                Model.FLUX: [
                    FluxModelAllocation(gpu_type=gpu_type, devices=1, replicas=1),
                    FluxModelAllocation(gpu_type=gpu_type),
                ],
                Model.HF: [
                    HFModelAllocation(gpu_type=gpu_type, devices=1, replicas=1),
                    HFModelAllocation(gpu_type=gpu_type),
                ],
                Model.HF_VAE: [
                    HFVAEModelAllocation(gpu_type=gpu_type, devices=1, replicas=1),
                    HFVAEModelAllocation(gpu_type=gpu_type),
                ],
                Model.FT: [
                    FTModelAllocation(gpu_type=gpu_type, devices=2, replicas=1),
                    FTModelAllocation(gpu_type=gpu_type),
                ],
                Model.FT_VAE: [
                    FTVAEModelAllocation(gpu_type=gpu_type, devices=1, replicas=1),
                    FTVAEModelAllocation(gpu_type=gpu_type),
                ],
                Model.UPSCALER: [
                    UpscalerModelAllocation(gpu_type=gpu_type),
                    UpscalerModelAllocation(gpu_type=gpu_type),
                ],
                Model.OTHERS: [
                    OthersModelAllocation(gpu_type=gpu_type, devices=1, replicas=1),
                    OthersModelAllocation(gpu_type=gpu_type),
                ],
            },
        }
        allocations = models[gpu_type]

        if self.policy.use_upscaler:
            allocations[Model.UPSCALER][0].replicas = 1

        self._fold_vae_into_parent(allocations)
        self._zero_out_unused_models(models)
        return models

    def _init_both_devices_models(
        self,
        gpu_type1: GPUType,
        gpu_type2: GPUType,
    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
        """
        Initialize model allocations for two GPU types.
        gpu_type1 gets GEMMA, FLUX, OTHERS; gpu_type2 gets HF, VAE, FT, UPSCALER.
        """
        models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {
            gpu_type1: {
                Model.GEMMA: [GemmaModelAllocation(gpu_type=gpu_type1, devices=1, replicas=1)],
                Model.FLUX: [FluxModelAllocation(gpu_type=gpu_type1, devices=1, replicas=1)],
                Model.HF: [],
                Model.HF_VAE: [],
                Model.FT: [],
                Model.FT_VAE: [],
                Model.UPSCALER: [],
                # + 1 for Kokoro/YOLO
                Model.OTHERS: [OthersModelAllocation(gpu_type=gpu_type1, devices=1, replicas=1)],
            },
            gpu_type2: {
                Model.GEMMA: [],
                Model.FLUX: [],
                Model.HF: [HFModelAllocation(gpu_type=gpu_type2, devices=1, replicas=1)],
                Model.HF_VAE: [HFVAEModelAllocation(gpu_type=gpu_type2, devices=1, replicas=1)],
                Model.FT: [FTModelAllocation(gpu_type=gpu_type2, devices=2, replicas=1)],
                Model.FT_VAE: [FTVAEModelAllocation(gpu_type=gpu_type2, devices=1, replicas=1)],
                Model.UPSCALER: [UpscalerModelAllocation(gpu_type=gpu_type2)],
                Model.OTHERS: [],
            },
        }
        allocations = models[gpu_type2]

        self._fold_vae_into_parent(allocations)

        if self.policy.use_upscaler:
            allocations[Model.UPSCALER][0].replicas = 1

        self._zero_out_unused_models(models)
        return models

    def _zero_out_unused_models(
        self,
        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    ) -> None:
        """Zero out replicas for models not in the workflow."""
        workflow_models = self.workflow.models
        # Walk the entries that exist instead of indexing every Model member,
        # so a member without an entry does not raise KeyError.
        for allocations_by_model in models.values():
            for model, allocations in allocations_by_model.items():
                if model in workflow_models:
                    continue
                for alloc in allocations:
                    alloc.replicas = 0
# ModelAllocation Factory
# (forward reference: the alias is only used in annotations)
ModelAllocationCls = Type["ModelAllocation"]

# Maps each model to the ModelAllocation subclass registered for it
_MODEL_ALLOCATION_REGISTRY: dict[Model, ModelAllocationCls] = {}


def register_model(
    model: Model
) -> Callable[[ModelAllocationCls], ModelAllocationCls]:
    """Return a class decorator that registers the class as the allocation for "model"."""
    def _register(allocation_cls: ModelAllocationCls) -> ModelAllocationCls:
        _MODEL_ALLOCATION_REGISTRY[model] = allocation_cls
        return allocation_cls
    return _register


def get_model_allocation(
    *,
    model: Model,
    gpu_type: GPUType,
    devices: int = 1,
    replicas: int = 0,
) -> ModelAllocation:
    """
    Build the registered ModelAllocation for "model".
    Raises ValueError if no class was registered for that model.
    """
    try:
        allocation_cls = _MODEL_ALLOCATION_REGISTRY[model]
    except KeyError:
        raise ValueError(f"No ModelAllocation for model {model}") from None
    return allocation_cls(
        gpu_type=gpu_type,
        devices=devices,
        replicas=replicas,
    )


def _calculate_total_time(
    total_work: float,
    num_replicas: int,
    time_per_work: float,
) -> float:
    """
    Time to finish "total_work" units spread evenly over "num_replicas".
    Returns 0.0 without replicas; never less than one work unit's time.
    """
    if num_replicas <= 0:
        return 0.0
    # We cannot go faster than single work unit time
    return max((total_work / num_replicas) * time_per_work, time_per_work)
@register_model(Model.GEMMA)
class GemmaModelAllocation(ModelAllocation):
    """
    Gemma model allocation.

    Time is modelled with two latencies read from the latency data: one for
    the first scene and one for each following scene. Both are scaled by the
    workflow's input tokens relative to TOTAL_INPUT_TOKENS.
    """
    model: ClassVar[Model] = Model.GEMMA

    @override
    def get_max_replicas(
        self,
        workflow: WorkflowConfig,
    ) -> int:
        """Return the workflow's amount of Gemma work (1 if not specified)."""
        return workflow.model_work.get(Model.GEMMA, 1)

    @override
    def calculate_time(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
        work_pct: float = 1.0,
    ) -> float:
        """
        Compute, store in self.time, and return the total Gemma time.
        "work_pct" is the fraction of the work assigned to this allocation;
        it is only used when the workflow has more than one unit of Gemma work.
        """
        if self.get_num_gpus() == 0:
            self.time = 0.0
            return self.time
        latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices]
        latency_per_scene = latency_data[self.gpu_type].gemma_per_scene[self.devices]
        # Scale both latencies by the input size relative to the reference token count
        latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
        latency_per_scene *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
        total_work = workflow.model_work.get(Model.GEMMA, 1)
        if total_work > 1:
            num_scenes = math.ceil(work_pct * total_work)
            total_time_per_scene = latency_first + latency_per_scene * (num_scenes - 1)
            # Spread the scenes over the replicas, using the average time per scene.
            # NOTE(review): num_scenes is 0 when work_pct is 0, which would make
            # the division below fail - confirm callers never pass that.
            self.time = _calculate_total_time(
                num_scenes,
                self.replicas,
                total_time_per_scene / num_scenes)
        else:
            # Single unit of work: all scenes are processed one after another
            self.time = latency_first + latency_per_scene * (workflow.total_scenes - 1)
        return self.time

    @override
    def calculate_time_first(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
    ) -> float:
        """Compute, store in self.time_first, and return the (token-scaled) first-scene latency."""
        if self.get_num_gpus() == 0:
            self.time_first = 0.0
            return self.time_first
        latency_first = latency_data[self.gpu_type].gemma_first_scene[self.devices]
        latency_first *= workflow.total_input_tokens / TOTAL_INPUT_TOKENS
        self.time_first = latency_first
        return self.time_first

    @override
    def calculate_energy(
        self,
        workflow: WorkflowConfig,
        power_data: Optional[PowerData] = None,
        total_time_s: float = 0.0,
    ) -> float:
        """
        Compute, store in self.energy, and return the energy of this allocation.
        It reads self.time and self.time_first, so the time calculations must
        have run before. Idle power is charged for the part of "total_time_s"
        not covered by self.time.
        """
        if self.get_num_gpus() == 0 or power_data is None:
            self.energy = 0.0
            return self.energy
        # Gemma energy
        latency_first = self.time_first
        # NOTE(review): despite its name this is all the time after the first
        # scene, and it is multiplied by (total_scenes - 1) again below - looks
        # like a possible double count; confirm the intended formula.
        latency_per_scene = max(0.0, self.time - latency_first)
        power_first = power_data[self.gpu_type].gemma_first_scene[self.devices]
        power_per_scene = power_data[self.gpu_type].gemma_per_scene[self.devices]
        self.energy = \
            power_first * latency_first + \
            power_per_scene * latency_per_scene * (workflow.total_scenes - 1)
        # Idle energy
        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
        time_idle = total_time_s - self.time
        if time_idle > 0:
            self.energy += power_idle * time_idle
        return self.energy
@register_model(Model.HF)
class HFModelAllocation(ModelAllocation):
    """
    HunyuanFramePack model allocation.

    When the policy does not disaggregate HF, the HF VAE time is added to
    this allocation's time (the VAE runs on the same GPUs).
    """
    model: ClassVar[Model] = Model.HF

    def _calc_time_per_frame(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
    ) -> float:
        """Return latency x resolution scale x number of HF steps."""
        return (
            latency_data[self.gpu_type][self.model, self.devices]
            * workflow.get_resolution_scale(policy.use_upscaler)
            * workflow.num_steps[Model.HF]
        )

    def _calc_time_per_subscene(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
    ) -> float:
        """
        Return the HF time for one subscene: the per-frame time formula
        multiplied by (frames per subscene / frames per step).
        """
        return (
            workflow.per_subscene_frames[Model.HF]
            / workflow.hf_frames[workflow.frames_per_step_idx]
            * latency_data[self.gpu_type][self.model, self.devices]
            * workflow.get_resolution_scale(policy.use_upscaler)  # latency_ratio
            * workflow.num_steps[Model.HF]
        )

    @override
    def calculate_time(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
        work_pct: float = 1.0,
    ) -> float:
        """
        Compute, store in self.time, and return the total HF time.
        "work_pct" is the fraction of the subscenes (and frames, for the VAE
        part) assigned to this allocation.
        """
        if self.get_num_gpus() == 0:
            self.time = 0.0
            return self.time

        hf_time_per_subscene = self._calc_time_per_subscene(
            policy,
            workflow,
            latency_data,
        )
        # Subscenes assigned to this allocation, spread over its replicas
        self.time = _calculate_total_time(
            math.ceil(work_pct * workflow.total_subscenes),
            self.replicas,
            hf_time_per_subscene)

        if not policy.is_disaggregated(Model.HF):
            # Include VAE time in the same GPU when disaggregation is disabled
            hf_vae_time_per_frame = (
                latency_data[self.gpu_type][Model.HF_VAE, 1]  # VAE is single-device only in current policy
                * workflow.get_resolution_scale(policy.use_upscaler)
                / workflow.hf_frames[workflow.frames_per_step_idx]
            )
            self.time += _calculate_total_time(
                math.ceil(work_pct * workflow.total_frames[Model.HF]),
                self.replicas,
                hf_vae_time_per_frame)

        return self.time

    @override
    def calculate_time_first(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
    ) -> float:
        """
        Compute, store in self.time_first, and return the time to the first chunk.
        Disaggregated: the smaller of the first-chunk estimate and a full subscene.
        Not disaggregated: HF plus VAE time for a full subscene.
        """
        if self.get_num_gpus() == 0:
            self.time_first = 0.0
            return self.time_first

        if policy.is_disaggregated(Model.HF):
            # HF for the first chunk
            self.time_first = min(
                # Option 1: the first few frames until the first chunk is done
                workflow.hf_frames[0]
                / workflow.hf_frames[workflow.frames_per_step_idx]
                * self._calc_time_per_frame(
                    policy,
                    workflow,
                    latency_data
                ),
                # Option 2: the full subscene
                self._calc_time_per_subscene(
                    policy,
                    workflow,
                    latency_data
                ),
            )
        else:
            # HF + VAE for the full subscene
            hf_time_per_subscene = self._calc_time_per_subscene(
                policy,
                workflow,
                latency_data)
            hf_vae_time_per_subscene = (
                workflow.per_subscene_frames[Model.HF]
                / workflow.hf_frames[workflow.frames_per_step_idx]
                * latency_data[self.gpu_type][Model.HF_VAE, 1]  # VAE is single-device only in current policy
                * workflow.get_resolution_scale(policy.use_upscaler)
            )
            self.time_first = hf_time_per_subscene + hf_vae_time_per_subscene

        return self.time_first

    @override
    def calculate_energy(
        self,
        workflow: WorkflowConfig,
        power_data: Optional[PowerData] = None,
        total_time_s: float = 0.0,
    ) -> float:
        """
        Compute, store in self.energy, and return the energy of this allocation:
        HF power x self.time x replicas, plus idle power for the part of
        "total_time_s" not covered by self.time. Requires self.time to be set.
        """
        if self.get_num_gpus() == 0 or power_data is None:
            self.energy = 0.0
            return self.energy
        power_hf = power_data[self.gpu_type][Model.HF, self.devices]
        self.energy = power_hf * self.time * self.replicas
        # Idle energy
        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
        time_idle = total_time_s - self.time
        if time_idle > 0:
            self.energy += power_idle * time_idle
        return self.energy

    @override
    def get_max_replicas(
        self,
        workflow: WorkflowConfig,
    ) -> int:
        """Return the workflow's amount of HF work (1 if not specified)."""
        return workflow.model_work.get(Model.HF, 1)
@register_model(Model.FT)
class FTModelAllocation(ModelAllocation):
    """
    FantasyTalking model allocation.

    When the policy does not disaggregate FT, the FT VAE time is added to
    this allocation's time (the VAE runs on the same GPUs).
    """
    model: ClassVar[Model] = Model.FT

    def _calc_time_per_subscene(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
    ) -> float:
        """
        Return the FT time for one subscene: (frames per subscene / frames
        per step) x latency x resolution scale x number of FT steps.
        """
        return (
            workflow.per_subscene_frames[Model.FT]
            / workflow.ft_frames[workflow.frames_per_step_idx]
            * latency_data[self.gpu_type][Model.FT, self.devices]
            * workflow.get_resolution_scale(policy.use_upscaler)
            * workflow.num_steps[Model.FT]
        )

    @override
    def calculate_time(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
        work_pct: float = 1.0,
    ) -> float:
        """
        Compute, store in self.time, and return the total FT time.
        "work_pct" is the fraction of the subscenes (and frames, for the VAE
        part) assigned to this allocation.
        """
        if self.get_num_gpus() == 0:
            self.time = 0.0
            return self.time

        ft_time_per_subscene = self._calc_time_per_subscene(
            policy,
            workflow,
            latency_data,
        )
        # Subscenes assigned to this allocation, spread over its replicas
        self.time = _calculate_total_time(
            math.ceil(work_pct * workflow.total_subscenes),
            self.replicas,
            ft_time_per_subscene)

        if not policy.is_disaggregated(Model.FT):
            # Include VAE time in the same GPU when disaggregation is disabled
            # Note: VAE latency uses devices=1 as VAE processing is not parallelized
            # across multiple devices in the same way as the main FT diffusion
            ft_vae_time_per_frame = (
                latency_data[self.gpu_type][Model.FT_VAE, 1]
                * workflow.get_resolution_scale(policy.use_upscaler)
                / workflow.ft_frames[workflow.frames_per_step_idx]
            )
            self.time += _calculate_total_time(
                math.ceil(work_pct * workflow.total_frames[Model.FT]),
                self.replicas,
                ft_vae_time_per_frame)

        return self.time

    @override
    def calculate_time_first(
        self,
        policy: Policy,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
    ) -> float:
        """
        Compute, store in self.time_first, and return the time to the first
        chunk: one full subscene of FT, plus its VAE time when FT is not
        disaggregated.
        """
        if self.get_num_gpus() == 0:
            self.time_first = 0.0
            return self.time_first

        ft_time_per_subscene = self._calc_time_per_subscene(
            policy,
            workflow,
            latency_data,
        )
        self.time_first = ft_time_per_subscene

        if not policy.is_disaggregated(Model.FT):
            # Include VAE time_first when FT-VAE is not disaggregated
            # Note: VAE latency uses devices=1 (see note in calculate_time)
            ft_vae_time_per_subscene = (
                workflow.per_subscene_frames[Model.FT]
                / workflow.ft_frames[workflow.frames_per_step_idx]
                * latency_data[self.gpu_type][Model.FT_VAE, 1]
                * workflow.get_resolution_scale(policy.use_upscaler)
            )
            self.time_first += ft_vae_time_per_subscene

        return self.time_first

    @override
    def calculate_energy(
        self,
        workflow: WorkflowConfig,
        power_data: Optional[PowerData] = None,
        total_time_s: float = 0.0,
    ) -> float:
        """
        Compute, store in self.energy, and return the energy of this allocation:
        FT power x self.time x replicas, plus idle power for the part of
        "total_time_s" not covered by self.time. Requires self.time to be set.
        """
        if self.get_num_gpus() == 0 or power_data is None:
            self.energy = 0.0
            return self.energy
        power_ft = power_data[self.gpu_type][Model.FT, self.devices]
        self.energy = power_ft * self.time * self.replicas
        # Idle energy
        power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus()
        time_idle = total_time_s - self.time
        if time_idle > 0:
            self.energy += power_idle * time_idle
        return self.energy

    @override
    def get_max_replicas(
        self,
        workflow: WorkflowConfig,
    ) -> int:
        """Return the workflow's amount of FT work (1 if not specified)."""
        return workflow.model_work.get(Model.FT, 1)
_calc_time_per_frame( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + ) -> float: + return ( + latency_data[self.gpu_type][Model.FT_VAE, self.devices] + * workflow.get_resolution_scale(policy.use_upscaler) + / workflow.ft_frames[workflow.frames_per_step_idx] + ) + + @override + def calculate_time( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + work_pct: float = 1.0, + ) -> float: + if not policy.is_disaggregated(Model.FT): + assert self.get_num_gpus() == 0 + self.time = 0.0 + return self.time + if self.get_num_gpus() == 0: + self.time = 0.0 + return self.time + + vae_time_per_frame = self._calc_time_per_frame( + policy, + workflow, + latency_data, + ) + self.time = _calculate_total_time( + math.ceil(workflow.total_frames[Model.FT] * work_pct), + self.replicas, + vae_time_per_frame) + return self.time + + @override + def calculate_time_first( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + ) -> float: + if not policy.is_disaggregated(Model.FT): + assert self.get_num_gpus() == 0 + self.time_first = 0.0 + return self.time_first + if self.get_num_gpus() == 0: + self.time_first = 0.0 + return self.time_first + + vae_time_per_frame = self._calc_time_per_frame( + policy, + workflow, + latency_data, + ) + num_frames = workflow.per_subscene_frames[Model.FT] + self.time_first = num_frames * vae_time_per_frame + return self.time_first + + @override + def calculate_energy( + self, + workflow: WorkflowConfig, + power_data: Optional[PowerData] = None, + total_time_s: float = 0.0, + ) -> float: + if self.get_num_gpus() == 0 or power_data is None: + self.energy = 0.0 + return self.energy + self.energy = power_data[self.gpu_type][Model.FT_VAE, self.devices] * self.time * self.replicas + # Idle energy + power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() + time_idle = total_time_s - self.time + if time_idle > 0: + self.energy += power_idle * time_idle + 
return self.energy + + @override + def get_max_replicas( + self, + workflow: WorkflowConfig, + ) -> int: + return workflow.model_work.get(Model.FT_VAE, 1) + + +@register_model(Model.UPSCALER) +class UpscalerModelAllocation(ModelAllocation): + """Upscaler model allocation.""" + model: ClassVar[Model] = Model.UPSCALER + + @override + def calculate_time( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + work_pct: float = 1.0, + ) -> float: + if self.get_num_gpus() == 0: + self.time = 0.0 + return self.time + self.time = _calculate_total_time( + math.ceil(work_pct * workflow.total_frames[Model.FT]), + self.replicas, + latency_data[self.gpu_type][self.model, self.devices]) + return self.time + + @override + def calculate_time_first( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + ) -> float: + if not policy.use_upscaler: + assert self.get_num_gpus() == 0 + if self.get_num_gpus() == 0: + self.time_first = 0.0 + return self.time_first + + self.time_first = ( + workflow.per_subscene_frames[Model.FT] + * latency_data[self.gpu_type][self.model, self.devices] + ) + return self.time_first + + @override + def calculate_energy( + self, + workflow: WorkflowConfig, + power_data: Optional[PowerData] = None, + total_time_s: float = 0.0, + ) -> float: + if self.get_num_gpus() == 0 or power_data is None: + self.energy = 0.0 + return self.energy + # Assumes a single device and multiple replicas + self.energy = power_data[self.gpu_type][self.model, self.devices] * self.time * self.replicas + # Idle energy + power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() + time_idle = total_time_s - self.time + if time_idle > 0: + self.energy += power_idle * time_idle + return self.energy + + @override + def get_max_replicas( + self, + workflow: WorkflowConfig, + ) -> int: + return workflow.model_work.get(Model.UPSCALER, 1) + + +@register_model(Model.OTHERS) +class OthersModelAllocation(ModelAllocation): + 
"""Others: Kokoro + YOLO.""" + model: ClassVar[Model] = Model.OTHERS + + @override + def calculate_time( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + work_pct: float = 1.0, + ) -> float: + if self.get_num_gpus() == 0: + self.time = 0.0 + return self.time + + self.time = ( + workflow.total_scenes + * latency_data[self.gpu_type][self.model, self.devices] + ) + return self.time + + @override + def calculate_time_first( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + ) -> float: + if self.get_num_gpus() == 0: + self.time_first = 0.0 + return self.time_first + + self.time_first = latency_data[self.gpu_type][self.model, self.devices] + return self.time_first + + @override + def calculate_energy( + self, + workflow: WorkflowConfig, + power_data: Optional[PowerData] = None, + total_time_s: float = 0.0, + ) -> float: + if self.get_num_gpus() == 0 or power_data is None: + self.energy = 0.0 + return self.energy + # Idle energy; not much GPU usage + power_idle = power_data[self.gpu_type]["idle"] * self.get_num_gpus() + self.energy = power_idle * self.time + return self.energy diff --git a/simulator/multirequests.py b/simulator/multirequests.py index 82957c8f..a8d87a8b 100644 --- a/simulator/multirequests.py +++ b/simulator/multirequests.py @@ -4,23 +4,23 @@ import os from dataclasses import replace -from model_provisioner.sim_types import GPUType -from model_provisioner.sim_types import Model -from model_provisioner.sim_types import QualityLevel -from model_provisioner.sim_types import RESOLUTION_PIXELS -from model_provisioner.sim_types import Result -from model_provisioner.sim_types import WorkflowConfig -from model_provisioner.sim_types import LatencyData +from sim_types import GPUType +from sim_types import Model +from sim_types import QualityLevel +from sim_types import RESOLUTION_PIXELS +from sim_types import Result +from sim_types import WorkflowConfig +from sim_types import LatencyData -from 
model_provisioner.data_loading import load_latency_data -from model_provisioner.data_loading import load_power_data -from model_provisioner.data_loading import load_adaptive_quality_data +from data_loading import load_latency_data +from data_loading import load_power_data +from data_loading import load_adaptive_quality_data -from model_provisioner.workflows import PODCAST_WORKFLOW +from workflows import PODCAST_WORKFLOW from model_provisioner.policies import STREAMWISE_POLICY -from model_provisioner.auto_model_allocator import AutoModelAllocator +from auto_model_allocator import AutoModelAllocator # Queries per minute diff --git a/simulator/plot_utils.py b/simulator/plot_utils.py index 2ec13de9..4b0d5849 100644 --- a/simulator/plot_utils.py +++ b/simulator/plot_utils.py @@ -10,12 +10,12 @@ from typing import Optional -from model_provisioner.utils import get_pareto_frontier +from utils import get_pareto_frontier -from model_provisioner.sim_types import ProvisioningResult -from model_provisioner.sim_types import GPUType -from model_provisioner.sim_types import Model -from model_provisioner.sim_types import QualityLevel +from sim_types import ProvisioningResult +from sim_types import GPUType +from sim_types import Model +from sim_types import QualityLevel FIG_SIZE = (7, 5) diff --git a/simulator/provisioning.py b/simulator/provisioning.py index 26e9c8a9..51e1ab11 100644 --- a/simulator/provisioning.py +++ b/simulator/provisioning.py @@ -15,6 +15,15 @@ if _p not in sys.path: sys.path.insert(0, _p) +# Propagate paths to child processes spawned by ProcessPoolExecutor (Windows +# uses 'spawn' which starts a fresh interpreter that reads PYTHONPATH). 
+_EXTRA_PATHS = os.pathsep.join((_REPO_ROOT, _STREAMWISE_DIR, _SIMULATOR_DIR)) +_EXISTING = os.environ.get("PYTHONPATH", "") +if _SIMULATOR_DIR not in _EXISTING: + os.environ["PYTHONPATH"] = ( + _EXTRA_PATHS + os.pathsep + _EXISTING if _EXISTING else _EXTRA_PATHS + ) + from tqdm.auto import tqdm import logging @@ -30,24 +39,24 @@ from concurrent.futures import TimeoutError from concurrent.futures import as_completed -from model_provisioner.sim_types import WorkflowConfig -from model_provisioner.sim_types import GPUType -from model_provisioner.sim_types import LatencyData -from model_provisioner.sim_types import Provision -from model_provisioner.sim_types import ProvisioningResult -from model_provisioner.sim_types import Model -from model_provisioner.sim_types import ModelAllocation -from model_provisioner.sim_types import PowerData -from model_provisioner.sim_types import QualityLevel -from model_provisioner.sim_types import Policy -from model_provisioner.sim_types import Result -from model_provisioner.sim_types import num_gpus_to_str - -from model_provisioner.auto_model_allocator import AutoModelAllocator +from sim_types import WorkflowConfig +from sim_types import GPUType +from sim_types import LatencyData +from sim_types import Provision +from sim_types import ProvisioningResult +from sim_types import Model +from sim_types import ModelAllocation +from sim_types import PowerData +from sim_types import QualityLevel +from sim_types import Policy +from sim_types import Result +from sim_types import num_gpus_to_str + +from auto_model_allocator import AutoModelAllocator from model_provisioner.policies import STREAMWISE_POLICY -from model_provisioner.constants import SECONDS_IN_HOUR +from constants import SECONDS_IN_HOUR GPU_PROVISIONS: list[int] = [ diff --git a/simulator/sim_types.py b/simulator/sim_types.py new file mode 100644 index 00000000..a83cec22 --- /dev/null +++ b/simulator/sim_types.py @@ -0,0 +1,796 @@ +from __future__ import annotations + +import pandas 
as pd +import numpy as np + +from typing import Optional +from typing import ClassVar + +from abc import ABC +from abc import abstractmethod + +from dataclasses import dataclass +from dataclasses import field + +from enum import Enum + + +class GPUType(Enum): + A100 = "A100" + H100 = "H100" + H200 = "H200" + GB200 = "GB200" + + def __lt__(self, other: object) -> bool: + if not isinstance(other, GPUType): + return NotImplemented + order = [GPUType.A100, GPUType.H100, GPUType.H200, GPUType.GB200] + return order.index(self) < order.index(other) + + +class QualityLevel(Enum): + ORIGINAL = "original" + HIGH = "high" + MEDIUM = "medium" + LOW = "low" + + +# Pixel counts per quality level (16:10 aspect ratio). +# Latency data is profiled at MEDIUM resolution. +RESOLUTION_PIXELS: dict[QualityLevel, int] = { + QualityLevel.HIGH: 1280 * 800, + QualityLevel.MEDIUM: 640 * 400, + QualityLevel.LOW: 320 * 200, +} + + +class Model(Enum): + GEMMA = "gemma" + FLUX = "flux" + HF = "hf" # HunyuanFramePack + HF_VAE = "hf_vae" # HunyuanFramePack VAE + FT = "ft" # FantasyTalking + FT_VAE = "ft_vae" # FantasyTalking VAE + UPSCALER = "upscaler" + OTHERS = "others" # YOLO + Kokoro + + +# Used for FIFO +MODEL_ORDER: dict[Model, int] = { + Model.GEMMA: 0, + Model.FLUX: 1, + Model.OTHERS: 2, + Model.HF: 3, + Model.HF_VAE: 4, + Model.FT: 5, + Model.FT_VAE: 6, + Model.UPSCALER: 7, +} + + +@dataclass +class ModelAllocation(ABC): + model: ClassVar[Model] + + # policy TODO + # workflow TODO + gpu_type: GPUType + devices: int = 1 + replicas: int = 0 # No replicas by default + work: int = 0 + time: float = 0.0 + time_first: float = 0.0 + energy: float = 0.0 + cost: float = 0.0 + + def __str__(self) -> str: + if self.replicas <= 0: + assert self.time == 0.0, f"time must be 0 when no replicas, got {self.time:.2f}" + assert self.energy == 0.0, f"energy must be 0 when no replicas, got {self.energy:.2f}" + return "--" + return \ + f"devices={self.devices:2d}, " \ + f"replicas={self.replicas}, " \ + 
f"work={self.work}, " \ + f"time={self.time:.2f} secs, " \ + f"time_first={self.time_first:.2f} secs, " \ + f"energy={self.energy / 60.0 / 60.0:.2f} Wh, " \ + f"cost=${self.cost:.2f}" + + def __repr__(self) -> str: + return self.__str__() + + def __post_init__(self) -> None: + if self.replicas > 0: + return + if self.time != 0.0 or self.energy != 0.0: + raise ValueError( + f"time and energy must be 0.0 when no replicas, got time={self.time:.2f}, energy={self.energy:.2f}") + + def get_num_gpus(self) -> int: + if self.replicas <= 0: + return 0 + return self.devices * self.replicas + + def disable(self) -> None: + self.devices = 0 + self.replicas = 0 + self.time = 0.0 + self.time_first = 0.0 + self.energy = 0.0 + + @abstractmethod + def calculate_time( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + work_pct: float = 1.0, + ) -> float: + ... + + @abstractmethod + def calculate_time_first( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + ) -> float: + ... + + @abstractmethod + def calculate_energy( + self, + workflow: WorkflowConfig, + power_data: Optional[PowerData] = None, + total_time_s: float = 0.0, + ) -> float: + ... 
+ + def calculate_cost( + self, + policy: Policy, + total_time_s: float = 0.0, + ) -> float: + """Calculate the cost for this model allocation.""" + SECONDS_IN_HOUR = 60 * 60 + gpu_cost = policy.gpu_cost[self.gpu_type] + self.cost = total_time_s * (self.get_num_gpus() * gpu_cost) / SECONDS_IN_HOUR + return self.cost + + def calculate( + self, + policy: Policy, + workflow: WorkflowConfig, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + total_time_s: float = 0.0, + work_pct: float = 1.0, + ) -> None: + """Calculate all the values for this model allocation.""" + self.calculate_time(policy, workflow, latency_data, work_pct) + self.calculate_time_first(policy, workflow, latency_data) + self.calculate_cost(policy, total_time_s) + self.calculate_energy(workflow, power_data, total_time_s) + + def get_max_replicas( + self, + workflow: WorkflowConfig, + ) -> int: + """Get the maximum number of replicas that can leverage parallelism.""" + return 1 + + +class Objective(Enum): + FIFO = "fifo" + TIME = "time" + TTFF = "ttff" + COST = "cost" + ENERGY = "energy" + TIME_COST = "time_cost" + TTFF_COST = "ttff_cost" + ENERGY_COST = "energy_cost" + TIME_ENERGY = "time_energy" + RANDOM = "random" + NONE = "none" + + TTFF_THEN_TIME = "ttff_then_time" # first minimize ttff, then minimize time + + def is_monotonic(self) -> bool: + return self not in {Objective.RANDOM, Objective.FIFO} + + +@dataclass +class WorkflowConfig: + total_video_seconds: int + total_scenes: int + total_frames: dict[Model, int] + total_subscenes: int + per_subscene_frames: dict[Model, int] + # default per-frame number of denoising steps + num_steps: dict[Model, int] + # supported number of generation frames + hf_frames: list[int] + ft_frames: list[int] + frames_per_step_idx: int + # target output resolution (default: HIGH) + target_resolution: QualityLevel = QualityLevel.HIGH + + # total input tokens + total_input_tokens: int = 0 + + # work per model (determines parallelism; work > 1 means 
parallelizable across replicas) + # models included in the workflow are derived from the keys of this dict + model_work: dict[Model, int] = field(default_factory=dict) + + @property + def models(self) -> list[Model]: + """Models included in the workflow (derived from model_work keys).""" + return list(self.model_work.keys()) + + @property + def work(self) -> dict[Model, int]: + """Units of work per model (0 for models not in the workflow).""" + return { + model_name: self.model_work.get(model_name, 0) + for model_name in Model + } + + def get_model_order(self) -> list[Model]: + """Get ordered list of models in the workflow, sorted by MODEL_ORDER.""" + return sorted( + [m for m in self.models if m in MODEL_ORDER], + key=lambda m: MODEL_ORDER[m], + ) + + def get_resolution_scale(self, use_upscaler: bool) -> float: + """Compute latency scaling factor based on target resolution. + + Latency data is profiled at MEDIUM resolution. The scale factor + adjusts for the actual generation resolution: + + 1. Upscaler used, HIGH → 1.0 (models generate at MEDIUM) + 2. Upscaler used, MEDIUM → LOW / MEDIUM (models generate at LOW) + 3. No upscaler, HIGH → HIGH / MEDIUM (scale up) + 4. No upscaler, MEDIUM → 1.0 + 5. 
No upscaler, LOW → LOW / MEDIUM (scale down) + """ + if use_upscaler: + assert self.target_resolution in (QualityLevel.HIGH, QualityLevel.MEDIUM), \ + "Upscaler can only be used when target resolution is HIGH or MEDIUM" + if self.target_resolution == QualityLevel.HIGH: + return 1.0 + # MEDIUM target with upscaler: generate at LOW, upscale to MEDIUM + return RESOLUTION_PIXELS[QualityLevel.LOW] / RESOLUTION_PIXELS[QualityLevel.MEDIUM] + if self.target_resolution == QualityLevel.MEDIUM: + return 1.0 + return RESOLUTION_PIXELS[self.target_resolution] / RESOLUTION_PIXELS[QualityLevel.MEDIUM] + + def is_parallelizable(self, model: Model) -> bool: + """Whether the given model can be parallelized across multiple replicas.""" + return self.model_work.get(model, 0) > 1 + + def filter_parallelizable_models( + self, + models: list[Model], + disaggregation: dict[Model, bool], + ) -> list[Model]: + filtered_models = [ + model + for model in models + if self.is_parallelizable(model) + ] + # Remove VAE models when their parent model disaggregation is disabled + if not disaggregation.get(Model.HF, False): + filtered_models = [m for m in filtered_models if m != Model.HF_VAE] + if not disaggregation.get(Model.FT, False): + filtered_models = [m for m in filtered_models if m != Model.FT_VAE] + return filtered_models + + def __post_init__(self) -> None: + assert self.total_frames[Model.HF] > self.per_subscene_frames[Model.HF] + assert self.total_frames[Model.FT] > self.per_subscene_frames[Model.FT] + + # If no models specified, populate defaults for all models + if not self.model_work: + defaults: dict[Model, int] = { + Model.GEMMA: 1, + Model.FLUX: 1, + Model.HF: self.total_subscenes, + Model.HF_VAE: self.total_frames[Model.HF], + Model.FT: self.total_subscenes, + Model.FT_VAE: self.total_frames[Model.FT], + Model.UPSCALER: self.total_frames[Model.FT], + Model.OTHERS: 1, + } + for model, work in defaults.items(): + self.model_work[model] = work + if self.target_resolution != 
QualityLevel.HIGH: + if Model.UPSCALER in self.model_work: + del self.model_work[Model.UPSCALER] + + @property + def num_frames(self) -> int: + """Number of frames generated by the workflow.""" + if Model.FT in self.total_frames: + return self.total_frames[Model.FT] + return 0 + + +class ActionName(Enum): + MERGE = "merge" + ADD_DEVICE = "add device" + ADD_REPLICA = "add replica" + ADD_DEVICE_REPLICA = "add device replica" + ADD_INSTANCE = "add instance" + REMOVE_DEVICE = "remove device" + REMOVE_REPLICA = "remove replica" + + +@dataclass +class Action: + """ + Optimization action to take. + """ + name: ActionName + model: Model + gpu_type: GPUType + models: dict[GPUType, dict[Model, list[ModelAllocation]]] + + action_result: Result = field(repr=False) + + arrival_time_s: float = 0.0 # For FIFO scheduling + + # Derived fields from action_result (not passed by caller) + time: float = field(init=False) # Total execution time + ttff: float = field(init=False) # Time to first frame + cost: float = field(init=False) # Cost in $ + energy: float = field(init=False) # Energy in W*s + + def __post_init__(self) -> None: + # ---- type checks ---- + if not isinstance(self.model, Model): + raise ValueError(f"Model {self.model} [{type(self.model)}] not supported") + if not isinstance(self.name, ActionName): + raise ValueError(f"Action name {self.name} [{type(self.name)}] not supported") + if not isinstance(self.models, dict): + raise ValueError(f"models must be a dict, got {type(self.models)}") + if not isinstance(self.gpu_type, GPUType): + raise ValueError(f"Device type {self.gpu_type} [{type(self.gpu_type)}] not supported") + """ + if not isinstance(self.allocation_id, int) or self.allocation_id < 0: + raise ValueError(f"Allocation ID {self.allocation_id} must be a non-negative integer") + if self.num_replicas <= 0: + raise ValueError(f"num_replicas {self.num_replicas} must be > 0") + if self.num_devices <= 0: + raise ValueError(f"num_devices {self.num_devices} must be > 0") + 
""" + # ---- derive values ---- + self.time = self.action_result.total_time_s + self.ttff = self.action_result.ttff_s + self.cost = self.action_result.cost + self.energy = self.action_result.total_energy + if self.cost < 0.0: + raise ValueError("cost must be >= 0") + + def __str__(self) -> str: + return ( + f"Action(" + f"{self.name.value}, " + f"model={self.model.value}, " + f"gpu={self.gpu_type.value}, " + f"time={self.time:.2f} s, " + f"ttff={self.ttff:.2f} s, " + f"cost=${self.cost:.2f}, " + f"time*cost={self.time_cost():.2f}, " + f"ttff*cost={self.ttff_cost():.2f}, " + f"energy*cost={self.energy_cost():.2f}, " + f"time*energy={self.time_energy():.2f}, " + f"energy={self.energy:.2f} Ws, " + f"models={self.models}" + f")" + ) + + def time_cost(self) -> float: + """We use improvement in time * $.""" + if self.time <= 0: + return self.cost + if self.cost <= 0: + return self.time + return self.time * self.cost + + def ttff_cost(self) -> float: + """We use improvement in TTFF * $.""" + if self.ttff <= 0: + return self.cost + if self.cost <= 0: + return self.ttff + return self.ttff * self.cost + + def energy_cost(self) -> float: + """We use improvement in Wh * $.""" + if self.cost <= 0: + return self.energy + if self.energy <= 0: + return self.cost + return self.energy * self.cost + + def time_energy(self) -> float: + """We use improvement in TTFF * Wh.""" + if self.energy <= 0: + return self.time + if self.time <= 0: + return self.energy + return self.time * self.energy + + def get_order(self) -> int: + " ""For FIFO scheduling."" " + return MODEL_ORDER[self.model] + + def get_metric( + self, + obj: Objective, + switch_objective: bool = False, + ) -> float: + if obj == Objective.RANDOM: + return 0.0 + if obj == Objective.TIME: + return self.time + if obj == Objective.TTFF: + return self.ttff + if obj == Objective.COST: + return self.cost + if obj == Objective.ENERGY: + return self.energy + if obj == Objective.TIME_COST: + return self.time_cost() + if obj == 
Objective.TTFF_COST: + return self.ttff_cost() + if obj == Objective.ENERGY_COST: + return self.energy_cost() + if obj == Objective.TIME_ENERGY: + return self.time_energy() + if obj == Objective.FIFO: + # return self.get_order() + return 0 # TODO + if obj == Objective.TTFF_THEN_TIME: + if switch_objective: + return self.time + else: + return self.ttff + raise ValueError(f"Unknown objective {obj}") + + +@dataclass +class Result: + total_time_s: float = 0.0 + first_chunk_time: float = 0.0 # Time to first chunk + ttff_s: float = 0.0 # Time to first frame (accounts for total time and workflow length) + tbf_s: float = 0.0 # Time between frames + total_energy: float = 0.0 # Watts x second + cost: float = 0.0 # Total $ cost + gpus_used: dict[GPUType, int] = field(default_factory=dict) + gpus_total: dict[GPUType, int] = field(default_factory=dict) + models: dict[GPUType, dict[Model, list[ModelAllocation]]] = field(default_factory=dict) + + def __post_init__(self) -> None: + assert self.total_time_s >= 0.0, f"total_time_s={self.total_time_s} must be >= 0.0" + assert self.first_chunk_time >= 0.0, f"first_chunk_time={self.first_chunk_time} must be >= 0.0" + assert self.ttff_s >= 0.0, f"ttff_s={self.ttff_s} must be >= 0.0" + assert self.tbf_s >= 0.0, f"tbf_s={self.tbf_s} must be >= 0.0" + assert self.total_energy >= 0.0, f"total_energy={self.total_energy} must be >= 0.0" + assert self.cost >= 0.0, f"cost={self.cost} must be >= 0.0" + assert len(self.gpus_used) >= 0, f"gpus_used cannot be empty: {self.gpus_used}" + for gpu_used in self.gpus_used.values(): + assert gpu_used >= 0, f"all gpus_used value {self.gpus_used} must be >= 0" + + def to_csv(self) -> str: + num_a100 = self.gpus_used.get(GPUType.A100, 0) + num_h100 = self.gpus_used.get(GPUType.H100, 0) + num_h200 = self.gpus_used.get(GPUType.H200, 0) + num_gb200 = self.gpus_used.get(GPUType.GB200, 0) + return ( + f"{num_a100},{num_h100},{num_h200},{num_gb200}," + f"{self.ttff_s:.2f},{self.tbf_s:.2f},{self.cost:.2f}," + 
f"{self.total_time_s:.2f},{self.total_energy:.2f}" + ) + + def __str__(self) -> str: + SECONDS_IN_HOUR = 60 * 60 + return ( + f"Time:{self.total_time_s:.2f} s TTFF:{self.ttff_s:.2f} s " + f"Cost:${self.cost:.2f} TTFF*Cost:{self.ttff_s * self.cost:.2f} " + f"Energy:{self.total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh " + f"GPUS: {num_gpus_to_str(self.gpus_used)}" + ) + + def __repr__(self) -> str: + return self.__str__() + + +@dataclass +class LatencyGPUTypeData: + gpu_type: GPUType + # TP -> latency mappings + flux: dict[int, float] = field(default_factory=dict) + hf: dict[int, float] = field(default_factory=dict) + hf_high: dict[int, float] = field(default_factory=dict) + hf_vae: dict[int, float] = field(default_factory=dict) + hf_vae_high: dict[int, float] = field(default_factory=dict) + ft: dict[int, float] = field(default_factory=dict) + ft_high: dict[int, float] = field(default_factory=dict) + ft_vae: dict[int, float] = field(default_factory=dict) + ft_vae_high: dict[int, float] = field(default_factory=dict) + upscaler: dict[int, float] = field(default_factory=dict) + gemma_first_scene: dict[int, float] = field(default_factory=dict) + gemma_per_scene: dict[int, float] = field(default_factory=dict) + others: dict[int, float] = field(default_factory=dict) + + def __getitem__( + self, + key: Model | tuple[Model, int] + ) -> float: + if isinstance(key, tuple): + assert isinstance(key[0], Model) + assert isinstance(key[1], int) + model, num_devices = key + if model == Model.FLUX: + return self.flux[num_devices] + if model == Model.HF: + return self.hf[num_devices] + if model == Model.HF_VAE: + return self.hf_vae[num_devices] + if model == Model.FT: + return self.ft[num_devices] + if model == Model.FT_VAE: + return self.ft_vae[num_devices] + if model == Model.GEMMA: + return self.gemma_first_scene[num_devices] + if model == Model.UPSCALER: + return self.upscaler[num_devices] + if model == Model.OTHERS: + return self.others[num_devices] + raise KeyError(f"Latency for 
model {key} not found") + + def __contains__(self, key: Model | tuple[Model, int]) -> bool: + if isinstance(key, tuple): + assert isinstance(key[0], Model) + assert isinstance(key[1], int) + model, num_devices = key + if model == Model.GEMMA: + return num_devices in self.gemma_first_scene + if model == Model.FLUX: + return num_devices in self.flux + if model == Model.HF: + return num_devices in self.hf + if model == Model.HF_VAE: + return num_devices in self.hf_vae + if model == Model.FT: + return num_devices in self.ft + if model == Model.FT_VAE: + return num_devices in self.ft_vae + if model == Model.UPSCALER: + return num_devices in self.upscaler + if model == Model.HF_VAE: + return num_devices in self.hf_vae + if model == Model.OTHERS: + return num_devices in self.others + return False + + def get_max_parallelism(self, model: Model) -> int: + """Max number of devices supported for the given model.""" + if model == Model.FLUX: + return max(self.flux.keys()) + if model == Model.HF: + return max(self.hf.keys()) + if model == Model.FT: + return max(self.ft.keys()) + if model == Model.FT_VAE: + return max(self.ft_vae.keys()) + if model == Model.GEMMA: + return max(self.gemma_first_scene.keys()) + if model == Model.UPSCALER: + return max(self.upscaler.keys()) + if model == Model.HF_VAE: + return max(self.hf_vae.keys()) + if model == Model.OTHERS: + return max(self.others.keys()) + raise KeyError(f"Model {model} not found in latency data") + + +@dataclass +class PowerGPUTypeData: + gpu_type: GPUType + # TP -> power mappings + flux: dict[int, float] = field(default_factory=dict) + hf: dict[int, float] = field(default_factory=dict) + hf_high: dict[int, float] = field(default_factory=dict) + hf_vae: dict[int, float] = field(default_factory=dict) + hf_vae_high: dict[int, float] = field(default_factory=dict) + ft: dict[int, float] = field(default_factory=dict) + ft_high: dict[int, float] = field(default_factory=dict) + ft_vae: dict[int, float] = field(default_factory=dict) 
+ ft_vae_high: dict[int, float] = field(default_factory=dict) + upscaler: dict[int, float] = field(default_factory=dict) + gemma_first_scene: dict[int, float] = field(default_factory=dict) + gemma_per_scene: dict[int, float] = field(default_factory=dict) + # Other values + idle: float = 0.0 # Idle power in Watts + tdp: float = 0.0 # TDP power in Watts + + def __getitem__( + self, + key: Model | tuple[Model, int] | str + ) -> float: + if isinstance(key, tuple): + assert isinstance(key[0], Model) + assert isinstance(key[1], int) + model, devices = key + if model == Model.FLUX: + return self.flux[devices] + if model == Model.HF: + return self.hf[devices] + if model == Model.HF_VAE: + return self.hf_vae[devices] + if model == Model.FT: + return self.ft[devices] + if model == Model.FT_VAE: + return self.ft_vae[devices] + if model == Model.UPSCALER: + return self.upscaler[devices] + if isinstance(key, str): + if key == "idle": + return self.idle + if key == "tdp": + return self.tdp + raise KeyError(f"Power for {key} not found") + + +@dataclass +class LatencyData: + gpus: dict[GPUType, LatencyGPUTypeData] + + def __getitem__(self, gpu_type: GPUType) -> LatencyGPUTypeData: + return self.gpus[gpu_type] + + def __setitem__( + self, + gpu_type: GPUType, + latency_data: LatencyGPUTypeData + ) -> None: + self.gpus[gpu_type] = latency_data + + +@dataclass +class PowerData: + gpus: dict[GPUType, PowerGPUTypeData] + + def __getitem__(self, gpu_type: GPUType) -> PowerGPUTypeData: + return self.gpus[gpu_type] + + def __setitem__( + self, + gpu_type: GPUType, + power_data: PowerGPUTypeData + ) -> None: + self.gpus[gpu_type] = power_data + + +def num_gpus_to_str( + provision: dict[GPUType, int] +) -> str: + return "+".join([ + f"{num_gpus}x{gpu_type.name}" + for gpu_type, num_gpus in provision.items() + if num_gpus > 0 + ]) + + +@dataclass +class Provision: + num_gpus: dict[GPUType, int] = field(default_factory=dict) + + def __getitem__(self, gpu_type: GPUType) -> int: + return 
@dataclass
class ProvisioningResult:
    """Parallel per-option result lists from a provisioning sweep.

    Index ``i`` of every list describes the same provisioning option; ``save``
    writes one CSV row per index.
    """
    latencies: list[float]  # written as the 'total_time' column
    costs: list[float]
    ttffs: list[float]  # written as the 'ttff_s' column
    tbfs: list[float]  # written as the 'tbf_s' column
    actual_provision: list[dict[GPUType, int]]
    config_provision: list[dict[GPUType, int]]
    model_provision: list[dict[GPUType, dict[Model, list[ModelAllocation]]]]
    qualities: list[float] = field(default_factory=list)
    energies: list[float] = field(default_factory=list)

    def save(
        self,
        policy_name: str,
        results_dir: str,
    ) -> None:
        """Save the provisioning results to a CSV file.

        The file is ``provisioning_<policy>.csv`` inside ``results_dir``, where
        ``<policy>`` is ``policy_name`` lower-cased with spaces and ``/``
        replaced by ``_`` and ``*`` replaced by ``x``. Metric columns are
        rounded to two decimals.

        Args:
            policy_name: Human-readable policy name used to derive the file name.
            results_dir: Output directory, with or without a trailing separator.
        """
        # Local import keeps this block independent of the module's import header.
        import os

        num_a100: list[int] = []
        num_h100: list[int] = []
        num_h200: list[int] = []
        num_gb200: list[int] = []
        for provision in self.actual_provision:
            num_a100.append(provision.get(GPUType.A100, 0))
            num_h100.append(provision.get(GPUType.H100, 0))
            num_h200.append(provision.get(GPUType.H200, 0))
            num_gb200.append(provision.get(GPUType.GB200, 0))
        # NOTE(review): `energies` defaults to an empty list; pandas raises
        # ValueError here when the column lists have different lengths.
        df_latency = pd.DataFrame({
            'num_a100': num_a100,
            'num_h100': num_h100,
            'num_h200': num_h200,
            'num_gb200': num_gb200,
            'ttff_s': self.ttffs,
            'tbf_s': self.tbfs,
            'cost': self.costs,
            'total_time': self.latencies,
            'energy': self.energies,
        })
        metric_columns = ['ttff_s', 'tbf_s', 'cost', 'total_time', 'energy']
        df_latency[metric_columns] = df_latency[metric_columns].round(2)
        policy_name_clean = policy_name.replace(" ", "_").replace("*", "x").replace("/", "_").lower()
        # Join instead of concatenating: "results" + "provisioning_x.csv" used to
        # produce "resultsprovisioning_x.csv" when the separator was missing.
        file_name = os.path.join(results_dir, f"provisioning_{policy_name_clean}.csv")
        df_latency.to_csv(file_name, index=False)

    def get_pareto_frontier(
        self,
        max_x: Optional[float] = None,
        max_y: Optional[float] = None,
    ) -> np.ndarray:
        """Return the Pareto frontier over (``ttffs``, ``costs``).

        ``max_x`` and ``max_y`` are forwarded by keyword to
        ``utils.get_pareto_frontier``.
        """
        from utils import get_pareto_frontier  # TODO this is a lazy fix, we need to reset
        return get_pareto_frontier(
            self.ttffs,
            self.costs,
            max_x=max_x,
            max_y=max_y,
        )
def models_to_json(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]]
) -> str:
    """Render a model allocation as a compact, dict-style string.

    The result maps GPU type name -> model value -> ``{'devices', 'replicas'}``.
    It is produced with ``str()`` on a dict (single-quoted Python repr, not
    strict JSON), with the separator between GPU-type entries tightened.

    NOTE(review): each model keeps only its *last* allocation, and models with
    an empty allocation list are omitted - confirm this is intended.
    """
    rendered = {}
    for gpu_type, per_model in models.items():
        # Later allocations overwrite earlier ones for the same model key.
        rendered[gpu_type.name] = {
            model.value: {
                'devices': allocation.devices,
                'replicas': allocation.replicas,
            }
            for model, allocation_list in per_model.items()
            for allocation in allocation_list
        }
    text = str(rendered)
    return text.replace("}}, '", "}},'")
def policy_to_json(policy: Policy) -> str:
    """Serialize the descriptive fields of a policy to a JSON string.

    Emits ``name``, ``objective`` (via ``str()``), ``disaggregation`` keyed by
    model value, ``use_upscaler`` and ``hardware`` as GPU type names. The
    policy's ``gpu_cost`` and ``solver`` are not part of the output.
    """
    disaggregation = {}
    for model, enabled in policy.disaggregation.items():
        disaggregation[model.value] = enabled

    hardware_names = []
    for gpu in policy.hardware:
        hardware_names.append(gpu.name)

    payload = {
        'name': policy.name,
        'objective': str(policy.objective),
        'disaggregation': disaggregation,
        'use_upscaler': policy.use_upscaler,
        'hardware': hardware_names,
    }
    return json.dumps(payload)
+ """ + records = [] + for gpu_type, model_allocations in models.items(): + for model, allocations in model_allocations.items(): + for allocation in allocations: + if allocation is None or allocation.get_num_gpus() == 0: + continue # Ignoring empty allocations + record = { + "GPU": gpu_type.value, + "Model": model.value, + "Devices": allocation.devices, + "Replicas": allocation.replicas, + "Work": allocation.work, + "#GPUs": allocation.get_num_gpus(), + "Time (s)": allocation.time, + "TTFF (s)": allocation.time_first, + "Energy (kWh)": allocation.energy / (60 * 60) / 1000.0, # Convert to kWh + "Cost ($)": allocation.cost, + } + records.append(record) + df = pd.DataFrame(records) + df = df.set_index(["GPU", "Model"]) + df = df.round(2) + + total = df.sum(numeric_only=True) + total["Time (s)"] = df["Time (s)"].groupby(level="Model").max().sum() + total["TTFF (s)"] = df["TTFF (s)"].groupby(level="Model").min().sum() + total.name = ("TOTAL", "") + df = pd.concat([df, total.to_frame().T]) + + df[["Devices", "Replicas", "#GPUs", "Work"]] = df[["Devices", "Replicas", "#GPUs", "Work"]].astype(int) + + return df + + +def coalesce_models( + models: dict[GPUType, dict[Model, list[ModelAllocation]]] +) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: + """The models with the same parallelism and same work, should be accounted as replicas.""" + merged: dict[GPUType, dict[Model, list[ModelAllocation]]] = {} + for gpu_type, model_dict in models.items(): + merged[gpu_type] = {} + for model_name, allocations in model_dict.items(): + merged_allocations: list[ModelAllocation] = [] + for alloc in allocations: + # Check if there's an existing allocation with the same devices and work + match = next(( + model_alloc + for model_alloc in merged_allocations + if model_alloc.devices == alloc.devices and model_alloc.work == alloc.work + ), None) + if match: + # If found, increment replicas and aggregate energy/cost + match.replicas += 1 + match.energy += alloc.energy + match.cost += 
def simplify_model_allocations(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
    """
    Collapse each model's allocations to one entry per device count.

    Works on a deep copy, so the input is never modified. Allocations using
    zero GPUs are dropped. When several allocations share a device count, the
    first one is kept and only its ``replicas`` grows by the others' replicas;
    its remaining fields are left as they were. This keeps the search space of
    the optimization loop small.
    """
    simplified = deepcopy(models)
    for per_model in simplified.values():
        for model, instances in per_model.items():
            by_devices: dict[int, ModelAllocation] = {}
            for instance in instances:
                if instance.get_num_gpus() == 0:
                    continue
                kept = by_devices.get(instance.devices)
                if kept is None:
                    by_devices[instance.devices] = deepcopy(instance)
                else:
                    kept.replicas += instance.replicas
            # Re-assigning an existing key is safe while iterating items().
            per_model[model] = list(by_devices.values())
    return simplified
def find_most_energy_efficient_provisioning(
    provisioning: ProvisioningResult,
) -> int:
    """Return the index of the most energy-efficient provisioning option.

    If one option is both the lowest-energy and the lowest-latency choice
    (first occurrence of each minimum), its index is returned. Otherwise the
    option with the smallest energy * latency product wins, ties going to the
    earliest index.
    """
    energies = provisioning.energies
    latencies = provisioning.latencies

    lowest_energy = min(energies)
    lowest_latency = min(latencies)
    energy_idx = energies.index(lowest_energy)
    latency_idx = latencies.index(lowest_latency)
    if energy_idx == latency_idx:
        return energy_idx

    # No single option wins on both axes: fall back to the energy-delay product.
    products = []
    for energy, latency in zip(energies, latencies):
        products.append(energy * latency)
    return products.index(min(products))
def get_pareto_frontier_paper(
    points: np.ndarray,
    max_y: Optional[float] = None,
    max_x: Optional[float] = None,
) -> np.ndarray:
    """
    Calculate the Pareto frontier from a set of data points.

    Both coordinates are minimized. Besides the non-dominated points, the
    result contains two corner points (frontier's first x with the largest y,
    and the largest x with the frontier's last y) and, when given, extension
    points at ``max_x`` / ``max_y``.

    Args:
        points: Array of ``(x, y)`` rows; an empty array is returned as a copy.
        max_y: Optional y value to extend the frontier up to at the smallest x.
        max_x: Optional x value to extend the frontier out to at the smallest y.

    Returns:
        Unique frontier rows ordered by ascending x, then descending y.
    """
    if points.size == 0:
        return points.copy()

    # Sort by x, breaking ties by y, so a single sweep finds the frontier.
    points = points[np.lexsort((points[:, 1], points[:, 0]))]

    pareto_front = [points[0]]
    for point in points[1:]:
        if point[1] < pareto_front[-1][1]:
            pareto_front.append(point)

    # Add extreme points to the Pareto frontier
    extreme_point_0 = [pareto_front[0][0], max(points[:, 1])]
    extreme_point_1 = [max(points[:, 0]), pareto_front[-1][1]]
    pareto_front.append(extreme_point_0)
    pareto_front.append(extreme_point_1)

    if max_x is not None:
        candidate = np.array([max_x, min(points[:, 1])])
        if candidate[0] > pareto_front[-1][0] and candidate[1] <= pareto_front[-1][1]:
            pareto_front.append(candidate)
    if max_y is not None:
        candidate = np.array([min(points[:, 0]), max_y])
        if candidate[1] > pareto_front[0][1] and candidate[0] <= pareto_front[0][0]:
            pareto_front.append(candidate)

    pareto_front_np = np.array(pareto_front)
    pareto_front_np = pareto_front_np[np.lexsort((
        -pareto_front_np[:, 1],
        pareto_front_np[:, 0]))]

    # Avoid repeated points while keeping the sorted order
    _, idx = np.unique(pareto_front_np, axis=0, return_index=True)
    pareto_front_np = pareto_front_np[np.sort(idx)]

    return pareto_front_np


def get_pareto_frontier(
    ttff_list: list[float],
    costs: list[float],
    max_y: Optional[float] = None,
    max_x: Optional[float] = None,
) -> np.ndarray:
    """Return the Pareto frontier of ``(ttff, cost)`` pairs.

    Args:
        ttff_list: x coordinates (time to first frame), one per option.
        costs: y coordinates (cost), one per option.
        max_y: Optional y bound forwarded to ``get_pareto_frontier_paper``.
        max_x: Optional x bound forwarded to ``get_pareto_frontier_paper``.
    """
    points = np.array(list(zip(ttff_list, costs)))
    # Forward by keyword: the helper's positional order is (points, max_y, max_x),
    # so passing (max_x, max_y) positionally used to swap the two bounds.
    return get_pareto_frontier_paper(
        points,
        max_y=max_y,
        max_x=max_x,
    )
def area_between_frontiers(
    A: np.ndarray,
    B: np.ndarray,
    n: int = 5000
) -> np.ndarray:
    """Return the pointwise percentage gap between two frontiers.

    Despite the name, no area is integrated: the result is an array of ``n``
    values ``100 * (yB - yA) / yB``, sampled at ``n`` evenly spaced x positions
    across the x range that both frontiers cover. Each frontier is first passed
    through ``clean_frontier`` and then linearly interpolated.

    Args:
        A: Frontier rows ``(x, y)``.
        B: Frontier rows ``(x, y)``; it is the denominator of the ratio.
        n: Number of sample positions.

    Returns:
        Array of length ``n``. NOTE(review): samples where B interpolates to 0
        divide by zero - confirm callers never hit that.
    """
    A = clean_frontier(A)
    B = clean_frontier(B)
    # Restrict to the overlapping x range so neither interpolation leaves its data.
    xmin = max(A[:, 0].min(), B[:, 0].min())
    xmax = min(A[:, 0].max(), B[:, 0].max())
    xs = np.linspace(xmin, xmax, n)
    fA = interp1d(A[:, 0], A[:, 1], kind="linear")
    fB = interp1d(B[:, 0], B[:, 1], kind="linear")
    yA = fA(xs)
    yB = fB(xs)
    # An integrated-area variant is left disabled here:
    # return np.trapezoid(yB - yA, xs)
    delta = yB - yA
    return 100.0 * delta / yB
def _video_gen_work(
    total_video_seconds: int,
    num_scenes: int,
    num_subscenes: int,
    model_work_overrides: Optional[dict[Model, int | str | None]] = None,
) -> dict[Model, int]:
    """Standard model work for video-generation workflows (Podcast, Movie, etc.).

    Starts from per-model defaults and applies ``model_work_overrides``:
    ``"num_scenes"`` / ``"num_subscenes"`` are replaced by the matching count,
    ``0`` or ``None`` removes the model, any other string raises
    ``ValueError``, and any other value is stored as-is.
    """
    ret = {
        Model.GEMMA: 1,
        Model.FLUX: 1,
        Model.HF: num_subscenes,
        Model.HF_VAE: _get_num_frames(total_video_seconds, Model.HF),
        Model.FT: num_subscenes,
        Model.FT_VAE: _get_num_frames(total_video_seconds, Model.FT),
        Model.UPSCALER: _get_num_frames(total_video_seconds, Model.FT),
        Model.OTHERS: 1,
    }
    if model_work_overrides:
        for model, value in model_work_overrides.items():
            if value == "num_scenes":
                ret[model] = num_scenes
            elif value == "num_subscenes":
                ret[model] = num_subscenes
            elif isinstance(value, str):
                raise ValueError(f"Invalid model_work override value: {value}")
            elif value == 0 or value is None:
                # pop() instead of del: removing a model that has no default
                # entry is a no-op rather than a KeyError.
                ret.pop(model, None)
            else:
                ret[model] = value
    return ret


class WorkOverrideType:
    """Thin holder for a single model-work override value.

    NOTE(review): not referenced by the workflow builders in this module.
    """
    def __init__(self, value: int | str | None = None):
        self.value = value


def build_workflow_config(
    total_video_seconds: int,
    input_tokens: int,
    model_work: dict[Model, int] | None = None,
    *,
    model_work_overrides: dict[Model, int | str | None] | None = None,
    num_scenes_override: int | None = None,
    num_steps_override: dict[Model, int] | None = None,
    target_resolution: QualityLevel = QualityLevel.HIGH,
) -> WorkflowConfig:
    """Build a ``WorkflowConfig`` from base parameters, computing all derived values.

    Parameters
    ----------
    total_video_seconds:
        Duration used to derive scene, subscene and frame counts.
    input_tokens:
        Stored as ``total_input_tokens`` on the result.
    model_work:
        Explicit model-work dictionary. When ``None`` (default), standard
        video-generation work is auto-generated from the other parameters.
        When given, it is used unchanged and ``model_work_overrides`` is ignored.
    model_work_overrides:
        Key-value overrides applied on top of auto-generated ``model_work``.
        ``"num_scenes"`` / ``"num_subscenes"`` are replaced with the number of
        scenes / subscenes (i.e. per-scene work); ``0`` or ``None`` removes
        the model.
    num_scenes_override:
        Replaces the scene count derived from the duration.
    num_steps_override:
        Per-model step counts merged over the ``NUM_STEPS`` defaults.
    target_resolution:
        The target output resolution for the workflow (default HIGH). It is
        passed through as-is; this function does not drop UPSCALER for other
        resolutions, so callers remove it via ``model_work_overrides``.
        NOTE(review): confirm ``WorkflowConfig`` does not do so either.
    """
    # NOTE(review): a duration of 0 gives num_subscenes == 0 and the
    # per-subscene division below raises ZeroDivisionError.
    num_subscenes = _get_num_subscenes(total_video_seconds)

    num_scenes = _get_num_scenes(total_video_seconds)
    if num_scenes_override is not None:
        num_scenes = num_scenes_override

    num_steps = dict(NUM_STEPS)
    if num_steps_override:
        num_steps.update(num_steps_override)

    if model_work is None:
        model_work = _video_gen_work(
            total_video_seconds,
            num_scenes,
            num_subscenes,
            model_work_overrides,
        )

    return WorkflowConfig(
        total_video_seconds=total_video_seconds,
        total_scenes=num_scenes,
        total_subscenes=num_subscenes,
        total_frames={
            Model.HF: _get_num_frames(total_video_seconds, Model.HF),
            Model.FT: _get_num_frames(total_video_seconds, Model.FT),
        },
        per_subscene_frames={
            Model.HF: math.ceil(_get_num_frames(total_video_seconds, Model.HF) / num_subscenes),
            Model.FT: math.ceil(_get_num_frames(total_video_seconds, Model.FT) / num_subscenes),
        },
        num_steps=num_steps,
        hf_frames=FRAMES_OPTIONS[Model.HF],
        ft_frames=FRAMES_OPTIONS[Model.FT],
        frames_per_step_idx=FRAMES_PER_STEP_IDX,
        target_resolution=target_resolution,
        total_input_tokens=input_tokens,
        model_work=model_work,
    )
# Animated Story: Podcast + 5% more HF denoising steps (LoRA overhead)
OVERHEAD_PCT = 5
ANIMATED_STORY_WORKFLOW = build_workflow_config(
    total_video_seconds=WORKFLOW_DURATIONS["story"],
    input_tokens=TOTAL_INPUT_TOKENS,
    num_steps_override={
        # Scale the step count by (1 + pct/100), truncating to whole steps.
        # The previous form `N * 1 + pct / 100` only added 0.05 to N, which
        # int() truncated straight back to N, so no overhead was applied.
        Model.HF: int(NUM_STEPS[Model.HF] * (1 + OVERHEAD_PCT / 100.0))
    },
)
Model.OTHERS: 2, # Double audio work + }, +) + +# Editing: like Podcast but without GEMMA, FLUX, or OTHERS +EDITING_WORKFLOW = build_workflow_config( + total_video_seconds=WORKFLOW_DURATIONS["editing"], + input_tokens=TOTAL_INPUT_TOKENS, + model_work_overrides={ + Model.GEMMA: None, + Model.FLUX: None, + Model.OTHERS: None, + } +) + +# Video Chat: like Podcast but only 5 seconds of output video +VIDEO_CHAT_WORKFLOW = build_workflow_config( + total_video_seconds=WORKFLOW_DURATIONS["chat"], + input_tokens=TOTAL_INPUT_TOKENS, +) + + +WORKFLOWS = { + "podcast": PODCAST_WORKFLOW, + "chat": VIDEO_CHAT_WORKFLOW, + "dubbing": DUBBING_WORKFLOW, + "editing": EDITING_WORKFLOW, + "lecture": LECTURE_WORKFLOW, + "movie": MOVIE_WORKFLOW, + "short": SHORTS_WORKFLOW, + "slide": SLIDE_PERSONA_WORKFLOW, + "story": ANIMATED_STORY_WORKFLOW, +} diff --git a/streamwise/allocator_bridge.py b/streamwise/allocator_bridge.py new file mode 100644 index 00000000..44dd2512 --- /dev/null +++ b/streamwise/allocator_bridge.py @@ -0,0 +1,256 @@ +""" +Bridge between the model provisioner's allocator output and StreamWise pod deployment. + +Translates ModelAllocation results (abstract Model enum + GPU counts) into concrete +container deployment parameters compatible with pod_manager.add_pod(). +""" + +from __future__ import annotations + +import os +import sys + +# Add simulator/ to sys.path so foundation modules are importable. 
+_SIMULATOR_DIR = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "simulator") +) +if _SIMULATOR_DIR not in sys.path: + sys.path.insert(0, _SIMULATOR_DIR) + +from dataclasses import dataclass +from typing import Optional + +from sim_types import GPUType +from sim_types import Model +from sim_types import Result + +from auto_model_allocator import AutoModelAllocator +from data_loading import load_latency_data +from model_provisioner.policies import STREAMWISE_POLICY +from workflows import WORKFLOWS + + +# Mapping from simulator Model enum to concrete container names used by pod_manager. +# Some Model entries map to multiple containers (e.g., OTHERS -> kokoro + yolo). +MODEL_TO_CONTAINERS: dict[Model, list[str]] = { + Model.GEMMA: ["gemma"], + Model.FLUX: ["flux"], + Model.HF: ["hunyuanframepackf1"], + Model.HF_VAE: ["hunyuanframepackvae"], + Model.FT: ["fantasytalking"], + Model.FT_VAE: [], # FT_VAE is handled within fantasytalking container + Model.UPSCALER: ["realesrgan"], + Model.OTHERS: ["kokoro", "yolo"], +} + +# Default CPU/memory/storage for each container when deployed via auto-deploy. 
+# Format: (cpu_cores, memory_gib, ephemeral_storage_gib) +CONTAINER_RESOURCES: dict[str, tuple[int, int, int]] = { + "gemma": (16, 192, 64), + "flux": (12, 128, 64), + "hunyuanframepackf1": (24, 128, 64), + "hunyuanframepackvae": (4, 32, 16), + "fantasytalking": (12, 192, 64), + "realesrgan": (4, 32, 16), + "kokoro": (2, 8, 16), + "yolo": (4, 8, 16), +} + +# GPU type string used by pod_manager (lowercase) +GPU_TYPE_TO_POD_STR: dict[GPUType, str] = { + GPUType.A100: "a100", + GPUType.H100: "h100", + GPUType.H200: "h200", + GPUType.GB200: "gb200", +} + +# MIG containers: these use a MIG slice instead of a full GPU +MIG_CONTAINERS: dict[str, str] = { + "kokoro": "1g.10gb", + "yolo": "1g.10gb", + "realesrgan": "1g.10gb", +} + +# Mapping from StreamWise app name to simulator workflow key +APP_TO_WORKFLOW: dict[str, str] = { + "streamcast": "podcast", + "streampersona": "slide", + "streamchat": "chat", + "streamshort": "short", + "streammovie": "movie", + "streamanimate": "story", + "streamlecture": "lecture", + "streamdub": "dubbing", + "streamedit": "editing", +} + + +@dataclass +class DeploymentSpec: + """A single container deployment specification.""" + container_name: str + cpu: int + memory_gib: int + ephemeral_storage_gib: int + gpu: int + gpu_type: Optional[str] + mig_profile: Optional[str] + + +@dataclass +class DeploymentPlan: + """Complete deployment plan produced by the auto-allocator.""" + specs: list[DeploymentSpec] + result: Result + workflow_name: str + gpu_budget: dict[str, int] + + +def _get_data_dir() -> str: + """Get the path to the simulator data directory.""" + default_path = os.path.join(os.path.dirname(__file__), "..", "simulator", "data") + return os.getenv("SIMULATOR_DATA_DIR", default_path) + + +def get_available_workflows() -> list[str]: + """Return list of available workflow names for the UI.""" + return list(APP_TO_WORKFLOW.keys()) + + +def get_available_gpu_types() -> list[str]: + """Return list of available GPU type strings for the UI.""" 
def run_allocator(
    gpu_budget: dict[str, int],
    workflow_name: str,
) -> DeploymentPlan:
    """
    Run the model allocator and return a deployment plan.

    Args:
        gpu_budget: GPU counts keyed by GPU type string; each key must be a
            valid ``GPUType`` value. Entries with a count of 0 or less are ignored.
        workflow_name: StreamWise app name (e.g., "streamcast").

    Returns:
        DeploymentPlan with concrete container deployment specs.

    Raises:
        ValueError: If workflow_name or a GPU type is unknown, or if the
            positive counts add up to fewer than 8 GPUs.
    """
    # Validate workflow
    workflow_key = APP_TO_WORKFLOW.get(workflow_name)
    if workflow_key is None:
        raise ValueError(
            f"Unknown workflow '{workflow_name}'. "
            f"Available: {list(APP_TO_WORKFLOW.keys())}")

    workflow = WORKFLOWS[workflow_key]

    # Parse GPU budget into GPUType enum
    num_gpus: dict[GPUType, int] = {}
    for gpu_str, count in gpu_budget.items():
        try:
            gpu_type = GPUType(gpu_str)
        except ValueError as err:
            # Chain explicitly so the traceback shows the enum lookup as the cause.
            raise ValueError(
                f"Unknown GPU type '{gpu_str}'. "
                f"Available: {[g.value for g in GPUType]}") from err
        if count > 0:
            num_gpus[gpu_type] = count

    # An empty budget sums to 0, so this single check also covers "no GPUs".
    if sum(num_gpus.values()) < 8:
        raise ValueError("Total GPU budget must be at least 8 GPUs.")

    # Load latency data and run allocator
    data_dir = _get_data_dir()
    latency_data = load_latency_data(data_dir=data_dir)

    allocator = AutoModelAllocator(
        workflow=workflow,
        latency_data=latency_data,
        policy=STREAMWISE_POLICY,
    )

    result = allocator.allocate(num_gpus=num_gpus, verbose=False)

    # Convert result to deployment specs
    specs = result_to_deployment_specs(result)

    return DeploymentPlan(
        specs=specs,
        result=result,
        workflow_name=workflow_name,
        gpu_budget=gpu_budget,
    )
def deployment_plan_to_json(plan: DeploymentPlan) -> dict:
    """Convert a DeploymentPlan into a dict of JSON-friendly values.

    Times are rounded to 2 decimals and cost to 4; ``gpus_used`` is re-keyed
    by GPU type value, and every spec becomes a plain dict of its fields.
    """
    outcome = plan.result
    metrics = {
        "total_time_s": round(outcome.total_time_s, 2),
        "ttff_s": round(outcome.ttff_s, 2),
        "cost": round(outcome.cost, 4),
        "gpus_used": {
            kind.value: used
            for kind, used in outcome.gpus_used.items()
        },
    }

    # Field order here fixes the key order of every serialized spec.
    spec_fields = (
        "container_name",
        "cpu",
        "memory_gib",
        "ephemeral_storage_gib",
        "gpu",
        "gpu_type",
        "mig_profile",
    )
    serialized_specs = []
    for spec in plan.specs:
        serialized_specs.append({name: getattr(spec, name) for name in spec_fields})

    return {
        "workflow_name": plan.workflow_name,
        "gpu_budget": plan.gpu_budget,
        "metrics": metrics,
        "specs": serialized_specs,
    }
+ +Contains greedy, naive, MILP, HexGen, and Helix allocation strategies. +The foundation types (sim_types, constants, models, etc.) live in simulator/. +""" +import os +import sys + +# Add simulator/ to sys.path so policy files can import foundation modules. +_SIMULATOR_DIR = os.path.normpath( + os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "simulator") +) +if _SIMULATOR_DIR not in sys.path: + sys.path.insert(0, _SIMULATOR_DIR) diff --git a/streamwise/model_provisioner/greedy.py b/streamwise/model_provisioner/greedy.py new file mode 100644 index 00000000..8c1a1dd0 --- /dev/null +++ b/streamwise/model_provisioner/greedy.py @@ -0,0 +1,573 @@ +""" +Greedy algorithm for the StreamWise workflow allocation problem. +""" + +from __future__ import annotations + +import logging + +from tabulate import tabulate + +from typing import Optional + +from operator import itemgetter + +from constants import NUM_GPUS_PER_SERVER +from constants import SECONDS_IN_MINUTE +from constants import SECONDS_IN_HOUR + +from sim_types import Result +from sim_types import GPUType +from sim_types import WorkflowConfig +from sim_types import LatencyData +from sim_types import PowerData +from sim_types import Model +from sim_types import ModelAllocation +from sim_types import Policy +from sim_types import Solver + +from utils import simplify_model_allocations + +from evaluator import calc_used_gpus +from evaluator import evaluate_model_allocation + +from model_allocator import ModelAllocator + +from .policies import STREAMWISE_POLICY +from .policies import MAX_ITERATIONS +from .policies import USE_ALL_GPUS + +from actions import gen_actions +from actions import choose_action +from actions import apply_action + + +class GreedyAllocator(ModelAllocator): + """ + Greedy allocator that iteratively applies the best action. 
+ """ + def __init__( + self, + workflow: WorkflowConfig, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + policy: Policy = STREAMWISE_POLICY, + ) -> None: + super().__init__( + workflow, + latency_data, + power_data, + policy, + ) + assert self.policy.solver in {Solver.GREEDY, Solver.HEXGEN} + + def allocate( + self, + num_gpus: dict[GPUType, int], + verbose: bool = False, + # Greedy policy parameters + allow_removal: bool = False, + allow_merging: bool = False, + look_ahead_replicas: int = 3, + ) -> Result: + total_gpus = sum(num_gpus.values()) + assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})" + + gpu_types = [ + gpu_type + for gpu_type, count in num_gpus.items() + if count > 0 + ] + assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})" + gpu_type1 = gpu_types[0] + + if len(gpu_types) == 1 and num_gpus[gpu_type1] == 8: + # 8 x GPUs + return self._pick_from_single_server( + gpu_type=gpu_type1, + verbose=verbose, + ) + + if len(gpu_types) == 1: + # More than 8 x GPUs + return self._pick_from_single_device_mapping( + num_gpus.get(gpu_type1, 0), + gpu_type=gpu_type1, + verbose=verbose, + allow_removal=allow_removal, + allow_merging=allow_merging, + look_ahead_replicas=look_ahead_replicas, + ) + + # Mixed setup of GPU types (e.g., A100 and H100) + return self._pick_from_both_devices_mapping( + num_gpus, + verbose=verbose, + allow_removal=allow_removal, + allow_merging=allow_merging, + look_ahead_replicas=look_ahead_replicas, + ) + + def _pick_from_both_devices_mapping( + self, + num_gpus: dict[GPUType, int], + verbose: bool = False, + allow_removal: bool = False, + allow_merging: bool = False, + look_ahead_replicas: int = 3, + ) -> Result: + """ + Calculate based on two GPU types. 
+ """ + gpu_types = list(num_gpus.keys()) + assert len(gpu_types) == 2 + assert len(num_gpus) == 2 + gpu_type1 = gpu_types[0] + gpu_type2 = gpu_types[1] + assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1] + assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2] + + # Initialize allocations with minimal setup + models = self._init_both_devices_models(gpu_type1, gpu_type2) + + remaining_gpus = {} + for gpu_type in num_gpus.keys(): + remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]}) + + # Optimization loop + if verbose: + evaluate_model_allocation( + models=models, + num_gpus=num_gpus, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=True, + ) + self._print_iteration(0, models, num_gpus) + + it = 1 + prev_metric = None + switch_objective = False + while sum(remaining_gpus.values()) > 0: + # Calculate current iteration times + evaluate_model_allocation( + models=models, + num_gpus=num_gpus, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=False, + ) + + # Calculate potential actions for each optimization option + actions = gen_actions( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + num_gpus=num_gpus, + models=models, + policy=self.policy, + allow_removal=allow_removal, + allow_merging=allow_merging, + look_ahead_replicas=look_ahead_replicas, + ) + + if not actions: + logging.debug(f"No more actions possible after {it} iterations for {self.policy}.") + break + + best_action = choose_action(actions, self.policy.objective, switch_objective=switch_objective) + + if not best_action: + logging.debug("No actions selected.") + break + + new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective) + + if self.policy.objective.is_monotonic() and prev_metric is not None 
and new_metric >= prev_metric: + msg = f"No improvement after {it} iterations for {self.policy}." + msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}." + if verbose: + print(msg) + logging.debug(msg) + if not USE_ALL_GPUS: + logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.") + break + switch_objective = True + + prev_metric = new_metric + + models = apply_action(best_action, models=models) + + models = simplify_model_allocations(models) + + remaining_gpus.clear() + for gpu_type in num_gpus.keys(): + remaining_gpus[gpu_type] = num_gpus[gpu_type] - calc_used_gpus({gpu_type: models[gpu_type]}) + + if verbose: + self._print_iteration(it, models, num_gpus) + print(f"{len(actions)} actions:") + for action in actions: + if action == best_action: + print(f"* {action} (best)") + else: + print(f" {action}") + print(f"Metric: {new_metric:.2f}") + print("Remaining devices:") + for gpu_type in remaining_gpus.keys(): + print(f" {remaining_gpus[gpu_type]} x {gpu_type.value}") + + it += 1 + if it > MAX_ITERATIONS: + logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). 
Stopping optimization loop.") + break + + # Adjust for no disaggregation + if not self.policy.is_disaggregated(Model.HF): + for models_gpu in models.values(): + for instance_id in range(len(models_gpu[Model.HF_VAE])): + assert models_gpu[Model.HF_VAE][instance_id].get_num_gpus() == 0, \ + "HF_VAE must have 0 GPUs when HF disaggregation is disabled" + if not self.policy.is_disaggregated(Model.FT): + for models_gpu in models.values(): + for instance_id in range(len(models_gpu[Model.FT_VAE])): + assert models_gpu[Model.FT_VAE][instance_id].get_num_gpus() == 0, \ + "FT_VAE must have 0 GPUs when FT disaggregation is disabled" + + # Final calculations + result = evaluate_model_allocation( + models=models, + num_gpus=num_gpus, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=True, + ) + + if verbose: + self._print_final_allocation( + models=models, + used_devices=result.gpus_used, + total_devices={ + gpu_type1: num_gpus.get(gpu_type1, 0), + gpu_type2: num_gpus.get(gpu_type2, 0), + }, + power_data=self.power_data, + total_time_s=result.total_time_s, + ttff_s=result.ttff_s, + first_chunk_time=result.first_chunk_time, + tbf_s=result.tbf_s, + total_energy=result.total_energy if self.power_data else 0.0, + cost=result.cost, + ) + + assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \ + f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}" + assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \ + f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}" + + return Result( + total_time_s=result.total_time_s, + models=models, + gpus_used=result.gpus_used, + ttff_s=result.ttff_s, + tbf_s=result.tbf_s, + total_energy=result.total_energy if self.power_data else 0.0, + cost=result.cost, + ) + + def _pick_from_single_server( + self, + gpu_type: GPUType, + verbose: bool = False, + ) -> Result: + """ + The minimal 
setup with a servers with a single server (8 GPUs or 4 for GB200). + No parallelism across scenes/subscenes. + """ + + # Number of devices + num_gpus = NUM_GPUS_PER_SERVER[gpu_type] + models = self._init_single_server_models(gpu_type) + + result = evaluate_model_allocation( + models=models, + num_gpus={gpu_type: num_gpus}, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=True, + ) + + if verbose: + model_device = models[gpu_type] + print_data = [ + [Model.GEMMA.value, round(model_device[Model.GEMMA][0].time, 2)], + [Model.FLUX.value, round(model_device[Model.FLUX][0].time, 2)], + [Model.HF.value, round(model_device[Model.HF][0].time, 2)], + [Model.HF_VAE.value, round(model_device[Model.HF_VAE][0].time, 2)], + [Model.FT.value, round(model_device[Model.FT][0].time, 2)], + [Model.FT_VAE.value, round(model_device[Model.FT_VAE][0].time, 2)], + ] + if self.policy.use_upscaler: + print_data.append([Model.UPSCALER.value, round(model_device[Model.UPSCALER][0].time, 2)]) + print(f"Total time: {result.total_time_s:.2f} seconds") + print(tabulate( + print_data, + headers=["Model", "Time (seconds)"], + tablefmt="pretty", + colalign=["left", "right"] + )) + self._print_final_allocation( + models=models, + used_devices={gpu_type: num_gpus}, + total_devices={gpu_type: num_gpus}, + power_data=self.power_data, + total_time_s=result.total_time_s, + ttff_s=result.ttff_s, + first_chunk_time=result.first_chunk_time, + tbf_s=result.tbf_s, + total_energy=result.total_energy if self.power_data else 0.0, + cost=result.cost, + ) + + return Result( + total_time_s=result.total_time_s, + models=models, + gpus_used={gpu_type: num_gpus}, + ttff_s=result.ttff_s, + tbf_s=result.tbf_s, + total_energy=result.total_energy if self.power_data else 0.0, + cost=result.cost, + ) + + def _pick_from_single_device_mapping( + self, + num_gpus: int, + gpu_type: GPUType, + verbose: bool = False, + allow_removal: bool = 
False, + allow_merging: bool = False, + look_ahead_replicas: int = 3, + ) -> Result: + """ + Calculate time and energy based on a single GPU type. + """ + assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type] + latency_gpu_data = self.latency_data[gpu_type] + assert gpu_type == latency_gpu_data.gpu_type + + if self.power_data is not None: + power_gpu_data = self.power_data[gpu_type] + assert gpu_type == power_gpu_data.gpu_type + + # Initialize allocations + models = self._init_single_device_models(gpu_type) + + remaining_gpus = num_gpus - calc_used_gpus(models) + + assert 0 <= remaining_gpus <= num_gpus + + # Optimization loop + it = 0 + prev_metric = None + switch_objective = False + while remaining_gpus > 0: + # Calculate current iteration times + evaluate_model_allocation( + models=models, + num_gpus={gpu_type: num_gpus}, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=False, + ) + + # Calculate potential actions for each optimization option + actions = gen_actions( + num_gpus={gpu_type: num_gpus}, + latency_data=self.latency_data, + power_data=self.power_data, + workflow=self.workflow, + models=models, + policy=self.policy, + allow_removal=allow_removal, + allow_merging=allow_merging, + look_ahead_replicas=look_ahead_replicas, + ) + + if not actions: + logging.debug(f"No more actions possible after {it} iterations for {self.policy}") + break + + best_action = choose_action( + actions, + self.policy.objective, + switch_objective=switch_objective) + + if not best_action: + logging.debug("No action selected.") + break + + new_metric = best_action.get_metric(self.policy.objective, switch_objective=switch_objective) + if self.policy.objective.is_monotonic() and prev_metric is not None and new_metric >= prev_metric: + msg = f"No improvement from actions after {it} iterations for {self.policy}." 
+ msg += f" Best action: {best_action}, metric: {new_metric:.2f} >= previous {prev_metric:.2f}." + if verbose: + print(msg) + logging.debug(msg) + if not USE_ALL_GPUS: + logging.debug("Not using all GPUs as USE_ALL_GPUS is False. Stopping optimization loop.") + break + switch_objective = True + + models = apply_action(best_action, models) + + models = simplify_model_allocations(models) + + remaining_gpus = num_gpus - calc_used_gpus(models) + prev_metric = new_metric + + if verbose: + self._print_iteration(it, models, {gpu_type: num_gpus}) + print(f"Metric: {new_metric:.2f}") + print(f"{len(actions)} actions:") + for action in actions: + if action == best_action: + print(f" * {action} (best)") + else: + print(f" {action}") + print(f"Applied: {best_action}") + print(f"Remaining devices: {remaining_gpus}x{gpu_type}") + + it += 1 + if it > MAX_ITERATIONS: + logging.debug(f"Reached max iterations ({MAX_ITERATIONS}). Stopping optimization loop.") + break + + result = evaluate_model_allocation( + models=models, + num_gpus={gpu_type: num_gpus}, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=True, + ) + + if verbose: + self._print_final_allocation( + models=models, + used_devices=result.gpus_used, + total_devices={gpu_type: num_gpus}, + power_data=self.power_data, + total_time_s=result.total_time_s, + ttff_s=result.ttff_s, + first_chunk_time=result.first_chunk_time, + tbf_s=result.tbf_s, + total_energy=result.total_energy if self.power_data else 0.0, + cost=result.cost, + ) + + if not self.policy.is_disaggregated(Model.HF): + if models[gpu_type][Model.HF_VAE]: + assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \ + "HF_VAE must have 0 GPUs when HF disaggregation is disabled" + if not self.policy.is_disaggregated(Model.FT): + if models[gpu_type][Model.FT_VAE]: + assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \ + "FT_VAE must have 0 GPUs when FT disaggregation 
is disabled" + num_gpus_used = result.gpus_used[gpu_type] + assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}" + + return Result( + total_time_s=result.total_time_s, + models=models, + gpus_used={gpu_type: num_gpus_used}, + gpus_total={gpu_type: num_gpus}, + ttff_s=result.ttff_s, + tbf_s=result.tbf_s, + total_energy=result.total_energy if self.power_data else 0.0, + cost=result.cost, + ) + + def _print_iteration( + self, + it: int, + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + num_gpus: dict[GPUType, int], + ) -> None: + print(f"--- Iteration {it} ---") + + for gpu_type in models.keys(): + total_gpus = calc_used_gpus({gpu_type: models[gpu_type]}) + print(f"Current {gpu_type.value} allocation: {total_gpus}/{num_gpus[gpu_type]} GPUs") + for model in Model: + for model_instance in models[gpu_type][model]: + if model_instance.get_num_gpus() > 0: + print(f" {model.value:10s}:\t{model_instance}") + + # Find the bottleneck stage + stage_times: dict[Model, float] = {} + ttff_times: dict[Model, float] = {} + for model_name in Model: + times = [] + times_first = [] + for gpu_type in models.keys(): + for model_alloc in models[gpu_type][model_name]: + times.append(model_alloc.time) + times_first.append(model_alloc.time_first) + stage_times[model_name] = max(times) if times else 0.0 + ttff_times[model_name] = max(times_first) if times_first else 0.0 + + bottleneck_stage, bottleneck_time = max( + stage_times.items(), + key=itemgetter(1) + ) + bottleneck_ttff_stage, bottleneck_ttff_time = max( + ttff_times.items(), + key=itemgetter(1) + ) + print(f"Bottleneck: {bottleneck_stage} ({bottleneck_time:.2f}s)") + print(f"Bottleneck TTFF: {bottleneck_ttff_stage} ({bottleneck_ttff_time:.2f}s)") + # bottleneck stage is not necessarily the stage with the + # highest potential gain from scaling up/out + + def _print_final_allocation( + self, + models: dict[GPUType, dict[Model, list[ModelAllocation]]], + used_devices: dict[GPUType, int], + 
total_devices: dict[GPUType, int], + power_data: Optional[PowerData], + total_time_s: float, + ttff_s: float, + first_chunk_time: float, + tbf_s: float, + total_energy: float, + cost: float, + ) -> None: + print("=== FINAL ALLOCATION ===") + print("Total devices used/available:") + for gpu_type, total_device in total_devices.items(): + used_device = used_devices[gpu_type] + print(f" {gpu_type.value}: {used_device}/{total_device}") + print("Model allocations:") + for gpu_type in models.keys(): + print(f" {gpu_type.value} ({used_devices[gpu_type]} used):") + for model in Model: + for model_alloc in models[gpu_type][model]: + print(f" {model.value:10s}:\t{model_alloc}") + print(f"Total time: {total_time_s:.2f} seconds ({total_time_s / SECONDS_IN_MINUTE:.2f} minutes)") + print(f"TTFF: {ttff_s:.2f} seconds") + print(f"First chunk time: {first_chunk_time:.2f} seconds") + print(f"TBF: {tbf_s:.2f} seconds") + print(f"Total cost: ${cost:.2f}") + if power_data is not None: + print(f"Total energy: {total_energy:.2f} Ws ({total_energy / SECONDS_IN_HOUR / 1000:.2f} kWh)") diff --git a/streamwise/model_provisioner/helix.py b/streamwise/model_provisioner/helix.py new file mode 100644 index 00000000..e8fededf --- /dev/null +++ b/streamwise/model_provisioner/helix.py @@ -0,0 +1,403 @@ +""" +Helix algorithm for the StreamWise workflow allocation problem. + +Reference: https://github.com/Thesys-lab/Helix-ASPLOS25 + +Helix optimizes models one-by-one following MODEL_ORDER, using MILP +for each model's resource allocation. After each model reaches convergence +(solver optimality or per-model time limit), its allocation is fixed and the +remaining GPU budget is passed to the next model. + +Design rationale: + HelixAllocator does NOT inherit from MILPAllocator because the parent's + allocate() builds a single joint MILP for all models simultaneously. + Instead, HelixAllocator extends ModelAllocator and *composes* + MILPAllocator instances — one per model in the workflow. 
+ + For each model, a per-model WorkflowConfig is created where only the + target model has non-zero work (all others set to 0). The existing MILP + constraints (is_active <= work, gpus <= num_gpus * is_active) naturally + force 0 GPU allocation for those 0-work models, so no changes to + milp.py are required. +""" + +from __future__ import annotations + +import logging + +from dataclasses import replace +from typing import Optional + +from sim_types import Result +from sim_types import GPUType +from sim_types import WorkflowConfig +from sim_types import PowerData +from sim_types import LatencyData +from sim_types import Model +from sim_types import ModelAllocation +from sim_types import Policy +from sim_types import Solver +from sim_types import MODEL_ORDER + +from model_allocator import ModelAllocator + +from evaluator import evaluate_model_allocation + +from .milp import MILPAllocator + +from .policies import HELIX_POLICY +from .policies import MAX_DEVICES + +from constants import DEVICE_OPTIONS + + +# Default per-model MILP solver time limit in seconds. +# Each model gets this long to converge before the solver moves on. +DEFAULT_PER_MODEL_TIME_LIMIT = 30 + + +def _compute_per_model_gpu_budget( + model_order: list[Model], + num_gpus: dict[GPUType, int], + workflow: WorkflowConfig, +) -> dict[Model, dict[GPUType, int]]: + """Compute a per-model GPU budget so every model gets a fair share. + + Budget is proportional to each model's ``MAX_DEVICES`` weight (capped + by the model's actual maximum useful device count from ``DEVICE_OPTIONS``). + Models not in ``MAX_DEVICES`` (e.g. OTHERS, UPSCALER) receive a minimum + allocation of ``min(DEVICE_OPTIONS)`` GPUs. + + The allocations are floored per model, and any remainder is distributed + round-robin starting from the first model. + + Returns: + Mapping ``model -> {gpu_type -> max_gpus}`` that the model may use. 
+ """ + # Effective weight per model (max useful devices) + weights: dict[Model, int] = {} + for m in model_order: + if workflow.model_work.get(m, 0) == 0: + continue + if m in MAX_DEVICES: + weights[m] = MAX_DEVICES[m] + else: + # Models not in MAX_DEVICES (OTHERS, UPSCALER) get min allocation + weights[m] = min(DEVICE_OPTIONS.get(m, [1])) + + total_weight = sum(weights.values()) + if total_weight == 0: + # Fallback: equal split + total_weight = len(weights) or 1 + weights = {m: 1 for m in weights} + + budget: dict[Model, dict[GPUType, int]] = {} + for gpu_type, total in num_gpus.items(): + # Floor allocation per model + allocated = 0 + per_model: dict[Model, int] = {} + for m in model_order: + if m not in weights: + continue + share = int(total * weights[m] / total_weight) + # Ensure at least 1 GPU per model (if GPUs available) + share = max(share, 1) if total - allocated >= 1 else 0 + per_model[m] = share + allocated += share + + # Distribute remainder round-robin + remainder = total - allocated + idx = 0 + models_list = [m for m in model_order if m in per_model] + while remainder > 0 and models_list: + m = models_list[idx % len(models_list)] + per_model[m] += 1 + remainder -= 1 + idx += 1 + + for m in model_order: + if m not in per_model: + continue + if m not in budget: + budget[m] = {} + budget[m][gpu_type] = per_model[m] + + return budget + + +class HelixAllocator(ModelAllocator): + """ + Helix-style allocator that optimizes models one at a time + using MILP, sequentially following MODEL_ORDER. + + Reference: https://github.com/Thesys-lab/Helix-ASPLOS25 + + Key approach: + 1. For each model in MODEL_ORDER, create a per-model MILP sub-problem + where only the target model has non-zero work. + 2. Solve the MILP with the remaining GPU budget and a per-model time limit. + 3. Fix the allocation for that model and subtract used GPUs. + 4. Move to the next model with the remaining GPU budget. + 5. Combine all per-model allocations into the final result. 
+ + The HelixAllocator uses composition (not inheritance) with MILPAllocator, + creating a separate MILPAllocator instance for each model's sub-problem. + This avoids modifying the joint MILP formulation and allows per-model + solver configurations. + """ + + def __init__( + self, + workflow: WorkflowConfig, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + policy: Policy = HELIX_POLICY, + ) -> None: + super().__init__( + workflow, + latency_data, + power_data, + policy, + ) + assert self.policy.solver == Solver.HELIX + + def allocate( + self, + num_gpus: dict[GPUType, int], + verbose: bool = False, + per_model_time_limit: int = DEFAULT_PER_MODEL_TIME_LIMIT, + milp_solver: Solver = Solver.HIGHS, + ) -> Result: + """ + Allocate resources model-by-model following MODEL_ORDER. + + For each model, a MILPAllocator is created with a workflow where + only the target model has non-zero work. The MILP solver optimizes + the allocation for that model within the remaining GPU budget. + + Args: + num_gpus: Available GPUs per type. + verbose: If True, print per-model allocation details. + per_model_time_limit: Time limit (seconds) for each per-model MILP solve. + milp_solver: MILP solver backend to use (GUROBI or HIGHS). + + Returns: + Combined Result across all models. + """ + assert milp_solver in (Solver.GUROBI, Solver.HIGHS), \ + f"milp_solver must be GUROBI or HIGHS, got {milp_solver}" + + model_order = self.workflow.get_model_order() + if not self.policy.use_upscaler and Model.UPSCALER in model_order: + # Remove UPSCALER from model_order if not using upscaler to avoid unnecessary MILP solve + model_order.remove(Model.UPSCALER) + remaining_gpus = dict(num_gpus) + + # ---- GPU budget partitioning ---- + # Pre-compute a per-model GPU budget proportional to MAX_DEVICES + # so that early models cannot starve later ones. Unused GPUs from + # one model roll over to subsequent models. 
+ gpu_budget = _compute_per_model_gpu_budget( + model_order, num_gpus, self.workflow, + ) + + if verbose: + logging.info("Helix GPU budget per model:") + for m in model_order: + if m in gpu_budget: + logging.info(f" {m.value}: {gpu_budget[m]}") + + # Accumulated per-model allocations and metrics + all_model_allocations: dict[GPUType, dict[Model, list[ModelAllocation]]] = {} + total_makespan = 0.0 + total_ttff = 0.0 + total_cost = 0.0 + total_energy = 0.0 + total_gpus_used: dict[GPUType, int] = {gt: 0 for gt in num_gpus} + + for model in model_order: + work = self.workflow.model_work.get(model, 0) + if work == 0: + continue + + # Skip VAE models when disaggregation is disabled for the parent. + # Their latency is folded into the parent model's time calculation. + if model == Model.HF_VAE and not self.policy.is_disaggregated(Model.HF): + continue + if model == Model.FT_VAE and not self.policy.is_disaggregated(Model.FT): + continue + + # Check if any GPUs remain + if all(v <= 0 for v in remaining_gpus.values()): + logging.warning( + f"Helix: No GPUs remaining for {model.value}. Skipping.") + continue + + # Filter out GPU types with 0 remaining. + # Cap per-model GPUs to the budget so later models are not starved. + model_budget = gpu_budget.get(model, {}) + active_gpus = { + gt: min(count, model_budget.get(gt, count)) + for gt, count in remaining_gpus.items() + if count > 0 and (gt not in model_budget or model_budget[gt] > 0) + } + + if verbose: + logging.info( + f"--- Helix: Optimizing {model.value} " + f"(work={work}) with remaining GPUs: {active_gpus} ---" + ) + + # ---- build per-model workflow ---- + # Only the target model has work; other models are excluded from + # model_work so the MILP only builds variables/constraints for it. 
+ per_model_work = {model: self.workflow.model_work[model]} + per_model_workflow = replace( + self.workflow, + model_work=per_model_work, + ) + + # ---- build MILP-compatible policy ---- + # The inner MILPAllocator requires solver ∈ {GUROBI, HIGHS}. + # Force disaggregation / use_upscaler flags so that the inner + # MILP's ``model_names`` list includes VAE / UPSCALER when those + # are the target model. Without this, the MILP would construct + # an empty model set and produce a trivial (infeasible) problem. + disag = {} # dict(self.policy.disaggregation) + if model == Model.HF_VAE and self.policy.is_disaggregated(Model.HF): + disag[Model.HF] = True + if model == Model.FT_VAE and self.policy.is_disaggregated(Model.FT): + disag[Model.FT] = True + milp_policy = Policy( + name=self.policy.name, + gpu_cost=self.policy.gpu_cost, + objective=self.policy.objective, + # disaggregation=self.policy.disaggregation or model == Model.HF_VAE, + disaggregation=disag, + use_upscaler=self.policy.use_upscaler or model == Model.UPSCALER, + hardware=self.policy.hardware, + solver=milp_solver, + ) + + # ---- solve per-model MILP ---- + milp_allocator = MILPAllocator( + workflow=per_model_workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=milp_policy, + ) + + result = milp_allocator.allocate( + num_gpus=active_gpus, + verbose=verbose, + time_limit=per_model_time_limit, + # Use running_cost=True for linear cost formulation (HiGHS-compatible) + running_cost=(milp_solver == Solver.HIGHS), + # Skip server constraint: per-model allocations don't need + # to be multiples of NUM_GPUS_PER_SERVER. + skip_server_constraint=True, + ) + + if result.total_time_s == 0.0 and not result.models: + logging.warning( + f"Helix: MILP failed for {model.value}. 
Skipping.") + continue + + # ---- record allocations & snap devices to DEVICE_OPTIONS ---- + # The MILP constrains devices to DEVICE_OPTIONS, but floating-point + # precision in the solver can occasionally produce off-by-one values + # (e.g. 31 instead of 32). Snap each replica to the nearest valid + # option, adjusting the GPU accounting so we don't exceed the total + # budget passed to evaluate_model_allocation at the end. + for gpu_type, model_dict in result.models.items(): + if gpu_type not in all_model_allocations: + all_model_allocations[gpu_type] = {} + for m_name, allocs in model_dict.items(): + for alloc in allocs: + valid_devices = DEVICE_OPTIONS.get(m_name, [1]) + if alloc.devices not in valid_devices: + nearest = min(valid_devices, key=lambda d: abs(d - alloc.devices)) + diff = nearest - alloc.devices # positive = round up + gpu_avail = remaining_gpus.get(gpu_type, 0) - result.gpus_used.get(gpu_type, 0) + if diff > 0 and gpu_avail < diff: + # Not enough spare GPUs to round up; round down instead + nearest = max( + (d for d in valid_devices if d <= alloc.devices), + default=valid_devices[0], + ) + diff = nearest - alloc.devices + logging.info( + f"Helix: snapping {m_name.value} from " + f"{alloc.devices} to {nearest} devices " + f"(solver precision fix, diff={diff:+d})") + # Adjust GPU accounting for this model's result + result.gpus_used[gpu_type] = result.gpus_used.get(gpu_type, 0) + diff + alloc.devices = nearest + all_model_allocations[gpu_type][m_name] = allocs + + # ---- accumulate metrics ---- + total_makespan += result.total_time_s + total_ttff += result.ttff_s + total_cost += result.cost + total_energy += result.total_energy + if verbose: + print(f'Model {model.value} - Time: {result.total_time_s:.2f}s,' + f'TTFF: {result.ttff_s:.2f}s, Cost: ${result.cost:.2f}') + print(f'Total cost so far: ${total_cost:.2f}, Total time so far: {total_makespan:.2f}s,' + f'Total TTFF so far: {total_ttff:.2f}s') + print(f'GPUs allocated for {model.value}: 
{result.gpus_used}') + + # ---- subtract used GPUs ---- + for gpu_type, used in result.gpus_used.items(): + remaining_gpus[gpu_type] = remaining_gpus.get(gpu_type, 0) - used + total_gpus_used[gpu_type] = total_gpus_used.get(gpu_type, 0) + used + + # ---- roll over unused budget to next models ---- + # If this model used fewer GPUs than its budget, the surplus + # is distributed evenly among the remaining models. + remaining_models = [ + m for m in model_order + if m in gpu_budget and MODEL_ORDER.get(m, 0) > MODEL_ORDER.get(model, 0) + ] + if remaining_models: + for gpu_type in num_gpus: + budget_for_model = model_budget.get(gpu_type, 0) + used_by_model = result.gpus_used.get(gpu_type, 0) + surplus = budget_for_model - used_by_model + if surplus > 0: + per_model_extra = surplus // len(remaining_models) + leftover = surplus % len(remaining_models) + for i, rm in enumerate(remaining_models): + extra = per_model_extra + (1 if i < leftover else 0) + gpu_budget[rm][gpu_type] = gpu_budget[rm].get(gpu_type, 0) + extra + + if verbose: + print( + f"Helix: {model.value} allocated. 
" + f"Time: {result.total_time_s:.2f}s, " + f"TTFF: {result.ttff_s:.2f}s, " + f"GPUs used: {result.gpus_used}, " + f"Remaining: {remaining_gpus}" + ) + + result = evaluate_model_allocation( + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + models=all_model_allocations, + num_gpus=num_gpus, + ) + + if verbose: + print( + f"=== Helix final: " + f"Makespan={result.total_time_s:.2f}s, " + f"TTFF={result.ttff_s:.2f}s, " + f"TBF={result.tbf_s:.4f}s, " + f"Cost=${result.cost:.2f}, " + f"Energy={result.total_energy:.2f}Ws, " + f"GPUs used={result.gpus_used} ===" + ) + + return result diff --git a/streamwise/model_provisioner/hexgen.py b/streamwise/model_provisioner/hexgen.py new file mode 100644 index 00000000..4f37768a --- /dev/null +++ b/streamwise/model_provisioner/hexgen.py @@ -0,0 +1,629 @@ +""" +HexGen algorithm for the StreamWise workflow allocation problem. + +Reference: https://arxiv.org/abs/2311.11514 + +HexGen treats each model in the workflow as an independent component for optimization. +It tracks metrics per model and optimizes models sequentially according to MODEL_ORDER. +When a model's metric converges (stops dropping), it moves to the next model. +After the last model converges, it cycles back to the first model and allocates +remaining GPUs until exhausted. 
def _get_model_order(workflow: WorkflowConfig) -> list[Model]:
    """
    Return the workflow's models sorted by their MODEL_ORDER rank.

    Models of the workflow that have no entry in MODEL_ORDER are dropped.
    """
    ranked = (m for m in workflow.models if m in MODEL_ORDER)
    return sorted(ranked, key=MODEL_ORDER.__getitem__)


def _remaining_gpus_by_type(
    models: dict[GPUType, dict[Model, list[ModelAllocation]]],
    num_gpus: dict[GPUType, int],
) -> dict[GPUType, int]:
    """Return, for each GPU type in ``num_gpus``, the GPUs not yet used by ``models``."""
    return {
        gpu_type: total - calc_used_gpus({gpu_type: models[gpu_type]})
        for gpu_type, total in num_gpus.items()
    }


class HexGenAllocator(GreedyAllocator):
    """
    HexGen-style allocator that optimizes models one at a time,
    sequentially following MODEL_ORDER.

    Reference: https://arxiv.org/abs/2311.11514

    Key differences from GreedyAllocator:
    1. Each model is treated as an independent optimization target.
    2. Per-model metrics are tracked separately.
    3. Models are optimized in a single pass over MODEL_ORDER. When a model's
       metric converges (or it has no more actions), the allocator moves to
       the next model. The pass is never repeated.
    4. If GPUs are still free after that pass and USE_ALL_GPUS is set, a fill
       phase cycles round-robin over MODEL_ORDER and hands out the remaining
       GPUs, ignoring metric convergence.
    """

    def __init__(
        self,
        workflow: WorkflowConfig,
        latency_data: LatencyData,
        power_data: Optional[PowerData] = None,
        policy: Policy = HEXGEN_POLICY,
    ) -> None:
        """
        Build the allocator; all arguments are forwarded to GreedyAllocator.

        Raises:
            AssertionError: if the policy's solver is not Solver.HEXGEN.
        """
        super().__init__(
            workflow,
            latency_data,
            power_data,
            policy,
        )
        assert self.policy.solver == Solver.HEXGEN

    def _hexgen_actions_for_model(
        self,
        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
        num_gpus: dict[GPUType, int],
        current_model: Model,
    ) -> list:
        """
        Re-evaluate ``models`` and return the candidate actions for one model.

        All actions are generated by ``gen_actions`` and then filtered down to
        the ones whose ``model`` attribute equals ``current_model``.
        """
        # The evaluation result is discarded on purpose: every call site ran
        # this evaluation right before generating actions.
        # NOTE(review): presumably it refreshes state on the allocations that
        # gen_actions reads - confirm before removing.
        evaluate_model_allocation(
            models=models,
            num_gpus=num_gpus,
            workflow=self.workflow,
            latency_data=self.latency_data,
            power_data=self.power_data,
            policy=self.policy,
            round_up_cost_to_server=False,
        )
        all_actions = gen_actions(
            num_gpus=num_gpus,
            latency_data=self.latency_data,
            power_data=self.power_data,
            workflow=self.workflow,
            models=models,
            policy=self.policy,
        )
        return [a for a in all_actions if a.model == current_model]

    def _pick_from_single_device_mapping(
        self,
        num_gpus: int,
        gpu_type: GPUType,
        verbose: bool = False,
        allow_removal: bool = False,
        allow_merging: bool = False,
        look_ahead_replicas: int = 3,
    ) -> Result:
        """
        HexGen-style allocation for a single GPU type.

        Requires at least NUM_GPUS_PER_SERVER[gpu_type] GPUs. Models are
        optimized one at a time, in MODEL_ORDER, in a single pass.

        Args:
            num_gpus: Number of GPUs of ``gpu_type`` available.
            gpu_type: The GPU type to allocate on.
            verbose: Print per-iteration and final allocation details.
            allow_removal: Unused by this implementation.
            allow_merging: Unused by this implementation.
            look_ahead_replicas: Unused by this implementation.

        Returns:
            The Result of the final evaluation (cost rounded up to servers).
        """
        from constants import NUM_GPUS_PER_SERVER

        assert num_gpus >= NUM_GPUS_PER_SERVER[gpu_type]

        gpus_by_type = {gpu_type: num_gpus}

        # Initialize allocations (same as GreedyAllocator)
        models = self._init_single_device_models(gpu_type)

        remaining_gpus = num_gpus - calc_used_gpus(models)
        assert 0 <= remaining_gpus <= num_gpus

        # --- HexGen per-model sequential optimization ---
        model_order = _get_model_order(self.workflow)
        per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order}
        total_models = len(model_order)

        it = 0

        for position, current_model in enumerate(model_order, start=1):
            if remaining_gpus <= 0:
                break

            if verbose:
                print(f"--- HexGen: Optimizing {current_model.value} "
                      f"(model {position}/{total_models}) ---")

            # Inner loop: keep optimizing current model until convergence
            inner_it = 0
            while remaining_gpus > 0:
                model_actions = self._hexgen_actions_for_model(
                    models, gpus_by_type, current_model)

                if not model_actions:
                    logging.debug(
                        f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.")
                    break

                best_action = choose_action(model_actions, self.policy.objective)

                if not best_action:
                    logging.debug(f"HexGen: No action selected for {current_model.value}.")
                    break

                new_metric = best_action.get_metric(self.policy.objective)
                prev_metric = per_model_metrics[current_model]

                # The first action of each model is always applied because
                # its previous metric is still None.
                if (
                    self.policy.objective.is_monotonic()
                    and prev_metric is not None
                    and new_metric >= prev_metric
                ):
                    msg = (
                        f"HexGen: {current_model.value} converged after {inner_it} inner iterations. "
                        f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
                    )
                    if verbose:
                        print(msg)
                    logging.debug(msg)
                    break

                per_model_metrics[current_model] = new_metric

                models = apply_action(best_action, models=models)
                models = simplify_model_allocations(models)

                remaining_gpus = num_gpus - calc_used_gpus(models)

                if verbose:
                    self._print_iteration(it, models, {gpu_type: num_gpus})
                    print(f"HexGen: Applied action for {current_model.value}, "
                          f"metric: {new_metric:.2f}, remaining: {remaining_gpus}")

                it += 1
                inner_it += 1

                if it > MAX_ITERATIONS:
                    logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
                    break

            if it > MAX_ITERATIONS:
                break
        else:
            # The pass visited every model and GPUs are still free.
            if remaining_gpus > 0:
                logging.debug("HexGen: No progress after 1 full cycles.")

        # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER ---
        remaining_gpus = num_gpus - calc_used_gpus(models)
        if USE_ALL_GPUS and remaining_gpus > 0:
            models = self._fill_remaining_gpus_single(
                models=models,
                num_gpus=num_gpus,
                gpu_type=gpu_type,
                model_order=model_order,
                it=it,
                verbose=verbose,
            )

        # Final evaluation
        result = evaluate_model_allocation(
            models=models,
            num_gpus=gpus_by_type,
            workflow=self.workflow,
            latency_data=self.latency_data,
            power_data=self.power_data,
            policy=self.policy,
            round_up_cost_to_server=True,
        )

        if verbose:
            self._print_final_allocation(
                models=models,
                used_devices=result.gpus_used,
                total_devices={gpu_type: num_gpus},
                power_data=self.power_data,
                total_time_s=result.total_time_s,
                ttff_s=result.ttff_s,
                first_chunk_time=result.first_chunk_time,
                tbf_s=result.tbf_s,
                total_energy=result.total_energy if self.power_data else 0.0,
                cost=result.cost,
            )

        # Without disaggregation the VAE entry must not own GPUs.
        # Only the first VAE allocation is checked here.
        if not self.policy.is_disaggregated(Model.HF):
            if models[gpu_type][Model.HF_VAE]:
                assert models[gpu_type][Model.HF_VAE][0].get_num_gpus() == 0, \
                    "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
        if not self.policy.is_disaggregated(Model.FT):
            if models[gpu_type][Model.FT_VAE]:
                assert models[gpu_type][Model.FT_VAE][0].get_num_gpus() == 0, \
                    "FT_VAE must have 0 GPUs when FT disaggregation is disabled"

        num_gpus_used = result.gpus_used[gpu_type]
        assert num_gpus_used <= num_gpus, f"{num_gpus_used}>{num_gpus} for {gpu_type.value}"

        return Result(
            total_time_s=result.total_time_s,
            models=models,
            gpus_used={gpu_type: num_gpus_used},
            gpus_total={gpu_type: num_gpus},
            ttff_s=result.ttff_s,
            tbf_s=result.tbf_s,
            total_energy=result.total_energy if self.power_data else 0.0,
            cost=result.cost,
        )

    def _pick_from_both_devices_mapping(
        self,
        num_gpus: dict[GPUType, int],
        verbose: bool = False,
        allow_removal: bool = False,
        allow_merging: bool = False,
        look_ahead_replicas: int = 3,
    ) -> Result:
        """
        HexGen-style allocation for exactly two GPU types.

        Each type must provide at least NUM_GPUS_PER_SERVER GPUs. Models are
        optimized one at a time, in MODEL_ORDER, in a single pass.

        Args:
            num_gpus: Available GPUs per GPU type (exactly two keys).
            verbose: Print per-iteration and final allocation details.
            allow_removal: Unused by this implementation.
            allow_merging: Unused by this implementation.
            look_ahead_replicas: Unused by this implementation.

        Returns:
            The Result of the final evaluation (cost rounded up to servers).
        """
        from constants import NUM_GPUS_PER_SERVER

        gpu_types = list(num_gpus)
        assert len(gpu_types) == 2
        gpu_type1 = gpu_types[0]
        gpu_type2 = gpu_types[1]
        assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1]
        assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2]

        # Initialize allocations (same as GreedyAllocator)
        models = self._init_both_devices_models(gpu_type1, gpu_type2)

        remaining_gpus = _remaining_gpus_by_type(models, num_gpus)

        # --- HexGen per-model sequential optimization ---
        model_order = _get_model_order(self.workflow)
        per_model_metrics: dict[Model, Optional[float]] = {m: None for m in model_order}
        total_models = len(model_order)

        if verbose:
            evaluate_model_allocation(
                models=models,
                num_gpus=num_gpus,
                workflow=self.workflow,
                latency_data=self.latency_data,
                power_data=self.power_data,
                policy=self.policy,
                round_up_cost_to_server=True,
            )
            self._print_iteration(0, models, num_gpus)

        # Iteration 0 is the initial state printed above.
        it = 1

        for position, current_model in enumerate(model_order, start=1):
            if sum(remaining_gpus.values()) <= 0:
                break

            if verbose:
                print(f"--- HexGen: Optimizing {current_model.value} "
                      f"(model {position}/{total_models}) ---")

            inner_it = 0

            while sum(remaining_gpus.values()) > 0:
                model_actions = self._hexgen_actions_for_model(
                    models, num_gpus, current_model)

                if not model_actions:
                    logging.debug(
                        f"HexGen: No actions for {current_model.value} after {inner_it} inner iterations.")
                    break

                best_action = choose_action(model_actions, self.policy.objective)

                if not best_action:
                    logging.debug(f"HexGen: No action selected for {current_model.value}.")
                    break

                new_metric = best_action.get_metric(self.policy.objective)
                prev_metric = per_model_metrics[current_model]

                if (
                    self.policy.objective.is_monotonic()
                    and prev_metric is not None
                    and new_metric >= prev_metric
                ):
                    msg = (
                        f"HexGen: {current_model.value} converged. "
                        f"Metric: {new_metric:.2f} >= previous {prev_metric:.2f}."
                    )
                    if verbose:
                        print(msg)
                    logging.debug(msg)
                    break

                per_model_metrics[current_model] = new_metric

                models = apply_action(best_action, models=models)
                models = simplify_model_allocations(models)

                remaining_gpus = _remaining_gpus_by_type(models, num_gpus)

                if verbose:
                    self._print_iteration(it, models, num_gpus)
                    print(f"HexGen: Applied action for {current_model.value}, "
                          f"metric: {new_metric:.2f}")
                    print("Remaining devices:")
                    for gt, free in remaining_gpus.items():
                        print(f" {free} x {gt.value}")

                it += 1
                inner_it += 1

                if it > MAX_ITERATIONS:
                    logging.debug(f"HexGen: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
                    break

            if it > MAX_ITERATIONS:
                break
        else:
            # The pass visited every model and GPUs are still free.
            if sum(remaining_gpus.values()) > 0:
                logging.debug("HexGen: No progress after 1 full cycles.")

        # --- USE_ALL_GPUS: fill remaining GPUs by cycling through MODEL_ORDER ---
        remaining_gpus_total = sum(_remaining_gpus_by_type(models, num_gpus).values())
        if USE_ALL_GPUS and remaining_gpus_total > 0:
            models = self._fill_remaining_gpus_multi(
                models=models,
                num_gpus=num_gpus,
                model_order=model_order,
                it=it,
                verbose=verbose,
            )

        # Without disaggregation no VAE allocation may own GPUs (all
        # instances on all GPU types are checked).
        if not self.policy.is_disaggregated(Model.HF):
            for models_gpu in models.values():
                for allocation in models_gpu[Model.HF_VAE]:
                    assert allocation.get_num_gpus() == 0, \
                        "HF_VAE must have 0 GPUs when HF disaggregation is disabled"
        if not self.policy.is_disaggregated(Model.FT):
            for models_gpu in models.values():
                for allocation in models_gpu[Model.FT_VAE]:
                    assert allocation.get_num_gpus() == 0, \
                        "FT_VAE must have 0 GPUs when FT disaggregation is disabled"

        # Final evaluation
        result = evaluate_model_allocation(
            models=models,
            num_gpus=num_gpus,
            workflow=self.workflow,
            latency_data=self.latency_data,
            power_data=self.power_data,
            policy=self.policy,
            round_up_cost_to_server=True,
        )

        if verbose:
            self._print_final_allocation(
                models=models,
                used_devices=result.gpus_used,
                total_devices={
                    gpu_type1: num_gpus.get(gpu_type1, 0),
                    gpu_type2: num_gpus.get(gpu_type2, 0),
                },
                power_data=self.power_data,
                total_time_s=result.total_time_s,
                ttff_s=result.ttff_s,
                first_chunk_time=result.first_chunk_time,
                tbf_s=result.tbf_s,
                total_energy=result.total_energy if self.power_data else 0.0,
                cost=result.cost,
            )

        assert result.gpus_used[gpu_type1] <= num_gpus.get(gpu_type1, 0), \
            f"{gpu_type1.value}: {result.gpus_used[gpu_type1]} > {num_gpus.get(gpu_type1, 0)}"
        assert result.gpus_used[gpu_type2] <= num_gpus.get(gpu_type2, 0), \
            f"{gpu_type2.value}: {result.gpus_used[gpu_type2]} > {num_gpus.get(gpu_type2, 0)}"

        # NOTE(review): unlike the single-type path, gpus_total is not set
        # here - confirm whether callers rely on Result's default.
        return Result(
            total_time_s=result.total_time_s,
            models=models,
            gpus_used=result.gpus_used,
            ttff_s=result.ttff_s,
            tbf_s=result.tbf_s,
            total_energy=result.total_energy if self.power_data else 0.0,
            cost=result.cost,
        )

    def _fill_remaining_gpus_single(
        self,
        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
        num_gpus: int,
        gpu_type: GPUType,
        model_order: list[Model],
        it: int = 0,
        verbose: bool = False,
    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
        """
        Fill remaining GPUs by cycling through MODEL_ORDER (single GPU type).

        Applies the chosen action of each model in turn, ignoring metric
        convergence. A model with no action left is marked exhausted and
        skipped from then on. Stops when all GPUs are used, every model is
        exhausted, or ``it`` exceeds MAX_ITERATIONS.

        Args:
            models: Current allocations.
            num_gpus: Number of GPUs of ``gpu_type`` available.
            gpu_type: The GPU type being allocated.
            model_order: Models to cycle over, in order.
            it: Iteration counter to continue from.
            verbose: Print each applied allocation.

        Returns:
            The updated allocations.
        """
        gpus_by_type = {gpu_type: num_gpus}
        remaining_gpus = num_gpus - calc_used_gpus(models)
        total_models = len(model_order)
        model_idx = 0
        models_exhausted: set[Model] = set()

        if verbose:
            print(f"--- HexGen: USE_ALL_GPUS fill phase, {remaining_gpus} remaining ---")

        while remaining_gpus > 0 and len(models_exhausted) < total_models:
            # Round-robin over model_order.
            current_model = model_order[model_idx % total_models]
            model_idx += 1

            if current_model in models_exhausted:
                continue

            model_actions = self._hexgen_actions_for_model(
                models, gpus_by_type, current_model)

            if not model_actions:
                models_exhausted.add(current_model)
                logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).")
                continue

            best_action = choose_action(model_actions, self.policy.objective)
            if not best_action:
                models_exhausted.add(current_model)
                logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).")
                continue

            models = apply_action(best_action, models=models)
            models = simplify_model_allocations(models)
            remaining_gpus = num_gpus - calc_used_gpus(models)

            if verbose:
                self._print_iteration(it, models, {gpu_type: num_gpus})
                print(f"HexGen fill: Allocated to {current_model.value}, remaining: {remaining_gpus}")

            it += 1
            if it > MAX_ITERATIONS:
                logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
                break

        return models

    def _fill_remaining_gpus_multi(
        self,
        models: dict[GPUType, dict[Model, list[ModelAllocation]]],
        num_gpus: dict[GPUType, int],
        model_order: list[Model],
        it: int = 0,
        verbose: bool = False,
    ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]:
        """
        Fill remaining GPUs by cycling through MODEL_ORDER (multi GPU type).

        Applies the chosen action of each model in turn, ignoring metric
        convergence. A model with no action left is marked exhausted and
        skipped from then on. Stops when all GPUs are used, every model is
        exhausted, or ``it`` exceeds MAX_ITERATIONS.

        Args:
            models: Current allocations.
            num_gpus: Available GPUs per GPU type.
            model_order: Models to cycle over, in order.
            it: Iteration counter to continue from.
            verbose: Print each applied allocation.

        Returns:
            The updated allocations.
        """
        total_remaining = sum(_remaining_gpus_by_type(models, num_gpus).values())
        total_models = len(model_order)
        model_idx = 0
        models_exhausted: set[Model] = set()

        if verbose:
            print(f"--- HexGen: USE_ALL_GPUS fill phase, {total_remaining} remaining ---")

        while total_remaining > 0 and len(models_exhausted) < total_models:
            # Round-robin over model_order.
            current_model = model_order[model_idx % total_models]
            model_idx += 1

            if current_model in models_exhausted:
                continue

            model_actions = self._hexgen_actions_for_model(
                models, num_gpus, current_model)

            if not model_actions:
                models_exhausted.add(current_model)
                logging.debug(f"HexGen fill: {current_model.value} exhausted (no actions).")
                continue

            best_action = choose_action(model_actions, self.policy.objective)
            if not best_action:
                models_exhausted.add(current_model)
                logging.debug(f"HexGen fill: {current_model.value} exhausted (no action selected).")
                continue

            models = apply_action(best_action, models=models)
            models = simplify_model_allocations(models)
            total_remaining = sum(_remaining_gpus_by_type(models, num_gpus).values())

            if verbose:
                self._print_iteration(it, models, num_gpus)
                print(f"HexGen fill: Allocated to {current_model.value}, remaining: {total_remaining}")

            it += 1
            if it > MAX_ITERATIONS:
                logging.debug(f"HexGen fill: Reached max iterations ({MAX_ITERATIONS}). Stopping.")
                break

        return models
def idx(
    gpu_type: GPUType,
    model_name: Model,
    instance_id: int
) -> tuple[str, str, int]:
    """Build the (gpu type value, model value, instance id) key of an instance variable."""
    gpu_key = gpu_type.value
    model_key = model_name.value
    return gpu_key, model_key, instance_id


def dev_idx(
    gpu_type: GPUType,
    model_name: Model,
    instance_id: int,
    num_devices: int
) -> tuple[str, str, int, int]:
    """Build the (gpu type value, model value, instance id, device count) key of a device variable."""
    gpu_key = gpu_type.value
    model_key = model_name.value
    return gpu_key, model_key, instance_id, num_devices
+ """ + m = ConcreteModel() + + # Options: "gurobi", "highs" + solver_name = self.policy.solver.value + + # Define index sets + gpu_types = list(num_gpus.keys()) + + model_names = [ + Model.GEMMA, + Model.FLUX, + Model.HF, + # Model.HF_VAE, + Model.FT, + # Model.FT_VAE, + # Model.UPSCALER, + Model.OTHERS, + ] + if self.policy.use_upscaler: + model_names.append(Model.UPSCALER) + if self.policy.is_disaggregated(Model.HF): + model_names.append(Model.HF_VAE) + if self.policy.is_disaggregated(Model.FT): + model_names.append(Model.FT_VAE) + + # Remove models not in the workflow + model_names = [ + model_name + for model_name in model_names + if model_name in self.workflow.models + ] + + instance_ids = list(range(MAX_INSTANCES)) + + # The units of work that each model has to do + work: dict[Model, int] = self.workflow.work + + # Create Pyomo Sets + m.GPU_TYPES = Set(initialize=[g.value for g in gpu_types]) + m.MODEL_NAMES = Set(initialize=[mn.value for mn in model_names]) + m.INSTANCES = Set(initialize=instance_ids) + + # Create index set for device choices: (gpu_type, model_name, instance_id, device_count) + device_index_set = [ + (gpu_type.value, model_name.value, instance_id, num_devices) + for gpu_type in gpu_types + for model_name in model_names + for instance_id in instance_ids + for num_devices in [0] + DEVICE_OPTIONS[model_name] + ] + m.DEVICE_INDEX = Set(initialize=device_index_set) + + # Create index set for instance variables: (gpu_type, model_name, instance_id) + instance_index_set = [ + (gpu_type.value, model_name.value, instance_id) + for gpu_type in gpu_types + for model_name in model_names + for instance_id in instance_ids + ] + m.INSTANCE_INDEX = Set(initialize=instance_index_set) + + # Define indexed variables + m.device_choice = Var(m.DEVICE_INDEX, domain=Binary) + m.work_device = Var(m.DEVICE_INDEX, domain=NonNegativeIntegers) # Linearization: work per device choice + m.gpus = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers) + m.is_active = 
Var(m.INSTANCE_INDEX, domain=Binary) + m.is_min = Var(m.INSTANCE_INDEX, domain=Binary) + m.work = Var(m.INSTANCE_INDEX, domain=NonNegativeIntegers) + m.time = Var(m.INSTANCE_INDEX, domain=NonNegativeReals) + m.ttff = Var(m.INSTANCE_INDEX, domain=NonNegativeReals) + + # Objective variables + m.makespan = Var(domain=NonNegativeReals) + m.ttff_user = Var(domain=NonNegativeReals) + m.ttff_min = Var(m.MODEL_NAMES, domain=NonNegativeReals) # Per-model minimum TTFF + m.time_max = Var(m.MODEL_NAMES, domain=NonNegativeReals) # Per-model maximum time + m.cost = Var(domain=NonNegativeReals) + m.energy = Var(domain=NonNegativeReals) + + # Constraint list for dynamic constraints + m.constraints = ConstraintList() + + for gpu_type in gpu_types: + for model_name in model_names: + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + + # GPUs used = sum of num_devices * device_choice[num_devices] + m.constraints.add( + m.gpus[key] == sum( + num_devices * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in [0] + DEVICE_OPTIONS[model_name] + ) + ) + + # Cannot select inactive instance as min + m.constraints.add(m.is_min[key] <= m.is_active[key]) + # If active = 0 -> GPUs = 0 + m.constraints.add(m.gpus[key] <= num_gpus[gpu_type] * m.is_active[key]) + # If active = 1 -> GPUs ≥ 1 + m.constraints.add(m.gpus[key] >= m.is_active[key]) + # If work = 0 -> active = 0 -> GPUs = 0 + m.constraints.add(m.is_active[key] <= m.work[key]) + + # If device = 0 -> work = 0 + dev_idx_0 = dev_idx(gpu_type, model_name, instance_id, 0) + m.constraints.add( + m.work[key] + <= work[model_name] * (1 - m.device_choice[dev_idx_0]) + ) + + # Linearization: work_device links device_choice and work + # work = sum(work_device[d] for d in devices) - excludes 0 GPUs since they can't do work + m.constraints.add( + m.work[key] == sum( + m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in 
DEVICE_OPTIONS[model_name] + ) + ) + # If any non-zero device is selected, work must be >= 1 + m.constraints.add( + m.work[key] >= sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # work_device[d] <= TOTAL_WORK * device_choice[d] + for num_devices in [0] + DEVICE_OPTIONS[model_name]: + didx = dev_idx(gpu_type, model_name, instance_id, num_devices) + m.constraints.add( + m.work_device[didx] <= work[model_name] * m.device_choice[didx] + ) + + # Link instance time to per-model max time + m.constraints.add(m.time[key] <= m.time_max[model_name.value]) + + # Link TTFF to per-model TTFF min + # If selected → ttff_min[model] == ttff_var + m.constraints.add(m.ttff_min[model_name.value] >= m.ttff[key] - MAX_TIME * (1 - m.is_min[key])) + m.constraints.add(m.ttff_min[model_name.value] <= m.ttff[key] + MAX_TIME * (1 - m.is_active[key])) + + # One device per instance + for instance_id in instance_ids: + m.constraints.add( + sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in [0] + DEVICE_OPTIONS[model_name] + ) == 1 + ) + + # Symmetry breaking (fill earlier instances first) + for instance_id in range(MAX_INSTANCES - 1): + m.constraints.add( + m.gpus[idx(gpu_type, model_name, instance_id)] + >= m.gpus[idx(gpu_type, model_name, instance_id + 1)] + ) + + # Makespan is the sum of max times per model (models run sequentially) + m.constraints.add(m.makespan == sum(m.time_max[model_name.value] for model_name in model_names)) + + # User TTFF definition: sum of min TTFF per model + m.constraints.add(m.ttff_user >= sum(m.ttff_min[model_name.value] for model_name in model_names)) + m.constraints.add(m.ttff_user >= m.makespan - self.workflow.total_video_seconds) + + # Select exactly 1 instance as the min TTFF instance per model + for model_name in model_names: + m.constraints.add( + sum( + m.is_min[idx(gpu_type, model_name, instance_id)] + for gpu_type in 
gpu_types + for instance_id in instance_ids + ) == 1 + ) + + # Resolution scaling factor for HF/VAE/FT + latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler) + + # Time constraints + # Each model block is guarded by membership in model_names so that + # the MILP can be built for a subset of models (e.g. Helix per-model). + for gpu_type in gpu_types: + # Gemma + if Model.GEMMA in model_names and work[Model.GEMMA] > 0: + model_name = Model.GEMMA + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Makespan is the max time across all instances + # Linearized: use work_device instead of device_choice * work + if work[model_name] > 1: + # Parallel: each work unit = 1 scene + # Time for w scenes + # = gemma_first_scene + gemma_per_scene * (w - 1) + # = (gemma_first_scene - gemma_per_scene) * is_active + gemma_per_scene * work + # Using linearized variables: + # = (gemma_first_scene[d] - gemma_per_scene[d]) * \ + # device_choice[d] + gemma_per_scene[d] * work_device[d] + m.constraints.add( + m.time[key] == sum( + ( + self.latency_data[gpu_type].gemma_first_scene[num_devices] + - self.latency_data[gpu_type].gemma_per_scene[num_devices] + ) + * m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + + self.latency_data[gpu_type].gemma_per_scene[num_devices] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + else: + m.constraints.add( + m.time[key] == sum( + ( + self.latency_data[gpu_type].gemma_first_scene[num_devices] + + self.latency_data[gpu_type].gemma_per_scene[num_devices] + * (self.workflow.total_scenes - 1) + ) + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # TTFF is for 1 work unit + m.constraints.add( + m.ttff[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * 
self.latency_data[gpu_type].gemma_first_scene[num_devices] + * 1 # TTFF for tokens in first scene + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + + # Flux + if Model.FLUX in model_names and work[Model.FLUX] > 0: + model_name = Model.FLUX + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Makespan is the max time across all instances + # Linearized: use work_device instead of device_choice * work + if work[model_name] > 1: + # Parallel: each work unit = 1 scene + # Time for w scenes = latency * num_steps_flux * w + m.constraints.add( + m.time[key] == sum( + self.latency_data[gpu_type][model_name, num_devices] + * self.workflow.num_steps[model_name] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + else: + # Non-parallel: single work unit covers all scenes + m.constraints.add( + m.time[key] == sum( + self.latency_data[gpu_type][model_name, num_devices] + * self.workflow.num_steps[model_name] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # TTFF is for 1 work unit + m.constraints.add( + m.ttff[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.latency_data[gpu_type][model_name, num_devices] + * self.workflow.num_steps[model_name] + * 1 # TTFF for first work unit + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + + # Hunyuan FramePack + if Model.HF in model_names and work[Model.HF] > 0: + model_name = Model.HF + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + + """ + from models import HFModelAllocation + HFModelAllocation( + gpu_type, + num_devices, + replicas=1, + )._calc_time_per_subscene( + self.policy, + self.workflow, + self.latency_data[gpu_type] + ) + """ + + # Makespan is the max time across all instances + # Linearized: use work_device instead of device_choice * 
work + hf_time_expr = sum( + self.workflow.per_subscene_frames[model_name] + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + * self.workflow.num_steps[model_name] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + # When not disaggregated, VAE runs on the same instance + if not self.policy.is_disaggregated(Model.HF): + hf_vae_time_per_work = ( + self.latency_data[gpu_type][Model.HF_VAE, 1] + * latency_ratio + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + ) + hf_time_expr += hf_vae_time_per_work * m.work[key] + m.constraints.add(m.time[key] == hf_time_expr) + # TTFF is for first chunk (can be smaller than subscene when disaggregated) + ttff_frames_hf = min( + self.workflow.hf_frames[0], + self.workflow.per_subscene_frames[model_name]) + hf_ttff_expr = sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * ttff_frames_hf + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + * self.workflow.num_steps[model_name] + * 1 # TTFF for first chunk + for num_devices in DEVICE_OPTIONS[model_name] + ) + # When not disaggregated, add VAE decode time for first chunk + if not self.policy.is_disaggregated(Model.HF): + hf_vae_ttff = ( + ttff_frames_hf + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][Model.HF_VAE, 1] + * latency_ratio + ) + hf_ttff_expr += hf_vae_ttff * m.is_active[key] + m.constraints.add(m.ttff[key] == hf_ttff_expr) + + # Hunyuan FramePack VAE + if Model.HF_VAE in model_names and work[Model.HF_VAE] > 0: + model_name = Model.HF_VAE + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Makespan is the max time across all instances + # Linearized: use work_device instead of device_choice * work + 
m.constraints.add( + m.time[key] == sum( + self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # TTFF is for 1 subscene + m.constraints.add( + m.ttff[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.workflow.per_subscene_frames[Model.HF] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] # frames_per_step_hf + * 1 # TTFF for first subscene + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + + # Fantasy Talking + if Model.FT in model_names and work[Model.FT] > 0: + model_name = Model.FT + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Makespan is the max time across all instances + # Linearized: use work_device instead of device_choice * work + ft_time_expr = sum( + self.workflow.per_subscene_frames[model_name] + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + * self.workflow.num_steps[model_name] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + # When not disaggregated, VAE runs on the same instance + if not self.policy.is_disaggregated(Model.FT): + ft_vae_time_per_work = ( + self.latency_data[gpu_type][Model.FT_VAE, 1] + * latency_ratio + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] + ) + ft_time_expr += ft_vae_time_per_work * m.work[key] + m.constraints.add(m.time[key] == ft_time_expr) + # TTFF is for 1 work unit (e.g., subscene) + ft_ttff_expr = sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.workflow.per_subscene_frames[model_name] + / 
self.workflow.ft_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + * self.workflow.num_steps[model_name] + * 1 # TTFF for first subscene + for num_devices in DEVICE_OPTIONS[model_name] + ) + # When not disaggregated, add VAE decode time for first subscene + if not self.policy.is_disaggregated(Model.FT): + ft_vae_ttff = ( + self.workflow.per_subscene_frames[Model.FT] + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][Model.FT_VAE, 1] + * latency_ratio + ) + ft_ttff_expr += ft_vae_ttff * m.is_active[key] + m.constraints.add(m.ttff[key] == ft_ttff_expr) + + # Fantasy Talking VAE + if Model.FT_VAE in model_names and work[Model.FT_VAE] > 0: + model_name = Model.FT_VAE + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Makespan is the max time across all instances + # Linearized: use work_device instead of device_choice * work + m.constraints.add( + m.time[key] == sum( + self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # TTFF is for 1 subscene + m.constraints.add( + m.ttff[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.workflow.per_subscene_frames[Model.FT] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] # frames_per_step_ft + * 1 # TTFF for first subscene + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + + # Upscaler + if Model.UPSCALER in model_names and work[Model.UPSCALER] > 0 and self.policy.use_upscaler: + model_name = Model.UPSCALER + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Linearized: use work_device instead of 
device_choice * work + m.constraints.add( + m.time[key] == sum( + self.latency_data[gpu_type][model_name, num_devices] + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # TTFF is for 1 work unit (e.g., subscene) + m.constraints.add( + m.ttff[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.latency_data[gpu_type][model_name, num_devices] + * self.workflow.per_subscene_frames[Model.FT] + * 1 # TTFF is for first subscene + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + + # Others + if Model.OTHERS in model_names and work[Model.OTHERS] > 0: + model_name = Model.OTHERS + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + # Makespan is the max time across all instances + m.constraints.add( + m.time[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.latency_data[gpu_type][model_name, num_devices] + * self.workflow.total_scenes + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + # TTFF is for 1 work unit + m.constraints.add( + m.ttff[key] == sum( + m.device_choice[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.latency_data[gpu_type][model_name, num_devices] + * 1 # TTFF is for first scene + for num_devices in DEVICE_OPTIONS[model_name] + ) + ) + + # Total work to do for each model + for model_name in model_names: + m.constraints.add( + sum( + m.work[idx(gpu_type, model_name, instance_id)] + for gpu_type in gpu_types + for instance_id in instance_ids + ) == work[model_name] + ) + + # Number of GPUs per type + # Add a variable to represent the number of servers for each GPU type + m.num_servers = Var(m.GPU_TYPES, domain=NonNegativeIntegers) + + for gpu_type in gpu_types: + total_gpus = sum( + m.gpus[idx(gpu_type, model_name, instance_id)] + for model_name in model_names + for instance_id in instance_ids + ) + if force_num_gpus: + 
m.constraints.add(total_gpus == num_gpus[gpu_type]) + else: + m.constraints.add(total_gpus <= num_gpus[gpu_type]) + + # GPUs used must be a multiple of NUM_GPUS_PER_SERVER + if not skip_server_constraint: + m.constraints.add(total_gpus == m.num_servers[gpu_type.value] * NUM_GPUS_PER_SERVER[gpu_type]) + + # Cost calculation + # running_cost=True: cost based only on active model running time + if running_cost: + cost_expr = sum( + self._get_latency_per_work( + gpu_type, + model_name, + num_devices, + ) + * num_devices + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR + for gpu_type in gpu_types + for model_name in model_names + for instance_id in instance_ids + for num_devices in DEVICE_OPTIONS[model_name] + ) + # running_cost=False: cost = makespan × total_GPUs_used (GPUs allocated for full job duration) + else: + cost_expr = m.makespan * sum( + m.gpus[idx(gpu_type, model_name, instance_id)] + * self.policy.gpu_cost[gpu_type] / SECONDS_IN_HOUR + for gpu_type in gpu_types + for model_name in model_names + for instance_id in instance_ids + ) + m.constraints.add(m.cost == cost_expr) + + # Energy: model-specific power * active time + idle power * (makespan - active time) + if self.power_data is None: + energy_expr = 0.0 + else: + # Active energy: Use model-specific power values (not TDP) + energy_expr = sum( + self._get_latency_per_work( + gpu_type, + model_name, + num_devices, + ) + * num_devices + * m.work_device[dev_idx(gpu_type, model_name, instance_id, num_devices)] + * ( + self._get_power_per_work( + gpu_type, + model_name, + num_devices, + ) - self.power_data[gpu_type]["idle"] + ) + for gpu_type in gpu_types + for model_name in model_names + for instance_id in instance_ids + for num_devices in DEVICE_OPTIONS[model_name] + ) + # Idle energy: idle power * num_gpus * makespan + energy_expr += sum( + self.power_data[gpu_type]["idle"] * num_gpus[gpu_type] * m.makespan + for gpu_type in 
gpu_types + ) + m.constraints.add(m.energy == energy_expr) + + # Bounds + if max_cost is not None: + m.constraints.add(m.cost <= max_cost) + if max_ttff is not None: + m.constraints.add(m.ttff_user <= max_ttff) + if max_makespan is not None: + m.constraints.add(m.makespan <= max_makespan) + + # Objective functions + obj = get_objective( + m=m, + allocator=self.policy.objective, + solver_name=solver_name, + ) + if obj is not None: + m.objective = obj + + # Solve + solver = SolverFactory(solver_name) + if solver_name == "gurobi" and time_limit: + solver.options["TimeLimit"] = time_limit + if solver_name == "highs" and time_limit: + solver.options["time_limit"] = time_limit + if self.policy.objective in QUADRATIC_OBJECTIVES and solver_name == "gurobi": + solver.options['NonConvex'] = 2 # Option for bilinear objectives + if solver_name == "highs": + solver.options["time_limit"] = 50 # seconds + + if warm_start_path is not None: + _load_warm_start(m, warm_start_path) + + if solver_name == "gurobi": + opt_result = solver.solve( + m, + tee=verbose, + warmstart=warm_start_path is not None, + ) + else: + opt_result = solver.solve(m, tee=verbose) + + if opt_result.solver.status != "ok": + logging.error(f"Solver failed with status: {opt_result.solver.status}") + + if save_solution_path is not None: + _save_solution(m, save_solution_path) + + models = milp_to_models_dict( + m=m, + gpu_types=gpu_types, + model_names=model_names, + instance_ids=instance_ids, + idx=idx, + workflow=self.workflow, + power_data=self.power_data, + policy=self.policy, + ) + + if not self._is_valid_result(m): + return Result() + + tbf_s = 0.0 + if m.makespan.value and self.workflow.num_frames > 0: + tbf_s = m.makespan.value / self.workflow.num_frames + return Result( + models=models, + gpus_used=self._get_num_gpus(m, gpu_types, model_names, instance_ids), + total_time_s=m.makespan.value, + ttff_s=m.ttff_user.value, + tbf_s=tbf_s, + cost=m.cost.value, + total_energy=m.energy.value, + ) + + def 
_is_valid_result(self, m: ConcreteModel) -> bool: + for gpu_type in m.GPU_TYPES: + for model_name in m.MODEL_NAMES: + for instance_id in m.INSTANCES: + if m.gpus[gpu_type, model_name, instance_id].value is None: + return False + return True + + def _get_num_gpus( + self, + m: ConcreteModel, + gpu_types: list[GPUType], + model_names: list[Model], + instance_ids: list[int], + ) -> dict[GPUType, int]: + if not self._is_valid_result(m): + return {} + return { + gpu_type: sum( + # round() snaps solver float to nearest int (e.g. 1.9999 -> 2) + int(round(m.gpus[idx(gpu_type, model_name, instance_id)].value)) + for model_name in model_names + for instance_id in instance_ids + if m.gpus[idx(gpu_type, model_name, instance_id)].value is not None + ) + for gpu_type in gpu_types + } + + def _get_latency_per_work( + self, + gpu_type: GPUType, + model_name: Model, + num_devices: int, + ) -> float: + """ + Cost per unit of work for a given model and GPU type, based on latency data. + Cost: Linearized - sum of (latency * work_device * num_devices * ratio) + This replaces the bilinear makespan * GPUs. 
+ """ + # Resolution scaling factor for HF/VAE/FT + latency_ratio = self.workflow.get_resolution_scale(self.policy.use_upscaler) + + if model_name == Model.GEMMA: + return ( + self.latency_data[gpu_type].gemma_first_scene[num_devices] + + self.latency_data[gpu_type].gemma_per_scene[num_devices] * (self.workflow.total_scenes - 1) + ) + + if model_name == Model.FLUX: + return ( + self.latency_data[gpu_type][model_name, num_devices] + * self.workflow.num_steps[Model.FLUX] + ) + + if model_name == Model.HF: + time_per_work = ( + self.workflow.per_subscene_frames[Model.HF] + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + * self.workflow.num_steps[Model.HF] + ) + if not self.policy.is_disaggregated(Model.HF): + time_per_work += self._get_latency_per_work( + gpu_type, + Model.HF_VAE, + 1, # VAE is single-device only in current policy + ) + return time_per_work + + if model_name == Model.HF_VAE: + return ( + self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + / self.workflow.hf_frames[self.workflow.frames_per_step_idx] + ) + + if model_name == Model.FT: + time_per_work = ( + self.workflow.per_subscene_frames[Model.FT] + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] + * self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + * self.workflow.num_steps[Model.FT] + ) + if not self.policy.is_disaggregated(Model.FT): + time_per_work += self._get_latency_per_work( + gpu_type, + Model.FT_VAE, + 1, # VAE is single-device only in current policy + ) + return time_per_work + + if model_name == Model.FT_VAE: + return ( + self.latency_data[gpu_type][model_name, num_devices] + * latency_ratio + / self.workflow.ft_frames[self.workflow.frames_per_step_idx] + ) + + if model_name == Model.UPSCALER: + return self.latency_data[gpu_type][model_name, num_devices] + + if model_name == Model.OTHERS: + return self.latency_data[gpu_type][model_name, 
num_devices] * self.workflow.total_scenes + + raise ValueError(f"Unknown model_name {model_name}") + + def _get_power_per_work( + self, + gpu_type: GPUType, + model_name: Model, + num_devices: int, + ) -> float: + """ + Average power per unit of work for a given model and GPU type. + Returns the time-weighted average power consumption in watts. + For energy calculation: + energy = _get_latency_per_work(...) * _get_power_per_work(...) * num_devices * work + """ + if self.power_data is None: + return 0.0 + + if model_name == Model.GEMMA: + # For Gemma, power varies between first scene and subsequent scenes + # Compute energy then divide by total time to get average power + power_first = self.power_data[gpu_type].gemma_first_scene[num_devices] + power_per_scene = self.power_data[gpu_type].gemma_per_scene[num_devices] + latency_first = self.latency_data[gpu_type].gemma_first_scene[num_devices] + latency_per_scene = self.latency_data[gpu_type].gemma_per_scene[num_devices] + + total_energy = ( + power_first * latency_first + + power_per_scene * latency_per_scene * (self.workflow.total_scenes - 1) + ) + total_time = latency_first + latency_per_scene * (self.workflow.total_scenes - 1) + + return total_energy / total_time if total_time > 0 else power_first + + if model_name == Model.FLUX: + return self.power_data[gpu_type][model_name, num_devices] + + if model_name == Model.HF: + return self.power_data[gpu_type][model_name, num_devices] + + if model_name == Model.HF_VAE: + return self.power_data[gpu_type][model_name, num_devices] + + if model_name == Model.FT: + return self.power_data[gpu_type][model_name, num_devices] + + if model_name == Model.FT_VAE: + return self.power_data[gpu_type][model_name, num_devices] + + if model_name == Model.UPSCALER: + return self.power_data[gpu_type][model_name, num_devices] + + if model_name == Model.OTHERS: + # OTHERS model uses minimal GPU power (mostly idle) + # See models.py OthersModelAllocation.calculate_energy - only uses idle power 
+ return self.power_data[gpu_type]["idle"] + + raise ValueError(f"Unknown model_name {model_name}") + + +def milp_to_models_dict( + m: ConcreteModel, + gpu_types: list[GPUType], + model_names: list[Model], + instance_ids: list[int], + idx: Callable[[GPUType, Model, int], tuple[str, str, int]], + workflow: WorkflowConfig, + power_data: Optional[PowerData], + policy: Policy, +) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: + """ + MILP result to models dictionary. + """ + if m is None: + return {} + + models: dict[GPUType, dict[Model, list[ModelAllocation]]] = {} + for gpu_type in gpu_types: + models[gpu_type] = {} + for model_name in model_names: + models[gpu_type][model_name] = [] + for instance_id in instance_ids: + key = idx(gpu_type, model_name, instance_id) + gpus_val = m.gpus[key].value + work_val = m.work[key].value + if gpus_val is None or work_val is None: + continue + # round() snaps solver floats to nearest int (e.g. 1.9999 -> 2); + # banker's rounding is irrelevant here since MILP values can be + # near-integer, like 1.999 and 2.001 + gpus = int(round(gpus_val)) + work = int(round(work_val)) + if gpus > 0 and work > 0: + model_allocation = get_model_allocation( + model=model_name, + gpu_type=gpu_type, + devices=gpus, + replicas=1, + ) + model_allocation.work = work + model_allocation.time = m.time[key].value + model_allocation.time_first = m.ttff[key].value + model_allocation.calculate_energy( + workflow=workflow, + power_data=power_data, + total_time_s=m.makespan.value + ) + model_allocation.calculate_cost( + policy, + total_time_s=m.makespan.value + ) + models[gpu_type][model_name].append(model_allocation) + merged_models = models # coalesce_models(models) + return merged_models + + +def get_objective( + m: ConcreteModel, + allocator: Objective, + solver_name: str, +) -> Optional[OptObjective]: + if allocator == Objective.TIME: + return OptObjective(expr=m.makespan, sense=minimize) + + if allocator == Objective.TTFF: + return 
OptObjective(expr=m.ttff_user, sense=minimize) + + if allocator == Objective.TTFF_COST: + # Note: This creates a bilinear (nonconvex) objective - requires Gurobi + if solver_name == "gurobi": + return OptObjective(expr=m.ttff_user * m.cost, sense=minimize) + logging.warning("TTFF_COST using linear utility function.") + a = 1.0 + b = 1.0 + return OptObjective(expr=a * m.ttff_user + b * m.cost, sense=minimize) + + if allocator == Objective.COST: + return OptObjective(expr=m.cost, sense=minimize) + + if allocator == Objective.ENERGY: + return OptObjective(expr=m.energy, sense=minimize) + + if allocator == Objective.TIME_ENERGY: + # Note: This creates a bilinear objective - requires Gurobi + if solver_name == "gurobi": + return OptObjective(expr=m.makespan * m.energy, sense=minimize) + logging.warning("TIME_ENERGY using linear utility function.") + a = 1.0 + b = 1.0 + return OptObjective(expr=a * m.makespan + b * m.energy, sense=minimize) + + if allocator == Objective.ENERGY_COST: + if solver_name == "gurobi": + return OptObjective(expr=m.energy * m.cost, sense=minimize) + logging.warning("ENERGY_COST using linear utility function.") + a = 1.0 + b = 1.0 + return OptObjective(expr=a * m.energy + b * m.cost, sense=minimize) + + if allocator == Objective.FIFO: + logging.error("FIFO not implemented in MILP") + + if allocator == Objective.RANDOM: + return None # No objective, just find a feasible solution + + if allocator == Objective.NONE: + return None + + return OptObjective(expr=m.makespan, sense=minimize) + + +def _save_solution( + m: ConcreteModel, + save_solution_path: str, +) -> None: + solution = { + var.name: var.value + for var in m.component_data_objects(Var, active=True) + if var.value is not None + } + with open(save_solution_path, "w", encoding="utf-8") as output_file: + json.dump(solution, output_file, indent=2) + + +def _load_warm_start( + m: ConcreteModel, + warm_start_path: str, +) -> None: + """Load warm start values from a JSON file and apply them to 
the model variables.""" + with open(warm_start_path, "r", encoding="utf-8") as input_file: + warm_start_values = json.load(input_file) + + warm_start_applied = 0 + for var in m.component_data_objects(Var, active=True): + if var.name in warm_start_values: + var.set_value(warm_start_values[var.name]) + warm_start_applied += 1 + + logging.info( + f"Warm start loaded from {warm_start_path}. " + f"Applied values to {warm_start_applied} variables." + ) diff --git a/streamwise/model_provisioner/naive_baseline.py b/streamwise/model_provisioner/naive_baseline.py new file mode 100644 index 00000000..ec95904e --- /dev/null +++ b/streamwise/model_provisioner/naive_baseline.py @@ -0,0 +1,484 @@ +""" +Naive baseline for the StreamWise workflow allocation problem. +""" + +from __future__ import annotations + +from typing import Optional + +from constants import NUM_GPUS_PER_SERVER +from constants import DEVICE_OPTIONS + +from sim_types import Result +from sim_types import GPUType +from sim_types import WorkflowConfig +from sim_types import LatencyData +from sim_types import PowerData +from sim_types import Policy +from sim_types import Solver +from sim_types import Model +from sim_types import ModelAllocation +from sim_types import Objective + +from models import FluxModelAllocation +from models import GemmaModelAllocation +from models import HFModelAllocation +from models import HFVAEModelAllocation +from models import FTModelAllocation +from models import FTVAEModelAllocation +from models import UpscalerModelAllocation +from models import OthersModelAllocation + +from evaluator import evaluate_model_allocation + +from .policies import NAIVE_POLICY +from .policies import MAX_DEVICES + +from model_allocator import ModelAllocator + + +class NaiveAllocator(ModelAllocator): + """ + Naive allocator that implements a simple heuristic. 
+ """ + def __init__( + self, + workflow: WorkflowConfig, + latency_data: LatencyData, + power_data: Optional[PowerData] = None, + policy: Policy = NAIVE_POLICY, + ) -> None: + super().__init__( + workflow, + latency_data, + power_data, + policy, + ) + assert self.policy.solver == Solver.NAIVE + assert self.policy.objective == Objective.TTFF + + def allocate( + self, + num_gpus: dict[GPUType, int], + verbose: bool = False, + ) -> Result: + total_gpus = sum(num_gpus.values()) + assert total_gpus >= 8, f"Total number of GPUs must be at least 8 ({num_gpus})" + + gpu_types = [ + gpu_type + for gpu_type, count in num_gpus.items() + if count > 0 + ] + assert 1 <= len(gpu_types) <= 2, f"Only up to two GPU types are supported ({len(gpu_types)})" + gpu_type1 = gpu_types[0] + + if len(gpu_types) == 1: + models = self._naive_single( + num_gpus.get(gpu_type1, 0), + gpu_type=gpu_type1, + ) + else: + # Mixed setup of GPU types (e.g., A100 and H100) + models = self._naive_two(num_gpus) + + result = evaluate_model_allocation( + models=models, + num_gpus=num_gpus, + workflow=self.workflow, + latency_data=self.latency_data, + power_data=self.power_data, + policy=self.policy, + round_up_cost_to_server=True, + ) + return result + + def _naive_single( + self, + num_gpus: int, + gpu_type: GPUType, + ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: + """Naive allocation for single GPU type.""" + return self._naive_parallelism_allocation(gpu_type, num_gpus) + + def _naive_two( + self, + num_gpus: dict[GPUType, int], + ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: + """Naive allocation for two GPU types.""" + gpu_types = list(num_gpus.keys()) + assert len(gpu_types) == 2 + assert len(num_gpus) == 2 + gpu_type1 = gpu_types[0] + gpu_type2 = gpu_types[1] + assert num_gpus[gpu_type1] >= NUM_GPUS_PER_SERVER[gpu_type1] + assert num_gpus[gpu_type2] >= NUM_GPUS_PER_SERVER[gpu_type2] + + # Initialize allocations with minimal setup + models: dict[GPUType, dict[Model, 
list[ModelAllocation]]] = { + gpu_type1: { # 3 x A100s (type1) + Model.GEMMA: [GemmaModelAllocation( + gpu_type=gpu_type1, + devices=1, replicas=1)], + Model.FLUX: [FluxModelAllocation( + gpu_type=gpu_type1, + devices=1, replicas=1)], + Model.HF: [], + Model.HF_VAE: [], + Model.FT: [], + Model.FT_VAE: [], + Model.UPSCALER: [], + Model.OTHERS: [OthersModelAllocation( + gpu_type=gpu_type1, + devices=1, replicas=1)], # + 1 for Kokoro/YOLO + }, + gpu_type2: { # 4 (+1) X H100 GPUs (type2) + Model.GEMMA: [], + Model.FLUX: [], + Model.HF: [HFModelAllocation( + gpu_type=gpu_type2, + devices=1, replicas=1)], + Model.HF_VAE: [HFVAEModelAllocation( + gpu_type=gpu_type2, + devices=1, replicas=1)], + Model.FT: [FTModelAllocation( + gpu_type=gpu_type2, + devices=2, replicas=1)], + Model.FT_VAE: [FTVAEModelAllocation( + gpu_type=gpu_type2, + devices=1, replicas=1)], + Model.UPSCALER: [UpscalerModelAllocation( + gpu_type=gpu_type2)], + Model.OTHERS: [], + }, + } + + # Calculate remaining: starting - assigned + if not self.policy.is_disaggregated(Model.HF): + models[gpu_type2][Model.HF][0].replicas = 2 + models[gpu_type2][Model.HF_VAE][0].replicas = 0 + if not self.policy.is_disaggregated(Model.FT): + models[gpu_type2][Model.FT_VAE][0].replicas = 0 + + if self.policy.use_upscaler: + models[gpu_type2][Model.UPSCALER][0].replicas = 1 + + models_gpu_type1 = self._naive_parallelism_allocation( + gpu_type1, + num_gpus.get(gpu_type1, 0), + ) + models_gpu_type2 = self._naive_parallelism_allocation( + gpu_type2, + num_gpus.get(gpu_type2, 0), + # Already allocated in first GPU type + skip_non_paralelizable_models=True, + ) + models[gpu_type1] = models_gpu_type1[gpu_type1] + models[gpu_type2] = models_gpu_type2[gpu_type2] + + # Apply per-GPU-type overrides after allocation + if self.policy.use_upscaler: + models[gpu_type2][Model.UPSCALER][0].replicas = 1 + + return models + + def _naive_parallelism_allocation( + self, + gpu_type: GPUType, + num_devices: int, + skip_non_paralelizable_models: 
bool = False, + ) -> dict[GPUType, dict[Model, list[ModelAllocation]]]: + """ + Device allocation for naive parallelism. + Max devices for each model. + Allocate devices to each model proportional to their max devices. + """ + models: dict[GPUType, dict[Model, list[ModelAllocation]]] = { + gpu_type: { + Model.GEMMA: [GemmaModelAllocation( + gpu_type=gpu_type, + replicas=1)], + Model.FLUX: [FluxModelAllocation( + gpu_type=gpu_type, + replicas=1)], + Model.HF: [HFModelAllocation( + gpu_type=gpu_type, + replicas=1)], + Model.HF_VAE: [HFVAEModelAllocation( + gpu_type=gpu_type, + replicas=1 if self.policy.is_disaggregated(Model.HF) else 0)], + Model.FT: [FTModelAllocation( + gpu_type=gpu_type, + replicas=4)], + Model.FT_VAE: [FTVAEModelAllocation( + gpu_type=gpu_type, + replicas=1 if self.policy.is_disaggregated(Model.FT) else 0)], + Model.OTHERS: [OthersModelAllocation( + gpu_type=gpu_type, + replicas=1)], # + 1 for Kokoro/YOLO + Model.UPSCALER: [UpscalerModelAllocation( + gpu_type=gpu_type, + replicas=1 if self.policy.use_upscaler else 0)], + }, + } + + # Zero out replicas for models not in workflow + for model in Model: + if model not in self.workflow.models: + for alloc in models[gpu_type][model]: + alloc.replicas = 0 + + # Zero out replicas for models that are not parallelizable when skip_non_paralelizable_models is True + if skip_non_paralelizable_models: + for model in Model: + if not self.workflow.is_parallelizable(model): + for alloc in models[gpu_type][model]: + alloc.replicas = 0 + + # Assert only 1 allocation instance per model for naive parallelism + for model in Model: + assert len(models[gpu_type][model]) == 1, \ + f"Expected only 1 allocation instance for {model}, got {len(models[gpu_type][model])}" + + alloc_id = 0 + model_gemma = models[gpu_type][Model.GEMMA][alloc_id] + model_flux = models[gpu_type][Model.FLUX][alloc_id] + model_hf = models[gpu_type][Model.HF][alloc_id] + model_vae = models[gpu_type][Model.HF_VAE][alloc_id] + model_ft = 
models[gpu_type][Model.FT][alloc_id] + model_ft_vae = models[gpu_type][Model.FT_VAE][alloc_id] + model_upscaler = models[gpu_type][Model.UPSCALER][alloc_id] + + # TODO do we need to do something for Model.OTHERS + + if num_devices == 8: + # single server case, use fixed allocation + if Model.FT in self.workflow.models: + model_ft.replicas = 4 + if self.policy.use_upscaler and Model.UPSCALER in self.workflow.models: + model_upscaler.replicas = 1 + if Model.FT in self.workflow.models: + model_ft.replicas -= 1 + if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models: + model_vae.replicas = 1 + if Model.FT in self.workflow.models: + model_ft.replicas -= 1 + if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models: + model_ft_vae.replicas = 1 + if Model.FT in self.workflow.models: + model_ft.replicas -= 1 + return models + + init_num_devices = sum([ + model[0].devices * model[0].replicas + for model in models[gpu_type].values() + ]) + + # Allocate devices proportional to each model's max devices + max_devices = MAX_DEVICES + models_in_workflow = [ + model + for model in max_devices.keys() + if model in self.workflow.models + ] + if skip_non_paralelizable_models: + for model in max_devices.keys(): + if not self.workflow.is_parallelizable(model): + models_in_workflow.remove(model) + + total_max_devices = sum([ + max_devices[model] + for model in models_in_workflow + ]) + for model in models_in_workflow: + # Calculate the number of devices to allocate for the model, proportional to its max devices among others + alloc_devices = int((num_devices - init_num_devices) * max_devices[model] / total_max_devices) + if model == Model.GEMMA: + max_devices_gemma = max_devices[Model.GEMMA] + if self.latency_data: + max_devices_gemma = min(max_devices_gemma, self.latency_data[gpu_type].get_max_parallelism(model)) + model_gemma.devices += min(alloc_devices, max_devices_gemma) + # Round down nearest in DEVICE_OPTIONS_GEMMA + 
num_gemma_devices = max([ + d + for d in DEVICE_OPTIONS[Model.GEMMA] + if d <= model_gemma.devices + ]) + model_gemma.devices = num_gemma_devices + elif model == Model.FLUX: + max_devices_flux = max_devices[Model.FLUX] + if self.latency_data: + max_devices_flux = min(max_devices_flux, self.latency_data[gpu_type].get_max_parallelism(model)) + model_flux.devices += min(alloc_devices, max_devices_flux) + # Round down nearest in DEVICE_OPTIONS_FLUX + model_flux.devices = max([ + d + for d in DEVICE_OPTIONS[Model.FLUX] + if d <= model_flux.devices + ]) + elif model == Model.HF: + max_devices_hf = max_devices[Model.HF] + if self.latency_data: + max_devices_hf = min(max_devices_hf, self.latency_data[gpu_type].get_max_parallelism(model)) + model_hf.replicas += min(alloc_devices, max_devices_hf) + elif model == Model.HF_VAE: + if self.policy.is_disaggregated(Model.HF): + max_devices_vae = max_devices[Model.HF_VAE] + if self.latency_data: + max_devices_vae = min(max_devices_vae, self.latency_data[gpu_type].get_max_parallelism(model)) + model_vae.replicas += min(alloc_devices, max_devices_vae) + elif model == Model.FT: + max_devices_ft = max_devices[Model.FT] + if self.latency_data: + max_devices_ft = min(max_devices_ft, self.latency_data[gpu_type].get_max_parallelism(model)) + model_ft.replicas += min(alloc_devices, max_devices_ft) + elif model == Model.FT_VAE: + if self.policy.is_disaggregated(Model.FT): + max_devices_ft_vae = max_devices[Model.FT_VAE] + if self.latency_data: + max_devices_ft_vae = min( + max_devices_ft_vae, self.latency_data[gpu_type].get_max_parallelism(model) + ) + model_ft_vae.replicas += min(alloc_devices, max_devices_ft_vae) + else: + raise ValueError(f"Unrecognized model {model}") + + remaining_devices = num_devices + for model_name in models[gpu_type].keys(): + for model_alloc in models[gpu_type][model_name]: + remaining_devices -= model_alloc.get_num_gpus() + + # Distribute remaining devices to parallelizable models + distribute_models = 
self.workflow.filter_parallelizable_models( + models_in_workflow, + disaggregation=self.policy.disaggregation, + ) + # Prioritise models that already hold more GPUs + distribute_models.sort( + key=lambda m: models[gpu_type][m][alloc_id].get_num_gpus(), + reverse=True, + ) + num_distribute = len(distribute_models) + if num_distribute > 0 and remaining_devices > 0: + made_progress = True + while remaining_devices > 0 and made_progress: + made_progress = False + for model_name in distribute_models: + gpus_per_replica = models[gpu_type][model_name][alloc_id].devices + if gpus_per_replica <= 0 or remaining_devices < gpus_per_replica: + continue + models[gpu_type][model_name][alloc_id].replicas += 1 + remaining_devices -= gpus_per_replica + made_progress = True + if remaining_devices <= 0: + break + + remaining_devices = num_devices + for model_name in models[gpu_type].keys(): + for model_alloc in models[gpu_type][model_name]: + remaining_devices -= model_alloc.get_num_gpus() + + # TODO we should try to assign all resources + # assert remaining_devices == 0, \ + assert remaining_devices >= 0, \ + f"remaining={remaining_devices} != 0: " \ + f"gpu={gpu_type.value} total={num_devices} remaining={remaining_devices}" + + # Update replicas based on total devices + # Gemma (when parallelizable) + if self.workflow.is_parallelizable(Model.GEMMA) and Model.GEMMA in models_in_workflow: + model_gemma.devices, model_gemma.replicas, remaining_devices = _calculate_naive_num_devices( + model_gemma.devices, + model_gemma.replicas, + remaining_devices, + device_options=DEVICE_OPTIONS[Model.GEMMA], + replica_upper_bound=self.workflow.total_scenes) + + # Flux (when parallelizable) + if self.workflow.is_parallelizable(Model.FLUX) and Model.FLUX in models_in_workflow: + model_flux.devices, model_flux.replicas, remaining_devices = _calculate_naive_num_devices( + model_flux.devices, + model_flux.replicas, + remaining_devices, + device_options=DEVICE_OPTIONS[Model.FLUX], + 
replica_upper_bound=self.workflow.total_scenes) + + # Hunyuan FramePack + if Model.HF in self.workflow.models: + model_hf.devices, model_hf.replicas, remaining_devices = _calculate_naive_num_devices( + model_hf.devices, + model_hf.replicas, + remaining_devices, + device_options=DEVICE_OPTIONS[Model.HF], + replica_upper_bound=self.workflow.total_scenes) + + # Hunyuan FramePack VAE + if self.policy.is_disaggregated(Model.HF) and Model.HF_VAE in self.workflow.models: + model_vae.devices, model_vae.replicas, remaining_devices = _calculate_naive_num_devices( + model_vae.devices, + model_vae.replicas, + remaining_devices, + device_options=None, + replica_upper_bound=self.workflow.total_frames[Model.HF], + ) + + # Fantasy Talking + if Model.FT in self.workflow.models: + model_ft.devices, model_ft.replicas, remaining_devices = _calculate_naive_num_devices( + model_ft.devices, + model_ft.replicas, + remaining_devices, + device_options=DEVICE_OPTIONS[Model.FT], + replica_upper_bound=self.workflow.total_subscenes, + ) + + # Fantasy Talking VAE + if self.policy.is_disaggregated(Model.FT) and Model.FT_VAE in self.workflow.models: + model_ft_vae.devices, model_ft_vae.replicas, remaining_devices = _calculate_naive_num_devices( + model_ft_vae.devices, + model_ft_vae.replicas, + remaining_devices, + device_options=None, + replica_upper_bound=self.workflow.total_frames[Model.FT], + ) + + return models + + +def _calculate_naive_num_devices( + num_devices: int, + num_replicas: int, + remaining_devices: int, + device_options: Optional[list[int]] = [1], + replica_upper_bound: Optional[int] = None, +) -> tuple[int, int, int]: + """Find the parallelism that maximizes the device usage.""" + assert remaining_devices >= 0 + + model_quota = num_devices * num_replicas + + if device_options: + best_product = 0 + best_devices_per_replica = 1 + best_replicas = 1 + for devices_per_replica in device_options: + if devices_per_replica > model_quota: + continue + max_replicas = model_quota // 
devices_per_replica + if replica_upper_bound and max_replicas > replica_upper_bound: + max_replicas = replica_upper_bound + product = devices_per_replica * max_replicas + if product > best_product: + best_product = product + best_devices_per_replica = devices_per_replica + best_replicas = max_replicas + else: + # start with parallelism=1 instead + best_devices_per_replica = 1 + best_replicas = model_quota + + num_devices = best_devices_per_replica + num_replicas = best_replicas + remaining_devices += model_quota - num_replicas * num_devices + + return num_devices, num_replicas, remaining_devices diff --git a/streamwise/model_provisioner/policies.py b/streamwise/model_provisioner/policies.py new file mode 100644 index 00000000..3f670f93 --- /dev/null +++ b/streamwise/model_provisioner/policies.py @@ -0,0 +1,252 @@ +from __future__ import annotations + +from sim_types import Objective +from sim_types import Policy +from sim_types import GPUType +from sim_types import Model +from sim_types import Solver + +from constants import GPU_RESERVED_COST +from constants import GPU_SPOT_COST + + +# Max devices for each model +# the logic is to allocate devices to each model proportional to their max devices +MAX_DEVICES = { + Model.GEMMA: 8, + Model.FLUX: 16, + Model.HF: 40, + Model.HF_VAE: 1, + Model.FT: 40, + Model.FT_VAE: 1, +} + +# Max iterations for the optimization loop to prevent infinite loops in case of non-monotonic allocators or other issues +MAX_ITERATIONS = 100 + +# Set to True if we want to use up all GPUs if there's no further improvements in the greedy optimization loop +USE_ALL_GPUS = True + +# Default StreamWise policy configuration +# TODO: Add a meta policy that picks the best among disaggregation options for HF/FT +STREAMWISE_POLICY = Policy( + name="streamwise", + gpu_cost=GPU_SPOT_COST, + objective=Objective.TTFF_COST, + disaggregation={ + Model.HF: True, + Model.FT: False, + }, + use_upscaler=True, + hardware=list(GPUType), +) + +STREAMWISE_MILP_POLICY = 
"""
HexGen policy configuration.
"""
# HexGen baseline: reserved GPU pricing, TTFF-only objective, HF
# disaggregated (FT not), no upscaler, an explicit list of four GPU types,
# solved with Solver.HEXGEN.
HEXGEN_POLICY = Policy(
    name="hexgen",
    gpu_cost=GPU_RESERVED_COST,
    objective=Objective.TTFF,  # Does not account for cost
    disaggregation={
        Model.HF: True,
        Model.FT: False,
    },  # Disaggregation
    use_upscaler=False,
    hardware=[  # Multiple hardware
        GPUType.A100,
        GPUType.H100,
        GPUType.H200,
        GPUType.GB200,
    ],
    solver=Solver.HEXGEN,
)


"""
Helix policy configuration.
Reference: https://github.com/Thesys-lab/Helix-ASPLOS25
Optimizes models one-by-one following MODEL_ORDER using MILP.
"""
# Helix baseline: same pricing/objective/disaggregation as HexGen above, but
# every GPUType is allowed and Solver.HELIX is used.
HELIX_POLICY = Policy(
    name="helix",
    gpu_cost=GPU_RESERVED_COST,
    objective=Objective.TTFF,  # Does not account for cost
    disaggregation={
        Model.HF: True,
        Model.FT: False,
    },
    use_upscaler=False,
    hardware=list(GPUType),
    solver=Solver.HELIX,
)


"""
DDiT policy configuration.
Reference: https://arxiv.org/html/2506.13497v1
"""
# DDiT baseline: configured like Helix except that it runs on Solver.NAIVE.
DDIT_POLICY = Policy(
    name="ddit",
    gpu_cost=GPU_RESERVED_COST,
    objective=Objective.TTFF,
    disaggregation={
        Model.HF: True,
        Model.FT: False,
    },
    use_upscaler=False,
    hardware=list(GPUType),
    solver=Solver.NAIVE,
)


# Energy-oriented StreamWise variant: spot pricing, TIME_ENERGY objective,
# upscaler enabled. No solver is passed, so Policy's default solver applies.
STREAMWISE_ENERGY_POLICY = Policy(
    name="streamwise energy",
    gpu_cost=GPU_SPOT_COST,
    objective=Objective.TIME_ENERGY,
    disaggregation={
        Model.HF: True,
        Model.FT: False,
    },
    use_upscaler=True,
    hardware=list(GPUType),
)

# Plain naive baseline: reserved pricing, TTFF objective, no disaggregation,
# no upscaler, A100 only, Solver.NAIVE. The variants below start from this.
NAIVE_POLICY = Policy(
    name="naive",
    gpu_cost=GPU_RESERVED_COST,
    objective=Objective.TTFF,
    disaggregation={},
    use_upscaler=False,
    hardware=[GPUType.A100],
    solver=Solver.NAIVE,
)


# Ablation variants of NAIVE_POLICY, keyed by policy name. Each entry repeats
# the naive settings and changes the field flagged by its inline comment;
# "naive disag" turns on disaggregation for both HF and FT, and
# "naive ttff*cost allocator" additionally switches to Solver.GREEDY.
# NOTE(review): the first positional argument is presumably Policy's `name`
# and the second its `gpu_cost` - confirm against the Policy definition.
BASELINE_POLICIES = {
    "naive": NAIVE_POLICY,
    "naive disag": Policy(
        "naive disag",
        gpu_cost=GPU_RESERVED_COST,
        objective=Objective.TTFF,
        disaggregation={
            Model.HF: True,
            Model.FT: True,
        },
        use_upscaler=False,
        hardware=[GPUType.A100],
        solver=Solver.NAIVE,
    ),
    "naive upscaler": Policy(
        "naive upscaler",
        gpu_cost=GPU_RESERVED_COST,
        objective=Objective.TTFF,
        disaggregation={},
        use_upscaler=True,  # Changed to True
        hardware=[GPUType.A100],
        solver=Solver.NAIVE,
    ),
    "naive spot": Policy(
        "naive spot",
        gpu_cost=GPU_SPOT_COST,  # Changed to SPOT_COST
        objective=Objective.TTFF,
        disaggregation={},
        use_upscaler=False,
        hardware=[GPUType.A100],
        solver=Solver.NAIVE,
    ),
    "naive ttff*cost allocator": Policy(
        "naive ttff*cost allocator",
        GPU_RESERVED_COST,
        objective=Objective.TTFF_COST,  # Changed to TTFF_COST
        disaggregation={},
        use_upscaler=False,
        hardware=[GPUType.A100],
        solver=Solver.GREEDY,
    ),
    "naive hardware": Policy(
        "naive hardware",
        GPU_RESERVED_COST,
        objective=Objective.TTFF,
        disaggregation={},
        use_upscaler=False,
        hardware=list(GPUType),  # Changed hardware
        solver=Solver.NAIVE,
    ),
}
"""
Conftest for simulator tests.

Sets PYTHONPATH so that child processes spawned by ProcessPoolExecutor
can find the simulator and streamwise modules.
"""
import os
import sys

_REPO_ROOT = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
_SIMULATOR_DIR = os.path.join(_REPO_ROOT, "simulator")
_STREAMWISE_DIR = os.path.join(_REPO_ROOT, "streamwise")
_REQUIRED_DIRS = (_REPO_ROOT, _SIMULATOR_DIR, _STREAMWISE_DIR)

# Propagate paths to child processes via PYTHONPATH.
# Each required directory is compared against the individual PYTHONPATH
# entries. A plain substring test on the whole variable would treat
# "<repo>/simulator_old" as if "<repo>/simulator" were present, and checking
# only the simulator directory would skip the other two when it alone is set.
_EXISTING = os.environ.get("PYTHONPATH", "")
_PRESENT = {os.path.normpath(_e) for _e in _EXISTING.split(os.pathsep) if _e}
_EXTRA = os.pathsep.join(_d for _d in _REQUIRED_DIRS if _d not in _PRESENT)
if _EXTRA:
    # The pre-existing value is kept verbatim behind the new entries.
    os.environ["PYTHONPATH"] = (
        _EXTRA + os.pathsep + _EXISTING if _EXISTING else _EXTRA
    )

# Make the same directories importable in the current process. Inserting at
# index 0 one by one leaves the last directory first on sys.path.
for _p in _REQUIRED_DIRS:
    if _p not in sys.path:
        sys.path.insert(0, _p)
--------------------------------------------------------------------------- diff --git a/tests/simulator/test_data_loading.py b/tests/simulator/test_data_loading.py index de883d35..72337375 100644 --- a/tests/simulator/test_data_loading.py +++ b/tests/simulator/test_data_loading.py @@ -12,11 +12,11 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import QualityLevel + from sim_types import QualityLevel - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data - from model_provisioner.data_loading import load_adaptive_quality_data + from data_loading import load_latency_data + from data_loading import load_power_data + from data_loading import load_adaptive_quality_data def test_latency() -> None: diff --git a/tests/simulator/test_evaluator.py b/tests/simulator/test_evaluator.py index 6f3a5aa7..b3c37e73 100644 --- a/tests/simulator/test_evaluator.py +++ b/tests/simulator/test_evaluator.py @@ -9,28 +9,28 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.constants import SECONDS_IN_HOUR + from constants import DEFAULT_WORKFLOW_CONFIG + from constants import SECONDS_IN_HOUR - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model + from sim_types import GPUType + from sim_types import Model - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data - from model_provisioner.evaluator import evaluate_model_allocation + from evaluator import evaluate_model_allocation from model_provisioner.policies import STREAMWISE_POLICY - from model_provisioner.models import FluxModelAllocation - from 
model_provisioner.models import GemmaModelAllocation - from model_provisioner.models import HFModelAllocation - from model_provisioner.models import HFVAEModelAllocation - from model_provisioner.models import FTModelAllocation - from model_provisioner.models import UpscalerModelAllocation - from model_provisioner.models import OthersModelAllocation + from models import FluxModelAllocation + from models import GemmaModelAllocation + from models import HFModelAllocation + from models import HFVAEModelAllocation + from models import FTModelAllocation + from models import UpscalerModelAllocation + from models import OthersModelAllocation - from model_provisioner.utils import to_models_df + from utils import to_models_df def test_empty() -> None: diff --git a/tests/simulator/test_greedy.py b/tests/simulator/test_greedy.py index 786cc2c2..bfa2996e 100644 --- a/tests/simulator/test_greedy.py +++ b/tests/simulator/test_greedy.py @@ -9,17 +9,17 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.constants import SECONDS_IN_HOUR + from constants import DEFAULT_WORKFLOW_CONFIG + from constants import SECONDS_IN_HOUR - from model_provisioner.workflows import WORKFLOWS + from workflows import WORKFLOWS - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import QualityLevel - from model_provisioner.sim_types import WorkflowConfig + from sim_types import GPUType + from sim_types import QualityLevel + from sim_types import WorkflowConfig - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data from model_provisioner.greedy import GreedyAllocator diff --git a/tests/simulator/test_helix.py b/tests/simulator/test_helix.py index 06ec8f3a..7261b902 100644 --- 
a/tests/simulator/test_helix.py +++ b/tests/simulator/test_helix.py @@ -13,13 +13,13 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import MODEL_ORDER - from model_provisioner.sim_types import Solver - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from constants import DEFAULT_WORKFLOW_CONFIG + from sim_types import GPUType + from sim_types import Model + from sim_types import MODEL_ORDER + from sim_types import Solver + from data_loading import load_latency_data + from data_loading import load_power_data from model_provisioner.helix import HelixAllocator from model_provisioner.policies import HELIX_POLICY diff --git a/tests/simulator/test_hexgen.py b/tests/simulator/test_hexgen.py index 3317a82e..3d77867b 100644 --- a/tests/simulator/test_hexgen.py +++ b/tests/simulator/test_hexgen.py @@ -8,12 +8,12 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.sim_types import GPUType - from model_provisioner.data_loading import load_latency_data + from constants import DEFAULT_WORKFLOW_CONFIG + from sim_types import GPUType + from data_loading import load_latency_data from model_provisioner.hexgen import HexGenAllocator from model_provisioner.hexgen import _get_model_order - from model_provisioner.sim_types import MODEL_ORDER + from sim_types import MODEL_ORDER def test_get_model_order() -> None: diff --git a/tests/simulator/test_milp.py b/tests/simulator/test_milp.py index 9b0e909e..52a308bd 100644 --- a/tests/simulator/test_milp.py +++ b/tests/simulator/test_milp.py @@ -14,28 +14,28 @@ from tests.test_utils import 
temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import LatencyData - from model_provisioner.sim_types import PowerData - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Objective - from model_provisioner.sim_types import Solver - from model_provisioner.sim_types import QualityLevel + from sim_types import LatencyData + from sim_types import PowerData + from sim_types import GPUType + from sim_types import Objective + from sim_types import Solver + from sim_types import QualityLevel - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.constants import SECONDS_IN_HOUR + from constants import DEFAULT_WORKFLOW_CONFIG + from constants import SECONDS_IN_HOUR from model_provisioner.policies import STREAMWISE_MILP_POLICY - from model_provisioner.workflows import WORKFLOWS + from workflows import WORKFLOWS from model_provisioner.milp import MILPAllocator - from model_provisioner.evaluator import evaluate_model_allocation + from evaluator import evaluate_model_allocation - from model_provisioner.utils import to_models_df + from utils import to_models_df def test_base() -> None: diff --git a/tests/simulator/test_models.py b/tests/simulator/test_models.py index c0171d99..eccb449b 100644 --- a/tests/simulator/test_models.py +++ b/tests/simulator/test_models.py @@ -17,33 +17,33 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import ModelAllocation - from model_provisioner.sim_types import QualityLevel - from model_provisioner.sim_types import LatencyData - from 
model_provisioner.sim_types import PowerData + from sim_types import GPUType + from sim_types import Model + from sim_types import ModelAllocation + from sim_types import QualityLevel + from sim_types import LatencyData + from sim_types import PowerData - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data from model_provisioner.policies import STREAMWISE_POLICY from model_provisioner.policies import NAIVE_POLICY - from model_provisioner.models import get_model_allocation - from model_provisioner.models import _calculate_total_time - from model_provisioner.models import assert_pixel_config - from model_provisioner.models import _MODEL_ALLOCATION_REGISTRY - from model_provisioner.models import GemmaModelAllocation - from model_provisioner.models import FluxModelAllocation - from model_provisioner.models import HFModelAllocation - from model_provisioner.models import HFVAEModelAllocation - from model_provisioner.models import FTModelAllocation - from model_provisioner.models import FTVAEModelAllocation - from model_provisioner.models import UpscalerModelAllocation - from model_provisioner.models import OthersModelAllocation + from models import get_model_allocation + from models import _calculate_total_time + from models import assert_pixel_config + from models import _MODEL_ALLOCATION_REGISTRY + from models import GemmaModelAllocation + from models import FluxModelAllocation + from models import HFModelAllocation + from models import HFVAEModelAllocation + from models import FTModelAllocation + from models import FTVAEModelAllocation + from models import UpscalerModelAllocation + from models import OthersModelAllocation # --------------------------------------------------------------------------- @@ 
-152,7 +152,7 @@ def test_assert_pixel_config() -> None: assert_pixel_config(DEFAULT_WORKFLOW_CONFIG) # Patching MEDIUM > HIGH violates the ordering constraint → AssertionError. - with patch.dict("model_provisioner.sim_types.RESOLUTION_PIXELS", + with patch.dict("sim_types.RESOLUTION_PIXELS", {QualityLevel.MEDIUM: 1000, QualityLevel.HIGH: 500}): with pytest.raises(AssertionError): assert_pixel_config(DEFAULT_WORKFLOW_CONFIG) diff --git a/tests/simulator/test_multirequests_derive.py b/tests/simulator/test_multirequests_derive.py index d5286121..c809ccd0 100644 --- a/tests/simulator/test_multirequests_derive.py +++ b/tests/simulator/test_multirequests_derive.py @@ -8,9 +8,9 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import QualityLevel + from sim_types import GPUType + from sim_types import Model + from sim_types import QualityLevel from multirequests import TIME_PER_REQ from multirequests import INIT_REPLICAS diff --git a/tests/simulator/test_simulator.py b/tests/simulator/test_simulator.py index d621cd33..d698bb9d 100644 --- a/tests/simulator/test_simulator.py +++ b/tests/simulator/test_simulator.py @@ -14,18 +14,18 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import WorkflowConfig - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import Objective - from model_provisioner.sim_types import GPUType + from sim_types import WorkflowConfig + from sim_types import Model + from sim_types import Objective + from sim_types import GPUType - from model_provisioner.constants import SECONDS_IN_HOUR - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from constants import SECONDS_IN_HOUR + from constants import DEFAULT_WORKFLOW_CONFIG - from 
model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data - from model_provisioner.auto_model_allocator import AutoModelAllocator + from auto_model_allocator import AutoModelAllocator from model_provisioner.greedy import GreedyAllocator from model_provisioner.policies import STREAMWISE_POLICY diff --git a/tests/simulator/test_simulator_actions.py b/tests/simulator/test_simulator_actions.py index 11efd7b2..539946c5 100644 --- a/tests/simulator/test_simulator_actions.py +++ b/tests/simulator/test_simulator_actions.py @@ -8,11 +8,11 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import Action - from model_provisioner.sim_types import ActionName - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import Result + from sim_types import Action + from sim_types import ActionName + from sim_types import GPUType + from sim_types import Model + from sim_types import Result def test_action() -> None: diff --git a/tests/simulator/test_simulator_baseline.py b/tests/simulator/test_simulator_baseline.py index 24749ffb..b195a1cf 100644 --- a/tests/simulator/test_simulator_baseline.py +++ b/tests/simulator/test_simulator_baseline.py @@ -12,18 +12,18 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model + from sim_types import GPUType + from sim_types import Model - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.constants import SECONDS_IN_HOUR - from model_provisioner.constants import POWER_GPU_IDLE - from model_provisioner.constants import POWER_GPU_TDP + from constants import DEFAULT_WORKFLOW_CONFIG + 
from constants import SECONDS_IN_HOUR + from constants import POWER_GPU_IDLE + from constants import POWER_GPU_TDP - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data - from model_provisioner.auto_model_allocator import AutoModelAllocator + from auto_model_allocator import AutoModelAllocator from model_provisioner.naive_baseline import NaiveAllocator from model_provisioner.greedy import GreedyAllocator @@ -31,8 +31,8 @@ from model_provisioner.policies import BASELINE_POLICIES from model_provisioner.policies import STREAMWISE_POLICY - from model_provisioner.workflows import SHORTS_WORKFLOW - from model_provisioner.workflows import WORKFLOWS + from workflows import SHORTS_WORKFLOW + from workflows import WORKFLOWS def test_baseline() -> None: diff --git a/tests/simulator/test_simulator_energy.py b/tests/simulator/test_simulator_energy.py index a739f698..c96fd128 100644 --- a/tests/simulator/test_simulator_energy.py +++ b/tests/simulator/test_simulator_energy.py @@ -10,17 +10,17 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from constants import DEFAULT_WORKFLOW_CONFIG - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import Objective - from model_provisioner.sim_types import Solver + from sim_types import GPUType + from sim_types import Model + from sim_types import Objective + from sim_types import Solver - from model_provisioner.data_loading import load_latency_data - from model_provisioner.data_loading import load_power_data + from data_loading import load_latency_data + from data_loading import load_power_data - from model_provisioner.auto_model_allocator import AutoModelAllocator + from auto_model_allocator 
import AutoModelAllocator from model_provisioner.greedy import GreedyAllocator from model_provisioner.naive_baseline import NaiveAllocator diff --git a/tests/simulator/test_simulator_multirequests.py b/tests/simulator/test_simulator_multirequests.py index 3d3e350a..6403baba 100644 --- a/tests/simulator/test_simulator_multirequests.py +++ b/tests/simulator/test_simulator_multirequests.py @@ -21,12 +21,12 @@ from multirequests import TIME_PER_REQ_ADAPTIVE from multirequests import get_time_per_request_baseline - from model_provisioner.data_loading import load_latency_data - from model_provisioner.workflows import PODCAST_WORKFLOW + from data_loading import load_latency_data + from workflows import PODCAST_WORKFLOW - from model_provisioner.constants import GPU_SPOT_COST - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import Model + from constants import GPU_SPOT_COST + from sim_types import GPUType + from sim_types import Model def test_multirequests() -> None: diff --git a/tests/simulator/test_simulator_plotutils.py b/tests/simulator/test_simulator_plotutils.py index 2d3b35e2..b3bdead9 100644 --- a/tests/simulator/test_simulator_plotutils.py +++ b/tests/simulator/test_simulator_plotutils.py @@ -14,10 +14,10 @@ from plot_utils import plot_cost_vs_qpm from plot_utils import _get_time_ticklabels - from model_provisioner.sim_types import ProvisioningResult - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import QualityLevel - from model_provisioner.sim_types import Model + from sim_types import ProvisioningResult + from sim_types import GPUType + from sim_types import QualityLevel + from sim_types import Model def test_plot_ttff_vs_cost() -> None: diff --git a/tests/simulator/test_simulator_policies.py b/tests/simulator/test_simulator_policies.py index 42bf69db..d9e1421f 100644 --- a/tests/simulator/test_simulator_policies.py +++ b/tests/simulator/test_simulator_policies.py @@ -15,7 +15,7 @@ from 
model_provisioner.policies import STREAMWISE_POLICY from model_provisioner.policies import BASELINE_POLICIES - from model_provisioner.sim_types import Objective + from sim_types import Objective def test_streamwise_policies() -> None: diff --git a/tests/simulator/test_simulator_provisioning.py b/tests/simulator/test_simulator_provisioning.py index d781bc2e..fb5d46fd 100644 --- a/tests/simulator/test_simulator_provisioning.py +++ b/tests/simulator/test_simulator_provisioning.py @@ -8,7 +8,7 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.constants import DEFAULT_WORKFLOW_CONFIG + from constants import DEFAULT_WORKFLOW_CONFIG from provisioning import get_provisioning_results from provisioning import get_provisioning_adaptive_results @@ -17,11 +17,11 @@ from provisioning import GPU_PROVISIONS from provisioning import GPU_PROVISIONS_SHORT - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import QualityLevel - from model_provisioner.sim_types import Solver + from sim_types import GPUType + from sim_types import QualityLevel + from sim_types import Solver - from model_provisioner.data_loading import load_latency_data + from data_loading import load_latency_data from model_provisioner.policies import NAIVE_POLICY from model_provisioner.policies import STREAMWISE_POLICY diff --git a/tests/simulator/test_simulator_types.py b/tests/simulator/test_simulator_types.py index 9e2384ed..223a3260 100644 --- a/tests/simulator/test_simulator_types.py +++ b/tests/simulator/test_simulator_types.py @@ -9,20 +9,20 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import GPUType + from sim_types import Model + from sim_types import GPUType - from model_provisioner.sim_types_json import models_to_json - from model_provisioner.sim_types_json import 
workflow_to_json - from model_provisioner.sim_types_json import policy_to_json - from model_provisioner.sim_types_json import model_list_to_json + from sim_types_json import models_to_json + from sim_types_json import workflow_to_json + from sim_types_json import policy_to_json + from sim_types_json import model_list_to_json - from model_provisioner.models import GemmaModelAllocation - from model_provisioner.models import FluxModelAllocation + from models import GemmaModelAllocation + from models import FluxModelAllocation from model_provisioner.policies import STREAMWISE_POLICY - from model_provisioner.workflows import PODCAST_WORKFLOW + from workflows import PODCAST_WORKFLOW def test_serialize_models() -> None: diff --git a/tests/simulator/test_simulator_utils.py b/tests/simulator/test_simulator_utils.py index e1575d9a..b78d675d 100644 --- a/tests/simulator/test_simulator_utils.py +++ b/tests/simulator/test_simulator_utils.py @@ -7,18 +7,18 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import Model - from model_provisioner.sim_types import GPUType - from model_provisioner.sim_types import ModelAllocation - from model_provisioner.sim_types import ProvisioningResult - - from model_provisioner.utils import get_pareto_frontier - from model_provisioner.utils import find_most_cost_effective_provisioning - from model_provisioner.utils import find_most_energy_efficient_provisioning - from model_provisioner.utils import find_pareto_frontier - from model_provisioner.utils import coalesce_models - - from model_provisioner.models import FTModelAllocation + from sim_types import Model + from sim_types import GPUType + from sim_types import ModelAllocation + from sim_types import ProvisioningResult + + from utils import get_pareto_frontier + from utils import find_most_cost_effective_provisioning + from utils import find_most_energy_efficient_provisioning + from utils import find_pareto_frontier 
+ from utils import coalesce_models + + from models import FTModelAllocation def test_get_pareto_frontier() -> None: diff --git a/tests/simulator/test_workflows.py b/tests/simulator/test_workflows.py index 19a7ff0c..b38dc2ab 100644 --- a/tests/simulator/test_workflows.py +++ b/tests/simulator/test_workflows.py @@ -16,8 +16,8 @@ from tests.test_utils import temp_sys_path with temp_sys_path("simulator", "streamwise"): - from model_provisioner.sim_types import WorkflowConfig, Model, QualityLevel, GPUType - from model_provisioner.constants import ( + from sim_types import WorkflowConfig, Model, QualityLevel, GPUType + from constants import ( FPS, FRAMES_OPTIONS, FRAMES_PER_STEP_IDX, @@ -26,10 +26,10 @@ SECONDS_IN_MINUTE, TOTAL_INPUT_TOKENS, ) - from model_provisioner.data_loading import load_latency_data - from model_provisioner.auto_model_allocator import AutoModelAllocator + from data_loading import load_latency_data + from auto_model_allocator import AutoModelAllocator from model_provisioner.policies import STREAMWISE_POLICY, NAIVE_POLICY - from model_provisioner.workflows import ( + from workflows import ( MAX_FT_FRAMES, SUBSCENE_SECONDS, SUBSCENES_PER_SCENE, diff --git a/tests/streamwise/test_allocator_bridge.py b/tests/streamwise/test_allocator_bridge.py new file mode 100644 index 00000000..bd45f8a6 --- /dev/null +++ b/tests/streamwise/test_allocator_bridge.py @@ -0,0 +1,280 @@ +""" +Tests for streamwise/allocator_bridge.py. + +Covers: +- Model-to-container name mapping. +- Result to deployment specs conversion. +- run_allocator end-to-end (with real latency data). +- Error handling for invalid inputs. 
+""" + +from __future__ import annotations + +import sys +import os + +import pytest + +# Add current path +sys.path.append(os.getcwd()) + +from tests.test_utils import temp_sys_path + +with temp_sys_path("streamwise", "simulator"): + from allocator_bridge import ( + MODEL_TO_CONTAINERS, + CONTAINER_RESOURCES, + GPU_TYPE_TO_POD_STR, + APP_TO_WORKFLOW, + DeploymentSpec, + DeploymentPlan, + get_available_workflows, + get_available_gpu_types, + result_to_deployment_specs, + deployment_plan_to_json, + run_allocator, + ) + from sim_types import GPUType, Model, Result + from models import ( + GemmaModelAllocation, + FluxModelAllocation, + HFModelAllocation, + HFVAEModelAllocation, + FTModelAllocation, + OthersModelAllocation, + UpscalerModelAllocation, + ) + + +# --------------------------------------------------------------------------- +# Mapping correctness +# --------------------------------------------------------------------------- + +def test_model_to_containers_covers_all_models() -> None: + """Every Model enum value must have a mapping entry.""" + for model in Model: + assert model in MODEL_TO_CONTAINERS, f"Missing mapping for {model}" + + +def test_container_resources_covers_all_mapped_containers() -> None: + """Every container referenced in MODEL_TO_CONTAINERS must have resource defaults.""" + for model, containers in MODEL_TO_CONTAINERS.items(): + for container in containers: + assert container in CONTAINER_RESOURCES, ( + f"Missing CONTAINER_RESOURCES for '{container}' (from {model})") + + +def test_gpu_type_to_pod_str_covers_all_gpu_types() -> None: + """Every GPUType enum value must have a pod string mapping.""" + for gpu_type in GPUType: + assert gpu_type in GPU_TYPE_TO_POD_STR + + +def test_app_to_workflow_has_expected_entries() -> None: + """Key StreamWise apps should map to workflows.""" + assert "streamcast" in APP_TO_WORKFLOW + assert "streampersona" in APP_TO_WORKFLOW + assert "streamchat" in APP_TO_WORKFLOW + + +# 
--------------------------------------------------------------------------- +# Utility functions +# --------------------------------------------------------------------------- + +def test_get_available_workflows() -> None: + workflows = get_available_workflows() + assert isinstance(workflows, list) + assert "streamcast" in workflows + assert len(workflows) >= 5 + + +def test_get_available_gpu_types() -> None: + gpu_types = get_available_gpu_types() + assert isinstance(gpu_types, list) + assert "A100" in gpu_types + assert "H100" in gpu_types + + +# --------------------------------------------------------------------------- +# result_to_deployment_specs +# --------------------------------------------------------------------------- + +def test_result_to_deployment_specs_basic() -> None: + """A simple result with one active allocation maps to the right container.""" + models = { + GPUType.A100: { + Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)], + Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.A100, devices=2, replicas=1)], + Model.HF: [HFModelAllocation(gpu_type=GPUType.A100, devices=2, replicas=2)], + Model.HF_VAE: [HFVAEModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)], + Model.FT: [FTModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)], + Model.FT_VAE: [], + Model.UPSCALER: [UpscalerModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)], + Model.OTHERS: [OthersModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=1)], + } + } + result = Result( + total_time_s=100.0, + ttff_s=10.0, + cost=1.0, + gpus_used={GPUType.A100: 8}, + gpus_total={GPUType.A100: 8}, + models=models, + ) + + specs = result_to_deployment_specs(result) + assert isinstance(specs, list) + assert len(specs) > 0 + + container_names = [s.container_name for s in specs] + assert "gemma" in container_names + assert "flux" in container_names + assert "hunyuanframepackf1" in container_names # HF model + assert "hunyuanframepackvae" 
in container_names # HF_VAE model + + # OTHERS maps to kokoro + yolo + assert "kokoro" in container_names + assert "yolo" in container_names + + # Check GPU type mapping + gemma_spec = next(s for s in specs if s.container_name == "gemma") + assert gemma_spec.gpu_type == "a100" + assert gemma_spec.gpu == 1 + + # MIG containers get mig_profile set + kokoro_spec = next(s for s in specs if s.container_name == "kokoro") + assert kokoro_spec.mig_profile == "1g.10gb" + + +def test_result_to_deployment_specs_skips_zero_replicas() -> None: + """Allocations with zero replicas should not produce deployment specs.""" + models = { + GPUType.A100: { + Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)], + Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)], + Model.HF: [HFModelAllocation(gpu_type=GPUType.A100, devices=1, replicas=0)], + Model.HF_VAE: [], + Model.FT: [], + Model.FT_VAE: [], + Model.UPSCALER: [], + Model.OTHERS: [], + } + } + result = Result( + total_time_s=0.0, + ttff_s=0.0, + cost=0.0, + gpus_used={GPUType.A100: 0}, + gpus_total={GPUType.A100: 8}, + models=models, + ) + specs = result_to_deployment_specs(result) + assert specs == [] + + +def test_result_to_deployment_specs_multiple_replicas() -> None: + """Multiple replicas should produce multiple deployment specs for same container.""" + models = { + GPUType.H100: { + Model.GEMMA: [GemmaModelAllocation(gpu_type=GPUType.H100, devices=1, replicas=1)], + Model.FLUX: [FluxModelAllocation(gpu_type=GPUType.H100, devices=1, replicas=1)], + Model.HF: [HFModelAllocation(gpu_type=GPUType.H100, devices=2, replicas=3)], + Model.HF_VAE: [], + Model.FT: [], + Model.FT_VAE: [], + Model.UPSCALER: [], + Model.OTHERS: [], + } + } + result = Result( + total_time_s=50.0, + ttff_s=5.0, + cost=0.5, + gpus_used={GPUType.H100: 8}, + gpus_total={GPUType.H100: 16}, + models=models, + ) + specs = result_to_deployment_specs(result) + hf_specs = [s for s in specs if s.container_name 
== "hunyuanframepackf1"] + assert len(hf_specs) == 3 # 3 replicas + for spec in hf_specs: + assert spec.gpu == 2 + assert spec.gpu_type == "h100" + + +# --------------------------------------------------------------------------- +# deployment_plan_to_json +# --------------------------------------------------------------------------- + +def test_deployment_plan_to_json() -> None: + """Serialization should produce all expected keys.""" + result = Result( + total_time_s=100.0, + ttff_s=10.0, + cost=1.5, + gpus_used={GPUType.A100: 8}, + gpus_total={GPUType.A100: 8}, + models={}, + ) + plan = DeploymentPlan( + specs=[ + DeploymentSpec( + container_name="gemma", cpu=16, memory_gib=192, + ephemeral_storage_gib=64, gpu=2, gpu_type="a100", mig_profile=None) + ], + result=result, + workflow_name="streamcast", + gpu_budget={"A100": 8}, + ) + data = deployment_plan_to_json(plan) + assert data["workflow_name"] == "streamcast" + assert data["gpu_budget"] == {"A100": 8} + assert data["metrics"]["total_time_s"] == 100.0 + assert data["metrics"]["ttff_s"] == 10.0 + assert len(data["specs"]) == 1 + assert data["specs"][0]["container_name"] == "gemma" + + +# --------------------------------------------------------------------------- +# run_allocator (integration with real data) +# --------------------------------------------------------------------------- + +def test_run_allocator_streamcast_8_a100() -> None: + """Run allocator for StreamCast with 8 A100s — should produce a valid plan.""" + plan = run_allocator( + gpu_budget={"A100": 8}, + workflow_name="streamcast", + ) + assert isinstance(plan, DeploymentPlan) + assert len(plan.specs) > 0 + assert plan.result.total_time_s > 0 + assert plan.result.ttff_s > 0 + assert plan.workflow_name == "streamcast" + + +def test_run_allocator_streamchat_8_h100() -> None: + """Run allocator for StreamChat with 8 H100s.""" + plan = run_allocator( + gpu_budget={"H100": 8}, + workflow_name="streamchat", + ) + assert isinstance(plan, DeploymentPlan) + 
assert len(plan.specs) > 0 + + +def test_run_allocator_invalid_workflow() -> None: + """Unknown workflow name raises ValueError.""" + with pytest.raises(ValueError, match="Unknown workflow"): + run_allocator(gpu_budget={"A100": 8}, workflow_name="nonexistent") + + +def test_run_allocator_invalid_gpu_type() -> None: + """Unknown GPU type raises ValueError.""" + with pytest.raises(ValueError, match="Unknown GPU type"): + run_allocator(gpu_budget={"RTX4090": 8}, workflow_name="streamcast") + + +def test_run_allocator_insufficient_gpus() -> None: + """Too few GPUs raises ValueError.""" + with pytest.raises(ValueError, match="at least 8"): + run_allocator(gpu_budget={"A100": 4}, workflow_name="streamcast") From bccbbd26e144f75ffd0e4d20e6d8503f4441ef56 Mon Sep 17 00:00:00 2001 From: Haoran Qiu Date: Fri, 15 May 2026 15:57:38 -0700 Subject: [PATCH 3/9] Update tests --- .gitignore | 3 + streamwise/streamwise.py | 118 +++++++++ streamwise/templates/add_pod.html | 190 +++++++++++++++ tests/streamwise/conftest.py | 18 ++ .../streamwise/test_streamwise_auto_deploy.py | 227 ++++++++++++++++++ 5 files changed, 556 insertions(+) create mode 100644 tests/streamwise/conftest.py create mode 100644 tests/streamwise/test_streamwise_auto_deploy.py diff --git a/.gitignore b/.gitignore index 51130c5b..9807bf14 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,9 @@ *.sln.docstates *.env +# Environment files +.venv/ + # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs diff --git a/streamwise/streamwise.py b/streamwise/streamwise.py index 1c63eacf..0ce24ac5 100644 --- a/streamwise/streamwise.py +++ b/streamwise/streamwise.py @@ -34,6 +34,7 @@ import pod_manager import node_manager import job_manager +import allocator_bridge from service_manager import get_services from service_manager import get_service_timestamps @@ -726,6 +727,123 @@ async def api_add_pod() -> QuartReturn: return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR +@route("/api/auto_deploy", 
methods=["POST"]) +async def api_auto_deploy() -> QuartReturn: + """Run the model allocator to produce an optimized deployment plan. + + Expects JSON body: + { + "gpu_budget": {"A100": 8, "H100": 0, ...}, + "workflow": "streamcast" + } + + Returns the deployment plan with estimated metrics and per-container specs. + """ + try: + data = await request.get_json() + if not data: + return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST + + gpu_budget = data.get("gpu_budget") + workflow_name = data.get("workflow") + + if not gpu_budget or not isinstance(gpu_budget, dict): + return jsonify({"error": "Missing or invalid 'gpu_budget' field"}), HTTPStatus.BAD_REQUEST + if not workflow_name or not isinstance(workflow_name, str): + return jsonify({"error": "Missing or invalid 'workflow' field"}), HTTPStatus.BAD_REQUEST + + plan = allocator_bridge.run_allocator( + gpu_budget=gpu_budget, + workflow_name=workflow_name, + ) + return jsonify(allocator_bridge.deployment_plan_to_json(plan)), HTTPStatus.OK + + except ValueError as ve: + return jsonify({"error": str(ve)}), HTTPStatus.BAD_REQUEST + except Exception as ex: + logging.exception("Error in auto_deploy: %s", ex) + return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR + + +@route("/api/auto_deploy/confirm", methods=["POST"]) +async def api_auto_deploy_confirm() -> QuartReturn: + """Execute a deployment plan produced by /api/auto_deploy. + + Expects JSON body: + { + "specs": [ + { + "container_name": "gemma", + "cpu": 16, + "memory_gib": 192, + "ephemeral_storage_gib": 64, + "gpu": 2, + "gpu_type": "a100", + "mig_profile": null + }, + ... + ] + } + + Deploys all containers in the plan. 
+ """ + try: + data = await request.get_json() + if not data: + return jsonify({"error": "Request body must be JSON"}), HTTPStatus.BAD_REQUEST + + specs = data.get("specs") + if not specs or not isinstance(specs, list): + return jsonify({"error": "Missing or invalid 'specs' field"}), HTTPStatus.BAD_REQUEST + + deployed: List[str] = [] + errors: List[str] = [] + + for spec in specs: + container_name = spec.get("container_name") + if not container_name: + errors.append("Spec missing 'container_name'") + continue + + try: + await pod_manager.add_pod( + container_name=container_name, + cpu=int(spec.get("cpu", 4)), + memory_gib=int(spec.get("memory_gib", 16)), + ephemeral_storage_gib=int(spec.get("ephemeral_storage_gib", 16)), + gpu=int(spec.get("gpu", 0)), + gpu_type=spec.get("gpu_type"), + mig_profile=spec.get("mig_profile"), + namespace=NAMESPACE, + k8s_cluster=k8s_cluster, + ) + deployed.append(container_name) + except Exception as pod_ex: + msg = f"Failed to deploy '{container_name}': {pod_ex}" + logging.error(msg) + errors.append(msg) + + status = HTTPStatus.OK if not errors else HTTPStatus.MULTI_STATUS + return jsonify({ + "deployed": deployed, + "errors": errors, + "message": f"Deployed {len(deployed)}/{len(specs)} containers.", + }), status + + except Exception as ex: + logging.exception("Error in auto_deploy/confirm: %s", ex) + return jsonify({"error": str(ex)}), HTTPStatus.INTERNAL_SERVER_ERROR + + +@route("/api/auto_deploy/workflows", methods=["GET"]) +async def api_auto_deploy_workflows() -> QuartReturn: + """Return available workflows and GPU types for the auto-deploy UI.""" + return jsonify({ + "workflows": allocator_bridge.get_available_workflows(), + "gpu_types": allocator_bridge.get_available_gpu_types(), + }), HTTPStatus.OK + + @route("/api/node/", methods=["DELETE"]) async def api_remove_node(node_name: str) -> QuartReturn: return await node_manager.remove_node( diff --git a/streamwise/templates/add_pod.html b/streamwise/templates/add_pod.html index 
d61952aa..f5496e10 100644 --- a/streamwise/templates/add_pod.html +++ b/streamwise/templates/add_pod.html @@ -384,6 +384,94 @@

🧩 Applications

{% endif %} + +

🤖 Auto Deploy

+

Specify your GPU budget and the optimizer will determine the best allocation for each component:

+ +
+
+ + 💰 GPU Budget + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ +
+ + 🎬 Workflow + +
+ + +
+
+ +
+ +
+
+ + + + + +