From 77c066559897188c90193dc0440418e5b04db03e Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Fri, 20 Feb 2026 00:35:29 -0800 Subject: [PATCH 1/3] fix: queue reservations when no single node has enough GPUs + bin-pack pods The availability check was summing GPUs across all nodes, so a request for 8 GPUs would see "12 available" (4+3+5 across 3 nodes) and try to create the pod. But k8s needs all GPUs on one node, so the pod would get stuck in Pending for 600s then fail. Now check_gpu_availability returns (total, max_per_node) and scheduling decisions use max_per_node. When no single node can fulfill the request, the reservation queues properly instead of creating an unschedulable pod. Also adds pod affinity (weight=50) to prefer nodes already running gpu-dev pods. This bin-packs smaller reservations onto the same nodes, keeping whole nodes free for large (8-GPU) requests. Co-Authored-By: Claude Opus 4.6 --- .../lambda/reservation_processor/index.py | 96 +++++++++++++------ 1 file changed, 66 insertions(+), 30 deletions(-) diff --git a/terraform-gpu-devservers/lambda/reservation_processor/index.py b/terraform-gpu-devservers/lambda/reservation_processor/index.py index a8c8c679..d2de4f4a 100644 --- a/terraform-gpu-devservers/lambda/reservation_processor/index.py +++ b/terraform-gpu-devservers/lambda/reservation_processor/index.py @@ -1329,7 +1329,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in f"Multinode reservation needs {total_gpus_needed} {gpu_type} GPUs ({total_nodes} nodes × {gpus_per_node} GPUs)") # Check if enough resources are available for the entire multinode reservation - available_gpus = check_gpu_availability(gpu_type) + # For multinode, we check total across nodes (each node gets gpus_per_node) + available_gpus, _ = check_gpu_availability(gpu_type) if available_gpus >= total_gpus_needed: # Sufficient resources - start parallel processing for all nodes @@ -1845,18 +1846,20 @@ def process_reservation_request(record: dict[str, Any]) -> bool: if is_multinode: logger.info( f"Multinode node: skipping individual resource check, coordinator already validated resources") - available_gpus = requested_gpus # Assume coordinator validated + total_available_gpus = requested_gpus + max_per_node = requested_gpus else: - available_gpus = check_gpu_availability(gpu_type) + total_available_gpus, max_per_node = check_gpu_availability(gpu_type) - if available_gpus >= requested_gpus: + # Use max_per_node for scheduling decision: all GPUs must come from a single node + if max_per_node >= requested_gpus: # Update status to show we're preparing the machine reservation_id = reservation_request.get("reservation_id") if reservation_id: update_reservation_status( reservation_id, "preparing", - f"Found {available_gpus} available {gpu_type.upper()} GPUs - preparing resources", + f"Found {total_available_gpus} available {gpu_type.upper()} GPUs - preparing resources", ) # Create reservation @@ -1867,14 +1870,14 @@ def process_reservation_request(record: dict[str, Any]) -> bool: allocate_gpu_resources(reservation_id, reservation_request) return True # Successfully processed else: - # Insufficient resources - set to queued and let scheduled Lambda handle it + # Insufficient resources on any single node - queue and let scheduled Lambda handle it reservation_id = reservation_request.get("reservation_id") if reservation_id: # Calculate queue position and estimated wait time gpu_type = reservation_request.get("gpu_type", "a100") queue_info = calculate_queue_position_and_wait_time( - reservation_id, requested_gpus, gpu_type, available_gpus + reservation_id, requested_gpus, gpu_type, total_available_gpus ) # Update reservation with queue information and set to queued status @@ -1882,14 +1885,16 @@ def process_reservation_request(record: dict[str, Any]) -> bool: reservation_id, queue_info["position"], queue_info["estimated_wait_minutes"], - available_gpus, + total_available_gpus, ) # Provide more specific queued message based on availability - if available_gpus == 0: + if total_available_gpus == 0: queue_message = f"No {gpu_type.upper()} nodes available - position #{queue_info.get('position', '?')} in queue" + elif max_per_node == 0: + queue_message = f"No schedulable {gpu_type.upper()} nodes with free GPUs - position #{queue_info.get('position', '?')} in queue" else: - queue_message = f"Need {requested_gpus} {gpu_type.upper()} GPUs, only {available_gpus} available - position #{queue_info.get('position', '?')}" + queue_message = f"Need {requested_gpus} {gpu_type.upper()} GPUs on one node, max {max_per_node} available on any single node - position #{queue_info.get('position', '?')}" update_reservation_status( reservation_id, @@ -1975,37 +1980,44 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]: return True, "Valid request" -def check_gpu_availability(gpu_type: str = None) -> int: - """Check available GPU capacity using K8s API, optionally filtered by GPU type""" +def check_gpu_availability(gpu_type: str = None) -> tuple: + """Check available GPU capacity using K8s API, optionally filtered by GPU type. + + Returns (total_available, max_on_single_node). All GPUs for a pod must be + scheduled on a single node, so max_on_single_node is the correct value to + use when deciding whether a request of size N can be fulfilled immediately. + """ try: # Set up K8s client k8s_client = get_k8s_client() if gpu_type: # Check for schedulable nodes with specific GPU type - available_gpus = check_schedulable_gpus_for_type( + total_gpus, max_per_node = check_schedulable_gpus_for_type( k8s_client, gpu_type) logger.info( - f"Schedulable {gpu_type.upper()} GPUs: {available_gpus}") + f"Schedulable {gpu_type.upper()} GPUs: {total_gpus} total, {max_per_node} max on single node") # Update availability table with real-time data try: update_gpu_availability_table( - gpu_type, available_gpus, k8s_client) + gpu_type, total_gpus, k8s_client) except Exception as update_error: logger.warning( f"Failed to update availability table for {gpu_type}: {update_error}" ) - # Don't fail the reservation processing if availability update fails - return available_gpus + return (total_gpus, max_per_node) else: gpu_tracker = K8sGPUTracker(k8s_client) capacity_info = gpu_tracker.get_gpu_capacity_info() + available = capacity_info["available_gpus"] logger.info( - f"K8s GPU status: {capacity_info['available_gpus']}/{capacity_info['total_gpus']} GPUs available" + f"K8s GPU status: {available}/{capacity_info['total_gpus']} GPUs available" ) - return capacity_info["available_gpus"] + # Without type filter we can't determine per-node max easily, + # return total for both (callers without gpu_type don't schedule pods directly) + return (available, available) except Exception as e: logger.error(f"Error checking GPU availability from K8s: {str(e)}") @@ -2014,14 +2026,20 @@ def check_gpu_availability(gpu_type: str = None) -> int: ) from e -def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int: - """Check how many GPUs are available on schedulable nodes of the specified type""" +def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> tuple: + """Check GPU availability on schedulable nodes of the specified type. + + Returns (total_available, max_on_single_node) because all GPUs for a pod + must come from a single node - the total across nodes is not schedulable + as a single request. + """ try: v1 = client.CoreV1Api(k8s_client) # Get all nodes with the specified GPU type that are ready and schedulable nodes = v1.list_node() schedulable_gpus = 0 + max_on_single_node = 0 for node in nodes.items: # Check if node has the right GPU type label @@ -2039,11 +2057,12 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int: # Get available GPUs on this node node_gpus = get_available_gpus_on_node(v1, node) schedulable_gpus += node_gpus + max_on_single_node = max(max_on_single_node, node_gpus) logger.info( f"Node {node.metadata.name}: {node_gpus} available {gpu_type.upper()} GPUs" ) - return schedulable_gpus + return (schedulable_gpus, max_on_single_node) except Exception as e: logger.error( @@ -4594,9 +4613,11 @@ def create_pod( "GpuType": gpu_type, **({} if target_az is None else {"topology.kubernetes.io/zone": target_az}) }, - # Node affinity for profiling-dedicated preference - # If user requests nsight=true, prefer profiling-dedicated nodes - # Otherwise, prefer non-profiling-dedicated nodes (DCGM nodes) + # Affinity rules for GPU scheduling: + # 1. Node affinity: prefer profiling-dedicated nodes if nsight requested + # 2. Pod affinity: prefer nodes already running gpu-dev pods (bin-packing) + # This fills up nodes before spreading to empty ones, keeping whole + # nodes free for large (e.g. 8-GPU) reservations. affinity=client.V1Affinity( node_affinity=client.V1NodeAffinity( preferred_during_scheduling_ignored_during_execution=[ @@ -4613,7 +4634,20 @@ def create_pod( ) ) ] - ) + ), + pod_affinity=client.V1PodAffinity( + preferred_during_scheduling_ignored_during_execution=[ + client.V1WeightedPodAffinityTerm( + weight=50, + pod_affinity_term=client.V1PodAffinityTerm( + label_selector=client.V1LabelSelector( + match_labels={"app": "gpu-dev-pod"} + ), + topology_key="kubernetes.io/hostname", + ) + ) + ] + ), ) if not gpu_type.startswith("cpu-") else None, tolerations=[ client.V1Toleration( @@ -6639,17 +6673,19 @@ def process_scheduled_queue_management(): gpu_type = reservation.get("gpu_type", "h100") # Check if this reservation can be allocated now - validate GPU type availability - type_available_gpus = check_gpu_availability(gpu_type) - if type_available_gpus >= requested_gpus: + # Use max_per_node: all GPUs for a pod must come from a single node + type_total_gpus, type_max_per_node = check_gpu_availability(gpu_type) + type_available_gpus = type_total_gpus # Keep for messages/ETA + if type_max_per_node >= requested_gpus: logger.info( - f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_available_gpus} available" + f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_max_per_node} available on single node ({type_total_gpus} total)" ) # Update status to preparing update_reservation_status( reservation_id, "preparing", - f"Found {type_available_gpus} available {gpu_type.upper()} GPUs - preparing environment", + f"Found {type_total_gpus} available {gpu_type.upper()} GPUs - preparing environment", ) # Try to create the actual resources From e624662d5c1f3fe4717b47b32a72069366901eba Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Fri, 20 Feb 2026 00:56:20 -0800 Subject: [PATCH 2/3] fix: prevent on-demand instances from stealing capacity reservation slots On-demand ASG launch templates had no capacity_reservation_specification, which defaults to AWS "open" behavior - auto-matching instances to any available targeted CR in the same AZ with the same instance type. This caused the b200-cr2 (on-demand) instance to consume a slot in cr-08e7fee0b8dc3de5e (cr1), preventing the cr1 ASG from launching its 3rd instance. Now on-demand launch templates explicitly set capacity_reservation_preference = "none". Co-Authored-By: Claude Opus 4.6 --- terraform-gpu-devservers/eks.tf | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/terraform-gpu-devservers/eks.tf b/terraform-gpu-devservers/eks.tf index 5808cfeb..b51467af 100644 --- a/terraform-gpu-devservers/eks.tf +++ b/terraform-gpu-devservers/eks.tf @@ -362,7 +362,10 @@ resource "aws_launch_template" "gpu_dev_launch_template" { } } - # Add capacity reservation specification for instances that have reservations configured + # Capacity reservation specification: + # - With CR: target the specific reservation + # - Without CR (on-demand): explicitly set "none" so AWS doesn't auto-match + # on-demand instances to targeted CRs in the same AZ (steals CR slots) dynamic "capacity_reservation_specification" { for_each = each.value.capacity_reservation_id != null ? [1] : [] content { @@ -373,6 +376,13 @@ resource "aws_launch_template" "gpu_dev_launch_template" { } } + dynamic "capacity_reservation_specification" { + for_each = each.value.capacity_reservation_id == null ? [1] : [] + content { + capacity_reservation_preference = "none" + } + } + user_data = base64encode(templatefile("${path.module}/templates/al2023-user-data.sh", { cluster_name = aws_eks_cluster.gpu_dev_cluster.name cluster_endpoint = aws_eks_cluster.gpu_dev_cluster.endpoint From 7bd26044bbe6fff0514c69b7abefca5c7fd52abe Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Fri, 20 Feb 2026 01:15:44 -0800 Subject: [PATCH 3/3] =?UTF-8?q?chore:=20reduce=20B200=20to=203=20nodes=20?= =?UTF-8?q?=E2=80=94=20free=201=20CR=20slot=20for=20other=20team?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cr1: 3→2 instances (use 2 of 3 CR slots, freeing 1 for other team; cr1 already only has 2 running so no instances terminated) - cr2: removed (kills the cordoned on-demand node that was occupying a CR slot due to the old index-shift bug) Only the cordoned node gets terminated. End state: 3 B200 nodes (24 GPUs) = cr0(1) + cr1(2) Co-Authored-By: Claude Opus 4.6 --- terraform-gpu-devservers/main.tf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/terraform-gpu-devservers/main.tf b/terraform-gpu-devservers/main.tf index b3bf3ccc..d38ec01f 100644 --- a/terraform-gpu-devservers/main.tf +++ b/terraform-gpu-devservers/main.tf @@ -270,9 +270,8 @@ locals { { key = "cr2", id = null, instance_count = 2 }, # H200 on-demand (2 instances) ] b200 = [ - { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 1 }, # B200 reservation (1 instance) - { key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 3 }, # B200 reservation (3 instances) - { key = "cr2", id = null, instance_count = 2 }, # B200 on-demand (2 instances) + { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 1 }, # B200 reservation (1 instance, us-east-2a) + { key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 2 }, # B200 reservation (2 of 3 CR slots, 1 freed for other team) ] # T4 and L4 don't have capacity reservations - managed via supported_gpu_types fallback }