From 77c066559897188c90193dc0440418e5b04db03e Mon Sep 17 00:00:00 2001
From: Wouter Devriendt <wouterdevriendt@meta.com>
Date: Fri, 20 Feb 2026 00:35:29 -0800
Subject: [PATCH 1/3] fix: queue reservations when no single node has enough
 GPUs + bin-pack pods

The availability check was summing GPUs across all nodes, so a request
for 8 GPUs would see "12 available" (4+3+5 across 3 nodes) and try to
create the pod. But a pod's GPUs must all come from one node, so the
pod would get stuck in Pending for 600s and then fail.

Now check_gpu_availability returns (total, max_per_node), and
single-node scheduling decisions use max_per_node. When no single node
can fulfill the request, the reservation is queued instead of creating
an unschedulable pod. Multinode coordination still compares the request
against the total across nodes.

Also adds pod affinity (weight=50) to prefer nodes already running
gpu-dev pods. This bin-packs smaller reservations onto the same nodes,
keeping whole nodes free for large (8-GPU) requests.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../lambda/reservation_processor/index.py     | 96 +++++++++++++------
 1 file changed, 66 insertions(+), 30 deletions(-)

diff --git a/terraform-gpu-devservers/lambda/reservation_processor/index.py b/terraform-gpu-devservers/lambda/reservation_processor/index.py
index a8c8c679..d2de4f4a 100644
--- a/terraform-gpu-devservers/lambda/reservation_processor/index.py
+++ b/terraform-gpu-devservers/lambda/reservation_processor/index.py
@@ -1329,7 +1329,8 @@ def coordinate_multinode_reservation(master_reservation_id: str, total_nodes: in
             f"Multinode reservation needs {total_gpus_needed} {gpu_type} GPUs ({total_nodes} nodes × {gpus_per_node} GPUs)")
 
         # Check if enough resources are available for the entire multinode reservation
-        available_gpus = check_gpu_availability(gpu_type)
+        # For multinode, we check total across nodes (each node gets gpus_per_node)
+        available_gpus, _ = check_gpu_availability(gpu_type)
 
         if available_gpus >= total_gpus_needed:
             # Sufficient resources - start parallel processing for all nodes
@@ -1845,18 +1846,20 @@ def process_reservation_request(record: dict[str, Any]) -> bool:
         if is_multinode:
             logger.info(
                 f"Multinode node: skipping individual resource check, coordinator already validated resources")
-            available_gpus = requested_gpus  # Assume coordinator validated
+            total_available_gpus = requested_gpus
+            max_per_node = requested_gpus
         else:
-            available_gpus = check_gpu_availability(gpu_type)
+            total_available_gpus, max_per_node = check_gpu_availability(gpu_type)
 
-        if available_gpus >= requested_gpus:
+        # Use max_per_node for scheduling decision: all GPUs must come from a single node
+        if max_per_node >= requested_gpus:
             # Update status to show we're preparing the machine
             reservation_id = reservation_request.get("reservation_id")
             if reservation_id:
                 update_reservation_status(
                     reservation_id,
                     "preparing",
-                    f"Found {available_gpus} available {gpu_type.upper()} GPUs - preparing resources",
+                    f"Found {total_available_gpus} available {gpu_type.upper()} GPUs - preparing resources",
                 )
 
             # Create reservation
@@ -1867,14 +1870,14 @@ def process_reservation_request(record: dict[str, Any]) -> bool:
             allocate_gpu_resources(reservation_id, reservation_request)
             return True  # Successfully processed
         else:
-            # Insufficient resources - set to queued and let scheduled Lambda handle it
+            # Insufficient resources on any single node - queue and let scheduled Lambda handle it
             reservation_id = reservation_request.get("reservation_id")
 
             if reservation_id:
                 # Calculate queue position and estimated wait time
                 gpu_type = reservation_request.get("gpu_type", "a100")
                 queue_info = calculate_queue_position_and_wait_time(
-                    reservation_id, requested_gpus, gpu_type, available_gpus
+                    reservation_id, requested_gpus, gpu_type, total_available_gpus
                 )
 
                 # Update reservation with queue information and set to queued status
@@ -1882,14 +1885,16 @@ def process_reservation_request(record: dict[str, Any]) -> bool:
                     reservation_id,
                     queue_info["position"],
                     queue_info["estimated_wait_minutes"],
-                    available_gpus,
+                    total_available_gpus,
                 )
 
                 # Provide more specific queued message based on availability
-                if available_gpus == 0:
+                if total_available_gpus == 0:
                     queue_message = f"No {gpu_type.upper()} nodes available - position #{queue_info.get('position', '?')} in queue"
+                elif max_per_node == 0:
+                    queue_message = f"No schedulable {gpu_type.upper()} nodes with free GPUs - position #{queue_info.get('position', '?')} in queue"
                 else:
-                    queue_message = f"Need {requested_gpus} {gpu_type.upper()} GPUs, only {available_gpus} available - position #{queue_info.get('position', '?')}"
+                    queue_message = f"Need {requested_gpus} {gpu_type.upper()} GPUs on one node, max {max_per_node} available on any single node - position #{queue_info.get('position', '?')}"
 
                 update_reservation_status(
                     reservation_id,
@@ -1975,37 +1980,44 @@ def validate_reservation_request(request: dict[str, Any]) -> tuple[bool, str]:
     return True, "Valid request"
 
 
-def check_gpu_availability(gpu_type: str = None) -> int:
-    """Check available GPU capacity using K8s API, optionally filtered by GPU type"""
+def check_gpu_availability(gpu_type: str = None) -> tuple:
+    """Check available GPU capacity using K8s API, optionally filtered by GPU type.
+
+    Returns (total_available, max_on_single_node). All GPUs for a pod must be
+    scheduled on a single node, so max_on_single_node is the correct value to
+    use when deciding whether a request of size N can be fulfilled immediately.
+    """
     try:
         # Set up K8s client
         k8s_client = get_k8s_client()
 
         if gpu_type:
             # Check for schedulable nodes with specific GPU type
-            available_gpus = check_schedulable_gpus_for_type(
+            total_gpus, max_per_node = check_schedulable_gpus_for_type(
                 k8s_client, gpu_type)
             logger.info(
-                f"Schedulable {gpu_type.upper()} GPUs: {available_gpus}")
+                f"Schedulable {gpu_type.upper()} GPUs: {total_gpus} total, {max_per_node} max on single node")
 
             # Update availability table with real-time data
             try:
                 update_gpu_availability_table(
-                    gpu_type, available_gpus, k8s_client)
+                    gpu_type, total_gpus, k8s_client)
             except Exception as update_error:
                 logger.warning(
                     f"Failed to update availability table for {gpu_type}: {update_error}"
                 )
-                # Don't fail the reservation processing if availability update fails
 
-            return available_gpus
+            return (total_gpus, max_per_node)
         else:
             gpu_tracker = K8sGPUTracker(k8s_client)
             capacity_info = gpu_tracker.get_gpu_capacity_info()
+            available = capacity_info["available_gpus"]
             logger.info(
-                f"K8s GPU status: {capacity_info['available_gpus']}/{capacity_info['total_gpus']} GPUs available"
+                f"K8s GPU status: {available}/{capacity_info['total_gpus']} GPUs available"
             )
-            return capacity_info["available_gpus"]
+            # Without type filter we can't determine per-node max easily,
+            # return total for both (callers without gpu_type don't schedule pods directly)
+            return (available, available)
 
     except Exception as e:
         logger.error(f"Error checking GPU availability from K8s: {str(e)}")
@@ -2014,14 +2026,20 @@ def check_gpu_availability(gpu_type: str = None) -> int:
         ) from e
 
 
-def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
-    """Check how many GPUs are available on schedulable nodes of the specified type"""
+def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> tuple:
+    """Check GPU availability on schedulable nodes of the specified type.
+
+    Returns (total_available, max_on_single_node) because all GPUs for a pod
+    must come from a single node - the total across nodes is not schedulable
+    as a single request.
+    """
     try:
         v1 = client.CoreV1Api(k8s_client)
 
         # Get all nodes with the specified GPU type that are ready and schedulable
         nodes = v1.list_node()
         schedulable_gpus = 0
+        max_on_single_node = 0
 
         for node in nodes.items:
             # Check if node has the right GPU type label
@@ -2039,11 +2057,12 @@ def check_schedulable_gpus_for_type(k8s_client, gpu_type: str) -> int:
             # Get available GPUs on this node
             node_gpus = get_available_gpus_on_node(v1, node)
             schedulable_gpus += node_gpus
+            max_on_single_node = max(max_on_single_node, node_gpus)
             logger.info(
                 f"Node {node.metadata.name}: {node_gpus} available {gpu_type.upper()} GPUs"
             )
 
-        return schedulable_gpus
+        return (schedulable_gpus, max_on_single_node)
 
     except Exception as e:
         logger.error(
@@ -4594,9 +4613,11 @@ def create_pod(
                 "GpuType": gpu_type,
                 **({} if target_az is None else {"topology.kubernetes.io/zone": target_az})
             },
-            # Node affinity for profiling-dedicated preference
-            # If user requests nsight=true, prefer profiling-dedicated nodes
-            # Otherwise, prefer non-profiling-dedicated nodes (DCGM nodes)
+            # Affinity rules for GPU scheduling:
+            # 1. Node affinity: prefer profiling-dedicated nodes if nsight requested
+            # 2. Pod affinity: prefer nodes already running gpu-dev pods (bin-packing)
+            #    This fills up nodes before spreading to empty ones, keeping whole
+            #    nodes free for large (e.g. 8-GPU) reservations.
             affinity=client.V1Affinity(
                 node_affinity=client.V1NodeAffinity(
                     preferred_during_scheduling_ignored_during_execution=[
@@ -4613,7 +4634,20 @@ def create_pod(
                             )
                         )
                     ]
-                )
+                ),
+                pod_affinity=client.V1PodAffinity(
+                    preferred_during_scheduling_ignored_during_execution=[
+                        client.V1WeightedPodAffinityTerm(
+                            weight=50,
+                            pod_affinity_term=client.V1PodAffinityTerm(
+                                label_selector=client.V1LabelSelector(
+                                    match_labels={"app": "gpu-dev-pod"}
+                                ),
+                                topology_key="kubernetes.io/hostname",
+                            )
+                        )
+                    ]
+                ),
             ) if not gpu_type.startswith("cpu-") else None,
             tolerations=[
                 client.V1Toleration(
@@ -6639,17 +6673,19 @@ def process_scheduled_queue_management():
                 gpu_type = reservation.get("gpu_type", "h100")
 
                 # Check if this reservation can be allocated now - validate GPU type availability
-                type_available_gpus = check_gpu_availability(gpu_type)
-                if type_available_gpus >= requested_gpus:
+                # Use max_per_node: all GPUs for a pod must come from a single node
+                type_total_gpus, type_max_per_node = check_gpu_availability(gpu_type)
+                type_available_gpus = type_total_gpus  # Keep for messages/ETA
+                if type_max_per_node >= requested_gpus:
                     logger.info(
-                        f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_available_gpus} available"
+                        f"Allocating {requested_gpus} {gpu_type.upper()} GPUs for reservation {reservation_id} - {type_max_per_node} available on single node ({type_total_gpus} total)"
                     )
 
                     # Update status to preparing
                     update_reservation_status(
                         reservation_id,
                         "preparing",
-                        f"Found {type_available_gpus} available {gpu_type.upper()} GPUs - preparing environment",
+                        f"Found {type_total_gpus} available {gpu_type.upper()} GPUs - preparing environment",
                     )
 
                     # Try to create the actual resources

From e624662d5c1f3fe4717b47b32a72069366901eba Mon Sep 17 00:00:00 2001
From: Wouter Devriendt <wouterdevriendt@meta.com>
Date: Fri, 20 Feb 2026 00:56:20 -0800
Subject: [PATCH 2/3] fix: prevent on-demand instances from stealing capacity
 reservation slots

On-demand ASG launch templates had no capacity_reservation_specification,
which defaults to AWS "open" behavior - auto-matching instances to any
available targeted CR in the same AZ with the same instance type.

This caused the b200-cr2 (on-demand) instance to consume a slot in
cr-08e7fee0b8dc3de5e (cr1), preventing the cr1 ASG from launching its
3rd instance. Now on-demand launch templates explicitly set
capacity_reservation_preference = "none".

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 terraform-gpu-devservers/eks.tf | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/terraform-gpu-devservers/eks.tf b/terraform-gpu-devservers/eks.tf
index 5808cfeb..b51467af 100644
--- a/terraform-gpu-devservers/eks.tf
+++ b/terraform-gpu-devservers/eks.tf
@@ -362,7 +362,10 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
     }
   }
 
-  # Add capacity reservation specification for instances that have reservations configured
+  # Capacity reservation specification:
+  # - With CR: target the specific reservation
+  # - Without CR (on-demand): explicitly set "none" so AWS doesn't auto-match
+  #   on-demand instances to targeted CRs in the same AZ (steals CR slots)
   dynamic "capacity_reservation_specification" {
     for_each = each.value.capacity_reservation_id != null ? [1] : []
     content {
@@ -373,6 +376,13 @@ resource "aws_launch_template" "gpu_dev_launch_template" {
     }
   }
 
+  dynamic "capacity_reservation_specification" {
+    for_each = each.value.capacity_reservation_id == null ? [1] : []
+    content {
+      capacity_reservation_preference = "none"
+    }
+  }
+
   user_data = base64encode(templatefile("${path.module}/templates/al2023-user-data.sh", {
     cluster_name        = aws_eks_cluster.gpu_dev_cluster.name
     cluster_endpoint    = aws_eks_cluster.gpu_dev_cluster.endpoint

From 7bd26044bbe6fff0514c69b7abefca5c7fd52abe Mon Sep 17 00:00:00 2001
From: Wouter Devriendt <wouterdevriendt@meta.com>
Date: Fri, 20 Feb 2026 01:15:44 -0800
Subject: [PATCH 3/3] =?UTF-8?q?chore:=20reduce=20B200=20to=203=20nodes=20?=
 =?UTF-8?q?=E2=80=94=20free=201=20CR=20slot=20for=20other=20team?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- cr1: 3→2 instances (use 2 of 3 CR slots, freeing 1 for other team;
  cr1 already has only 2 running, so no instances are terminated)
- cr2: removed (terminates the cordoned on-demand node that was
  occupying a CR slot due to the old index-shift bug)

Only the cordoned node gets terminated.
End state: 3 B200 nodes (24 GPUs) = cr0(1) + cr1(2)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 terraform-gpu-devservers/main.tf | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/terraform-gpu-devservers/main.tf b/terraform-gpu-devservers/main.tf
index b3bf3ccc..d38ec01f 100644
--- a/terraform-gpu-devservers/main.tf
+++ b/terraform-gpu-devservers/main.tf
@@ -270,9 +270,8 @@ locals {
         { key = "cr2", id = null, instance_count = 2 },                   # H200 on-demand (2 instances)
       ]
       b200 = [
-        { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 1 }, # B200 reservation (1 instance)
-        { key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 3 }, # B200 reservation (3 instances)
-        { key = "cr2", id = null, instance_count = 2 },                   # B200 on-demand (2 instances)
+        { key = "cr0", id = "cr-0c366fb8339a10f69", instance_count = 1 }, # B200 reservation (1 instance, us-east-2a)
+        { key = "cr1", id = "cr-08e7fee0b8dc3de5e", instance_count = 2 }, # B200 reservation (2 of 3 CR slots, 1 freed for other team)
       ]
       # T4 and L4 don't have capacity reservations - managed via supported_gpu_types fallback
     }