From 5d24adce6b44bff2a06ff044e937b97221e9d967 Mon Sep 17 00:00:00 2001 From: Wouter Devriendt Date: Wed, 21 Jan 2026 15:46:57 -0800 Subject: [PATCH] fix: prevent orphaned ALB target groups and improve Jupyter URL handling Root cause: ALB target groups were created but never cleaned up when: 1. Listener rule creation failed (100 target group limit) 2. jupyter_port was 0 (Jupyter disabled) causing "Invalid port" errors 3. Expiry Lambda cleanup was failing silently Changes: - Skip ALB setup if jupyter_port is 0 (Jupyter disabled) - Clean up target group if listener rule creation fails - Add cleanup_orphaned_target_groups() function to alb_utils.py - Call orphaned TG cleanup from expiry Lambda on each run - Use IP:port fallback instead of broken domain:port when ALB fails Co-Authored-By: Claude Opus 4.5 --- .../lambda/reservation_expiry/index.py | 12 +++ .../lambda/reservation_processor/index.py | 90 ++++++++++++------- .../lambda/shared/alb_utils.py | 56 ++++++++++++ 3 files changed, 128 insertions(+), 30 deletions(-) diff --git a/terraform-gpu-devservers/lambda/reservation_expiry/index.py b/terraform-gpu-devservers/lambda/reservation_expiry/index.py index a2a04f1d..44a78864 100644 --- a/terraform-gpu-devservers/lambda/reservation_expiry/index.py +++ b/terraform-gpu-devservers/lambda/reservation_expiry/index.py @@ -861,6 +861,17 @@ def handler(event, context): logger.error(f"Error cleaning up soft-deleted snapshots: {e}") deleted_snapshot_count = 0 + # Clean up orphaned ALB target groups (created but never attached to listener rules) + orphaned_tg_count = 0 + try: + from shared.alb_utils import cleanup_orphaned_target_groups, is_alb_enabled + if is_alb_enabled(): + cleanup_stats = cleanup_orphaned_target_groups() + orphaned_tg_count = cleanup_stats.get("deleted", 0) + logger.info(f"Orphaned target group cleanup: {cleanup_stats}") + except Exception as e: + logger.error(f"Error cleaning up orphaned target groups: {e}") + return { "statusCode": 200, "body": json.dumps( @@ -873,6 +884,7 @@ def handler(event, context): "deleted_snapshots": deleted_snapshot_count, "tagged_snapshots": tagged_snapshot_count, "synced_disks": synced_disk_count, + "orphaned_tg_cleaned": orphaned_tg_count, } ), } diff --git a/terraform-gpu-devservers/lambda/reservation_processor/index.py b/terraform-gpu-devservers/lambda/reservation_processor/index.py index 40e3c586..86ebca62 100644 --- a/terraform-gpu-devservers/lambda/reservation_processor/index.py +++ b/terraform-gpu-devservers/lambda/reservation_processor/index.py @@ -2616,20 +2616,10 @@ def progress_callback(progress_message): # Fallback to direct IP+port when DNS is not configured ssh_command = f"ssh -p {node_port} dev@{node_public_ip}" - # Generate Jupyter URL (we'll get the token after pod is ready) - if domain_name and domain_ssh_command: - # Use HTTP with domain name for Jupyter when DNS is configured - # TODO: Add HTTPS support with SSL certificate - # domain_name is just the subdomain, we need to add DOMAIN_NAME to get FQDN - from shared.dns_utils import DOMAIN_NAME as DNS_DOMAIN - if DNS_DOMAIN: - full_domain = f"{domain_name}.{DNS_DOMAIN}" - else: - full_domain = domain_name - jupyter_url_base = f"http://{full_domain}:{jupyter_port}" - else: - # Fallback to HTTP with IP when DNS is not configured - jupyter_url_base = f"http://{node_public_ip}:{jupyter_port}" + # Generate Jupyter URL - will be set to HTTPS if ALB setup succeeds + # NOTE: When ALB is enabled, only port 443 is exposed, so http://ip:port fallback won't work + # If ALB setup fails, jupyter_url_base stays empty and user gets an error message + jupyter_url_base = "" # Update status: Finalizing connection setup update_reservation_status( @@ -2700,9 +2690,9 @@ def progress_callback(progress_message): alb_enabled = is_alb_enabled() logger.info(f"ALB enabled check result: {alb_enabled}") - if alb_enabled: + if alb_enabled and jupyter_port > 0: logger.info( - f"Setting up ALB/NLB for reservation {reservation_id}") + f"Setting up ALB for reservation {reservation_id} (jupyter_port={jupyter_port})") # Get instance ID from pod instance_id = get_instance_id_from_pod( @@ -2752,6 +2742,16 @@ def progress_callback(progress_message): logger.info( f"ALB setup complete for {reservation_id} (Jupyter HTTPS + SSH proxy)") + else: + # Listener rule creation failed - clean up the orphaned target group + logger.warning( + f"Listener rule creation failed for {reservation_id}, cleaning up target group {jupyter_tg_arn}") + try: + elbv2 = boto3.client("elbv2") + elbv2.delete_target_group(TargetGroupArn=jupyter_tg_arn) + logger.info(f"Cleaned up orphaned target group {jupyter_tg_arn}") + except Exception as cleanup_error: + logger.error(f"Failed to clean up target group {jupyter_tg_arn}: {cleanup_error}") else: logger.warning( f"Could not get instance ID for pod {pod_name}, skipping ALB setup") @@ -7048,21 +7048,51 @@ def enable_jupyter_in_pod( jupyter_token = get_jupyter_token_from_pod( k8s_client, pod_name) - # Try to use domain name if available - from shared.dns_utils import DOMAIN_NAME as DNS_DOMAIN - reservations_table = dynamodb.Table(RESERVATIONS_TABLE) - reservation_resp = reservations_table.get_item( - Key={"reservation_id": reservation_id}) - domain_name = None - if "Item" in reservation_resp: - domain_name = reservation_resp["Item"].get("domain_name") - - # Build Jupyter URL with domain if available, otherwise use IP - if domain_name and DNS_DOMAIN: - full_domain = f"{domain_name}.{DNS_DOMAIN}" - jupyter_url = f"http://{full_domain}:{jupyter_port}" - else: + # Build Jupyter URL - set up ALB if enabled, otherwise use IP:port + jupyter_url = "" + + # Try to set up ALB for HTTPS access + try: + from shared.alb_utils import ( + is_alb_enabled, + create_jupyter_target_group, + create_alb_listener_rule, + store_alb_mapping, + get_instance_id_from_pod, + ) + from shared.dns_utils import DOMAIN_NAME as DNS_DOMAIN + + # Get domain_name from reservation + reservations_table = dynamodb.Table(RESERVATIONS_TABLE) + res_resp = reservations_table.get_item(Key={"reservation_id": reservation_id}) + domain_name = res_resp.get("Item", {}).get("domain_name") + + if is_alb_enabled() and domain_name and jupyter_port > 0: + instance_id = get_instance_id_from_pod(k8s_client, pod_name) + if instance_id: + jupyter_tg_arn = create_jupyter_target_group( + reservation_id, pod_name, instance_id, jupyter_port + ) + if jupyter_tg_arn: + jupyter_rule_arn = create_alb_listener_rule(domain_name, jupyter_tg_arn) + if jupyter_rule_arn: + full_domain = f"{domain_name}.{DNS_DOMAIN}" + jupyter_url = f"https://{full_domain}" + logger.info(f"ALB setup complete for late-enabled Jupyter on {reservation_id}") + else: + # Clean up target group if listener rule failed + try: + elbv2 = boto3.client("elbv2") + elbv2.delete_target_group(TargetGroupArn=jupyter_tg_arn) + except Exception: + pass + except Exception as alb_error: + logger.warning(f"ALB setup failed for late-enabled Jupyter: {alb_error}") + + # Fallback to IP:port if ALB not available (only works if NodePorts are exposed) + if not jupyter_url: jupyter_url = f"http://{node_public_ip}:{jupyter_port}" + logger.warning(f"Using IP:port fallback for Jupyter - may not work if only 443 is exposed") if jupyter_token: jupyter_url += f"?token={jupyter_token}" diff --git a/terraform-gpu-devservers/lambda/shared/alb_utils.py b/terraform-gpu-devservers/lambda/shared/alb_utils.py index e3185c0a..855aef22 100644 --- a/terraform-gpu-devservers/lambda/shared/alb_utils.py +++ b/terraform-gpu-devservers/lambda/shared/alb_utils.py @@ -290,6 +290,62 @@ def delete_alb_mapping(reservation_id: str) -> bool: return False +def cleanup_orphaned_target_groups() -> Dict[str, Any]: + """ + Clean up orphaned Jupyter target groups that have no associated ALB listener rules. + This handles cases where target groups were created but listener rule creation failed. + + Returns: + Dict with cleanup statistics + """ + stats = {"checked": 0, "deleted": 0, "errors": 0, "in_use": 0} + + try: + # Get all Jupyter target groups + paginator = elbv2_client.get_paginator("describe_target_groups") + + for page in paginator.paginate(): + for tg in page.get("TargetGroups", []): + tg_name = tg.get("TargetGroupName", "") + tg_arn = tg.get("TargetGroupArn", "") + + # Only process jupyter-* target groups + if not tg_name.startswith("jupyter-"): + continue + + stats["checked"] += 1 + + try: + # Check if target group has any listener rules attached + # If LoadBalancerArns is empty, the target group is orphaned + if not tg.get("LoadBalancerArns"): + logger.info(f"Found orphaned target group: {tg_name} ({tg_arn})") + try: + elbv2_client.delete_target_group(TargetGroupArn=tg_arn) + logger.info(f"Deleted orphaned target group: {tg_name}") + stats["deleted"] += 1 + except ClientError as delete_error: + if "in use" in str(delete_error).lower(): + stats["in_use"] += 1 + logger.warning(f"Target group {tg_name} still in use, skipping") + else: + stats["errors"] += 1 + logger.error(f"Failed to delete {tg_name}: {delete_error}") + else: + stats["in_use"] += 1 + except Exception as e: + stats["errors"] += 1 + logger.error(f"Error checking target group {tg_name}: {e}") + + logger.info(f"Orphaned target group cleanup complete: {stats}") + return stats + + except Exception as e: + logger.error(f"Failed to cleanup orphaned target groups: {e}") + stats["errors"] += 1 + return stats + + def get_instance_id_from_pod(k8s_client, pod_name: str, namespace: str = "gpu-dev") -> Optional[str]: """ Get EC2 instance ID from pod's node