From a8c17b7d803b0f9a9eec3f2f620582aba4bc980b Mon Sep 17 00:00:00 2001 From: Emily Soth Date: Fri, 20 Mar 2026 17:35:53 -0700 Subject: [PATCH 1/7] work in progress: build cloud run service from custom docker image --- slurm_cluster_config/README.md | 1 + slurm_cluster_config/hpc-slurm.yml | 2 +- slurm_cluster_config/server_interface.tf | 97 ++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 1 deletion(-) diff --git a/slurm_cluster_config/README.md b/slurm_cluster_config/README.md index 4acf39a4..0963864e 100644 --- a/slurm_cluster_config/README.md +++ b/slurm_cluster_config/README.md @@ -19,6 +19,7 @@ Copy the additional terraform code into that directory: ``` cp server_interface.tf hpc-slurm/primary cp providers.tf hpc-slurm/primary # overwrite the existing providers.tf +cp versions.tf hpc-slurm/primary ``` To deploy: diff --git a/slurm_cluster_config/hpc-slurm.yml b/slurm_cluster_config/hpc-slurm.yml index b9be4d76..15f0d7ec 100644 --- a/slurm_cluster_config/hpc-slurm.yml +++ b/slurm_cluster_config/hpc-slurm.yml @@ -54,7 +54,7 @@ deployment_groups: # the module by Filestore modules. 
# https://cloud.google.com/vpc/docs/configure-private-services-access#permissions - id: private_service_access - source: community/modules/network/private-service-access + source: modules/network/private-service-access use: [network] settings: deletion_policy: "ABANDON" diff --git a/slurm_cluster_config/server_interface.tf b/slurm_cluster_config/server_interface.tf index f06bd42a..73a87ecb 100644 --- a/slurm_cluster_config/server_interface.tf +++ b/slurm_cluster_config/server_interface.tf @@ -11,6 +11,7 @@ resource "google_project_service" "enable_services" { for_each = toset([ "apigateway.googleapis.com", "apikeys.googleapis.com", + "cloudbuild.googleapis.com", "cloudresourcemanager.googleapis.com", "run.googleapis.com", "secretmanager.googleapis.com", @@ -22,6 +23,43 @@ resource "google_project_service" "enable_services" { } +# Docker container ------------------------------------------------------------ + +# 1. Create the Artifact Registry Repository +resource "google_artifact_registry_repository" "my_repo" { + location = "us-central1" + repository_id = "my-docker-repo" + description = "Docker repository for my local images" + format = "DOCKER" +} + +# 2. Configure Docker Provider to authenticate with GCP +data "google_client_config" "default" {} + +provider "docker" { + registry_auth { + address = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev" + username = "oauth2accesstoken" + password = data.google_client_config.default.access_token + } +} + +# 3. Build and Push the local image +resource "docker_image" "my_image" { + # The name must match the GCP format: LOCATION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG + name = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev/${google_artifact_registry_repository.my_repo.project}/${google_artifact_registry_repository.my_repo.repository_id}/my-app:v1" + + build { + context = "../../../invest_processes" # Path to your local directory containing the Dockerfile + } +} + +# 4. 
(Optional) Ensure the image is actually pushed to the registry +resource "docker_registry_image" "push_to_gcp" { + name = docker_image.my_image.name +} + + # Cloud Run Service ----------------------------------------------------------- # # This service runs a minimal nginx proxy that redirects traffic to the @@ -44,6 +82,63 @@ resource "google_secret_manager_secret_iam_member" "cloud_run_secret_access" { member = "serviceAccount:${google_service_account.cloud_run_sa.email}" } + + +# 1. Zip the local source code +data "archive_file" "source_archive" { + type = "tar.gz" + source_dir = "../../../invest_processes" + output_path = "${path.module}/invest_processes.tar.gz" +} + +# Google Storage Bucket to hold uploaded source code for the server +resource "google_storage_bucket" "pygeoapi_server_source_bucket" { + name = "pygeoapi_server_source_bucket" + location = "US" + force_destroy = true + + # delete contents older than 3 days + lifecycle_rule { + condition { + age = 3 + } + action { + type = "Delete" + } + } +} + +# 2. 
Upload source to a GCS bucket +resource "google_storage_bucket_object" "source_object" { + name = "source-${data.archive_file.source_archive.output_md5}.zip" + bucket = google_storage_bucket.pygeoapi_server_source_bucket.name + source = data.archive_file.source_archive.output_path +} + +# Define Cloud Run with a build_config +resource "google_cloud_run_v2_service" "pygeoapi_service" { + provider = google-beta + name = "pygeoapi-service" + location = "us-central1" + deletion_protection = false + + template { + containers { + # This points to where the build will save the image + image = "us-central1-docker.pkg.dev/${var.project_id}/${google_artifact_registry_repository.my_repo.repository_id}/my-image:latest" + } + } + + # Because a Dockerfile exists in the zip, Cloud Build automatically uses it + build_config { + source_location = "${google_storage_bucket.pygeoapi_server_source_bucket.url}/${google_storage_bucket_object.source_object.name}" + } + + # Wait for the upload to finish before deploying + depends_on = [google_storage_bucket_object.source_object] +} + + # Create the Cloud Run Service resource "google_cloud_run_v2_service" "proxy" { name = "cloud-run-proxy" @@ -292,6 +387,7 @@ resource "google_compute_url_map" "default" { variable "domain_name" { description = "The load balancer domain name, e.g. 
'compute.naturalcapitalalliance.org'" type = string + default = "compute.naturalcapitalalliance.org" } # Create a Google-managed SSL certificate for the load balancer @@ -377,6 +473,7 @@ resource "google_project_iam_member" "github_actions_uploader_binding" { variable "github_repo" { description = "The GitHub repo name, for example 'natcap/invest-compute'" type = string + default = "natcap/invest-compute" } # Create Workload Identity Pool and Provider to authenticate GHA workflows From eceaf29516ffab721ec65be84a13ee1e11e05e0a Mon Sep 17 00:00:00 2001 From: Emily Soth Date: Mon, 23 Mar 2026 12:53:54 -0700 Subject: [PATCH 2/7] move gcp client inside function to avoid auth on import --- invest_processes/src/invest_processes/slurm_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/invest_processes/src/invest_processes/slurm_manager.py b/invest_processes/src/invest_processes/slurm_manager.py index 49219eac..ba839f9e 100644 --- a/invest_processes/src/invest_processes/slurm_manager.py +++ b/invest_processes/src/invest_processes/slurm_manager.py @@ -19,8 +19,6 @@ LOGGER = logging.getLogger(__name__) BUCKET_NAME = 'invest-compute-workspaces' -STORAGE_CLIENT = storage.Client() -BUCKET = STORAGE_CLIENT.bucket(BUCKET_NAME) WORKSPACE_ROOT = 'workspaces' os.makedirs(WORKSPACE_ROOT, exist_ok=True) @@ -36,6 +34,8 @@ def upload_directory_to_bucket(dir_path): Returns: None """ + STORAGE_CLIENT = storage.Client() + BUCKET = STORAGE_CLIENT.bucket(BUCKET_NAME) dir_path = Path(dir_path) for path in dir_path.rglob('*'): if not path.is_file(): From b5d5118415754949cd7c3e94ab1cd03b8a712441 Mon Sep 17 00:00:00 2001 From: Emily Soth Date: Mon, 23 Mar 2026 12:54:33 -0700 Subject: [PATCH 3/7] update terraform to build cloud run service from docker image --- slurm_cluster_config/server_interface.tf | 61 ++++++++---------------- 1 file changed, 20 insertions(+), 41 deletions(-) diff --git a/slurm_cluster_config/server_interface.tf 
b/slurm_cluster_config/server_interface.tf index 73a87ecb..b6691146 100644 --- a/slurm_cluster_config/server_interface.tf +++ b/slurm_cluster_config/server_interface.tf @@ -44,10 +44,17 @@ provider "docker" { } } +locals { + # Calculate a SHA1 hash of all files in the "app" directory (your build context) + app_dir_sha1 = sha1(join("", [ + for f in fileset(path.module, "../../../invest_processes/**") : filesha1(f) + ])) +} + # 3. Build and Push the local image resource "docker_image" "my_image" { # The name must match the GCP format: LOCATION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG - name = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev/${google_artifact_registry_repository.my_repo.project}/${google_artifact_registry_repository.my_repo.repository_id}/my-app:v1" + name = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev/${google_artifact_registry_repository.my_repo.project}/${google_artifact_registry_repository.my_repo.repository_id}/my-app:${local.app_dir_sha1}" build { context = "../../../invest_processes" # Path to your local directory containing the Dockerfile @@ -83,38 +90,6 @@ resource "google_secret_manager_secret_iam_member" "cloud_run_secret_access" { } - -# 1. Zip the local source code -data "archive_file" "source_archive" { - type = "tar.gz" - source_dir = "../../../invest_processes" - output_path = "${path.module}/invest_processes.tar.gz" -} - -# Google Storage Bucket to hold uploaded source code for the server -resource "google_storage_bucket" "pygeoapi_server_source_bucket" { - name = "pygeoapi_server_source_bucket" - location = "US" - force_destroy = true - - # delete contents older than 3 days - lifecycle_rule { - condition { - age = 3 - } - action { - type = "Delete" - } - } -} - -# 2. 
Upload source to a GCS bucket -resource "google_storage_bucket_object" "source_object" { - name = "source-${data.archive_file.source_archive.output_md5}.zip" - bucket = google_storage_bucket.pygeoapi_server_source_bucket.name - source = data.archive_file.source_archive.output_path -} - # Define Cloud Run with a build_config resource "google_cloud_run_v2_service" "pygeoapi_service" { provider = google-beta @@ -125,17 +100,21 @@ resource "google_cloud_run_v2_service" "pygeoapi_service" { template { containers { # This points to where the build will save the image - image = "us-central1-docker.pkg.dev/${var.project_id}/${google_artifact_registry_repository.my_repo.repository_id}/my-image:latest" - } - } + image = docker_image.my_image.name + + ports { + container_port = 8080 + } - # Because a Dockerfile exists in the zip, Cloud Build automatically uses it - build_config { - source_location = "${google_storage_bucket.pygeoapi_server_source_bucket.url}/${google_storage_bucket_object.source_object.name}" + startup_probe { + tcp_socket { + port = 5000 + } + } + } } - # Wait for the upload to finish before deploying - depends_on = [google_storage_bucket_object.source_object] + depends_on = [docker_registry_image.push_to_gcp] } From 397d8fabb1e603b289efb866bd628f67f92504f6 Mon Sep 17 00:00:00 2001 From: Emily Soth Date: Mon, 23 Mar 2026 15:26:39 -0700 Subject: [PATCH 4/7] configure pygeoapi to log to stdout instead of a file --- invest_processes/pygeoapi-config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/invest_processes/pygeoapi-config.yml b/invest_processes/pygeoapi-config.yml index a1df586e..ea3c0380 100644 --- a/invest_processes/pygeoapi-config.yml +++ b/invest_processes/pygeoapi-config.yml @@ -57,7 +57,6 @@ server: logging: level: DEBUG - logfile: pygeoapi.log metadata: identification: From 7f07766369b7424bcb8bdc89d2f3d1786a052336 Mon Sep 17 00:00:00 2001 From: Emily Soth Date: Thu, 26 Mar 2026 13:07:33 -0700 Subject: [PATCH 5/7] build docker image with Cloud 
Build and point service to Artifact Registry image --- slurm_cluster_config/server_interface.tf | 166 ++++++++--------------- 1 file changed, 59 insertions(+), 107 deletions(-) diff --git a/slurm_cluster_config/server_interface.tf b/slurm_cluster_config/server_interface.tf index b6691146..542a231d 100644 --- a/slurm_cluster_config/server_interface.tf +++ b/slurm_cluster_config/server_interface.tf @@ -33,39 +33,13 @@ resource "google_artifact_registry_repository" "my_repo" { format = "DOCKER" } -# 2. Configure Docker Provider to authenticate with GCP -data "google_client_config" "default" {} - -provider "docker" { - registry_auth { - address = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev" - username = "oauth2accesstoken" - password = data.google_client_config.default.access_token - } -} - locals { - # Calculate a SHA1 hash of all files in the "app" directory (your build context) + # Calculate a SHA1 hash of all files in the source directory app_dir_sha1 = sha1(join("", [ for f in fileset(path.module, "../../../invest_processes/**") : filesha1(f) ])) } -# 3. Build and Push the local image -resource "docker_image" "my_image" { - # The name must match the GCP format: LOCATION-docker.pkg.dev/PROJECT/REPO/IMAGE:TAG - name = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev/${google_artifact_registry_repository.my_repo.project}/${google_artifact_registry_repository.my_repo.repository_id}/my-app:${local.app_dir_sha1}" - - build { - context = "../../../invest_processes" # Path to your local directory containing the Dockerfile - } -} - -# 4. 
(Optional) Ensure the image is actually pushed to the registry -resource "docker_registry_image" "push_to_gcp" { - name = docker_image.my_image.name -} - # Cloud Run Service ----------------------------------------------------------- # @@ -81,29 +55,73 @@ resource "google_service_account" "cloud_run_sa" { display_name = "Service Account for Cloud Run" } -# Grant it access to the nginx Secret -resource "google_secret_manager_secret_iam_member" "cloud_run_secret_access" { - project = var.project_id - secret_id = google_secret_manager_secret.nginx_config.secret_id - role = "roles/secretmanager.secretAccessor" - member = "serviceAccount:${google_service_account.cloud_run_sa.email}" +# Allow Cloud Run to start builds +resource "google_project_iam_member" "run_agent_build_editor" { + project = var.project_id + role = "roles/cloudbuild.builds.editor" + member = "serviceAccount:${google_service_account.cloud_run_sa.email}" +} + + + +# Create a Storage Bucket to hold the zipped source code +resource "google_storage_bucket" "source_bucket" { + name = "my-project-source" + location = "us-central1" + uniform_bucket_level_access = true +} + +# Zip the local source code +data "archive_file" "source_zip" { + type = "zip" + source_dir = "${path.module}/../../../invest_processes" + output_path = "${path.module}/invest_processes.zip" } +# Upload the zipped source code to the bucket +# Include the hash in the name to trigger a new upload when the source code changes +resource "google_storage_bucket_object" "source_object" { + name = "source-${data.archive_file.source_zip.output_md5}.zip" + bucket = google_storage_bucket.source_bucket.name + source = data.archive_file.source_zip.output_path +} -# Define Cloud Run with a build_config +# Run a gcloud command to start a Cloud Build from the uploaded source code +# There is currently not a working way to create a Cloud Build directly in terraform +# https://github.com/hashicorp/terraform-provider-google/issues/23057 +resource 
"terraform_data" "manual_build_submission" { + triggers_replace = [ + google_storage_bucket_object.source_object.id + ] + + provisioner "local-exec" { + command = < Date: Mon, 30 Mar 2026 09:57:36 -0700 Subject: [PATCH 6/7] get sbatch command working over ssh from cloud run service --- .../src/invest_processes/slurm_manager.py | 45 ++++++++++++------- slurm_cluster_config/server_interface.tf | 42 +++++++++++++++-- 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/invest_processes/src/invest_processes/slurm_manager.py b/invest_processes/src/invest_processes/slurm_manager.py index ba839f9e..e529b868 100644 --- a/invest_processes/src/invest_processes/slurm_manager.py +++ b/invest_processes/src/invest_processes/slurm_manager.py @@ -24,6 +24,21 @@ os.makedirs(WORKSPACE_ROOT, exist_ok=True) +def run_over_ssh(command): + vm_name = 'hpcslurm-slurm-login-001' + return subprocess.run( + [ + 'gcloud', 'compute', 'ssh', vm_name, + '--project', 'sdss-sdss-invest-compute', + '--zone', 'us-central1-a', + '--tunnel-through-iap', + '--quiet', # disable interactive prompts + '--', + ' '.join(command) + ], + capture_output=True, text=True) + + def upload_directory_to_bucket(dir_path): """Upload everything in a given directory to the GCP bucket. 
@@ -568,22 +583,20 @@ def submit_slurm_job(self, processor, data_dict): }) # Submit the job - try: - args = [ - 'sbatch', '--parsable', - '--comment', f'{job_metadata}', # custom metadata - '--chdir', workspace_dir, - '--output', 'stdout.log', # relative to the slurm workspace dir - '--error', 'stderr.log', - script_path] - LOGGER.info( - f'Submitting slurm job with the following command:\n{args}') - result = subprocess.run( - args, capture_output=True, text=True, check=True) - LOGGER.info(f'stdout from sbatch: {result.stdout}') - - except subprocess.CalledProcessError as e: - raise RuntimeError('Error when submitting slurm job') from e + args = [ + 'sbatch', '--parsable', + '--comment', f'{job_metadata}', # custom metadata + '--chdir', workspace_dir, + '--output', 'stdout.log', # relative to the slurm workspace dir + '--error', 'stderr.log', + str(script_path)] + LOGGER.info( + f'Submitting slurm job with the following command:\n{args}') + result = run_over_ssh(args) + LOGGER.info(f'stdout from sbatch: {result.stdout}') + LOGGER.debug(f'stderr from sbatch: {result.stderr}') + if result.returncode != 0: + raise RuntimeError(f'Submitting slurm job returned non-zero exit code {result.returncode}') job_id = result.stdout.strip() LOGGER.info(f"Job submitted successfully with ID: {job_id}") diff --git a/slurm_cluster_config/server_interface.tf b/slurm_cluster_config/server_interface.tf index 542a231d..ec901593 100644 --- a/slurm_cluster_config/server_interface.tf +++ b/slurm_cluster_config/server_interface.tf @@ -22,6 +22,12 @@ resource "google_project_service" "enable_services" { disable_on_destroy = false } +# Enable OS login for all VMs ------------------------------------------------- +resource "google_compute_project_metadata" "default" { + metadata = { + enable-oslogin = "TRUE" + } +} # Docker container ------------------------------------------------------------ @@ -55,13 +61,30 @@ resource "google_service_account" "cloud_run_sa" { 
display_name = "Service Account for Cloud Run" } -# Allow Cloud Run to start builds -resource "google_project_iam_member" "run_agent_build_editor" { +# Allow Cloud Run to find VMs and use OS Login +resource "google_project_iam_member" "cloud_run_sa_os_login" { + project = var.project_id + role = "roles/compute.osLogin" + member = "serviceAccount:${google_service_account.cloud_run_sa.email}" +} + +resource "google_project_iam_member" "cloud_run_sa_iap_tunneling" { + project = var.project_id + role = "roles/iap.tunnelResourceAccessor" + member = "serviceAccount:${google_service_account.cloud_run_sa.email}" +} + +resource "google_project_iam_member" "cloud_run_sa_compute_viewer" { project = var.project_id - role = "roles/cloudbuild.builds.editor" + role = "roles/compute.viewer" member = "serviceAccount:${google_service_account.cloud_run_sa.email}" } +resource "google_project_iam_member" "cloud_run_sa_service_account_user" { + project = var.project_id + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${google_service_account.cloud_run_sa.email}" +} # Create a Storage Bucket to hold the zipped source code @@ -124,11 +147,24 @@ resource "google_cloud_run_v2_service" "pygeoapi_service" { container_port = 5000 } + # Define environment variables + env { + name = "HOME" + value = "/tmp" + } + startup_probe { tcp_socket { port = 5000 } } + + resources { + limits = { + "cpu" = "2" + "memory" = "8Gi" + } + } } service_account = google_service_account.cloud_run_sa.email From a53892bc6a08152a15de26b27fe082410622717d Mon Sep 17 00:00:00 2001 From: Emily Soth Date: Fri, 24 Apr 2026 11:03:48 -0700 Subject: [PATCH 7/7] add Dockerfile and terraform versions file --- invest_processes/Dockerfile | 33 +++++++++++++++++++++++++++++++ slurm_cluster_config/versions.tf | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 invest_processes/Dockerfile create mode 100644 slurm_cluster_config/versions.tf diff --git 
a/invest_processes/Dockerfile b/invest_processes/Dockerfile new file mode 100644 index 00000000..412b7f7f --- /dev/null +++ b/invest_processes/Dockerfile @@ -0,0 +1,33 @@ +FROM mambaorg/micromamba:latest + +ARG SETUPTOOLS_SCM_PRETEND_VERSION_FOR_INVEST_PROCESSES=1.0.14 + +# Copy over the source code +COPY --chown=$MAMBA_USER:$MAMBA_USER . /app/invest_processes +WORKDIR /app/invest_processes + +ENV PYGEOAPI_CONFIG=pygeoapi-config.yml +ENV PYGEOAPI_OPENAPI=openapi.yml + +# Install system dependencies and gcloud +USER root +RUN apt-get update && apt-get install -y openssh-server ca-certificates gnupg curl && \ + echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \ + tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ + gpg --batch --no-tty --dearmor -o /usr/share/keyrings/cloud.google.gpg && \ + apt-get update -y && \ + apt-get install google-cloud-cli -y && \ + gcloud --help && micromamba create -y -n env python=3.12 gdal natcap.invest==3.17.2 && \ + micromamba run -n env pip install pygeoapi rasterio==1.5 && \ + micromamba run -n env pygeoapi --version && \ + micromamba run -n env pip install . 
&& \ + micromamba run -n env pygeoapi openapi generate $PYGEOAPI_CONFIG --output-file $PYGEOAPI_OPENAPI + +# Cloud Run services must listen on the port specified by the PORT environment variable +# The default is 8080, but can be configured in the service settings +ENV PORT=5000 +EXPOSE 5000 + +# launch the pygeoapi server, running in the mamba environment +CMD ["micromamba", "run", "-n", "env", "pygeoapi", "serve"] diff --git a/slurm_cluster_config/versions.tf b/slurm_cluster_config/versions.tf new file mode 100644 index 00000000..13c64169 --- /dev/null +++ b/slurm_cluster_config/versions.tf @@ -0,0 +1,34 @@ +/** + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +terraform { + required_version = "= 1.12.2" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 6.9.0, <= 7.21.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = ">= 6.9.0, <= 7.21.0" + } + docker = { + source = "kreuzwerker/docker" + version = "~> 3.0" + } + } +}