# Container image for the invest_processes pygeoapi server.
FROM mambaorg/micromamba:latest

# Version reported by setuptools-scm when the package is pip-installed below.
# setuptools-scm looks the variable up by the *normalized* distribution name
# (upper case, with "-" replaced by "_"), so the suffix must be
# INVEST_PROCESSES; a hyphenated name is never matched.
ARG SETUPTOOLS_SCM_PRETEND_VERSION_FOR_INVEST_PROCESSES=1.0.14

# Copy over the source code
COPY --chown=$MAMBA_USER:$MAMBA_USER . /app/invest_processes
WORKDIR /app/invest_processes

ENV PYGEOAPI_CONFIG=pygeoapi-config.yml
ENV PYGEOAPI_OPENAPI=openapi.yml

# Install system dependencies and gcloud, then build the python environment
# and generate the OpenAPI document from the pygeoapi config.
USER root
RUN apt-get update && apt-get install -y openssh-server ca-certificates gnupg curl && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | \
    tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
    # -f: fail on HTTP errors instead of piping an error page into gpg
    curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | \
    gpg --batch --no-tty --dearmor -o /usr/share/keyrings/cloud.google.gpg && \
    apt-get update -y && \
    apt-get install google-cloud-cli -y && \
    gcloud --help && \
    micromamba create -y -n env python=3.12 gdal natcap.invest==3.17.2 && \
    micromamba run -n env pip install pygeoapi rasterio==1.5 && \
    micromamba run -n env pygeoapi --version && \
    micromamba run -n env pip install . && \
    micromamba run -n env pygeoapi openapi generate $PYGEOAPI_CONFIG --output-file $PYGEOAPI_OPENAPI

# Cloud Run services must listen on the port specified by the PORT environment variable
# The default is 8080, but can be configured in the service settings
ENV PORT=5000
EXPOSE 5000

# launch the pygeoapi server, running in the mamba environment
CMD ["micromamba", "run", "-n", "env", "pygeoapi", "serve"]
def run_over_ssh(command, *, capture_output=True, text=True):
    """Run a command on the slurm login VM through ``gcloud compute ssh``.

    The connection is tunnelled through IAP and interactive prompts are
    disabled. The remote command is sent as a single shell-quoted string, so
    arguments that contain spaces or shell metacharacters (for example JSON
    job metadata) reach the remote program as one argument each.

    Args:
        command (list): the remote program and its arguments, one token per
            item. Items are converted with ``str`` so path objects are
            accepted.
        capture_output (bool): forwarded to ``subprocess.run``.
        text (bool): forwarded to ``subprocess.run``.

    Returns:
        subprocess.CompletedProcess: the finished local ``gcloud`` process.
        The return code is not checked here; callers must inspect
        ``returncode`` themselves.
    """
    # Local import keeps this block self-contained; shlex is stdlib.
    import shlex

    vm_name = 'hpcslurm-slurm-login-001'
    # Everything after `--` is handed to the remote shell, so quote each
    # token rather than joining with bare spaces.
    remote_command = shlex.join(str(token) for token in command)
    return subprocess.run(
        [
            'gcloud', 'compute', 'ssh', vm_name,
            '--project', 'sdss-sdss-invest-compute',
            '--zone', 'us-central1-a',
            '--tunnel-through-iap',
            '--quiet',  # disable interactive prompts
            '--',
            remote_command,
        ],
        capture_output=capture_output, text=text)
@@ -36,6 +49,8 @@ def upload_directory_to_bucket(dir_path): Returns: None """ + STORAGE_CLIENT = storage.Client() + BUCKET = STORAGE_CLIENT.bucket(BUCKET_NAME) dir_path = Path(dir_path) for path in dir_path.rglob('*'): if not path.is_file(): @@ -568,22 +583,20 @@ def submit_slurm_job(self, processor, data_dict): }) # Submit the job - try: - args = [ - 'sbatch', '--parsable', - '--comment', f'{job_metadata}', # custom metadata - '--chdir', workspace_dir, - '--output', 'stdout.log', # relative to the slurm workspace dir - '--error', 'stderr.log', - script_path] - LOGGER.info( - f'Submitting slurm job with the following command:\n{args}') - result = subprocess.run( - args, capture_output=True, text=True, check=True) - LOGGER.info(f'stdout from sbatch: {result.stdout}') - - except subprocess.CalledProcessError as e: - raise RuntimeError('Error when submitting slurm job') from e + args = [ + 'sbatch', '--parsable', + '--comment', f'{job_metadata}', # custom metadata + '--chdir', workspace_dir, + '--output', 'stdout.log', # relative to the slurm workspace dir + '--error', 'stderr.log', + str(script_path)] + LOGGER.info( + f'Submitting slurm job with the following command:\n{args}') + result = run_over_ssh(args, capture_output=True, text=True) + LOGGER.info(f'stdout from sbatch: {result.stdout}') + LOGGER.debug(f'stderr from sbatch: {result.stderr}') + if result.returncode != 0: + raise RuntimeError(f'Submitting slurm job returned non-zero exit code {result.returncode}') job_id = result.stdout.strip() LOGGER.info(f"Job submitted successfully with ID: {job_id}") diff --git a/slurm_cluster_config/README.md b/slurm_cluster_config/README.md index 4acf39a4..0963864e 100644 --- a/slurm_cluster_config/README.md +++ b/slurm_cluster_config/README.md @@ -19,6 +19,7 @@ Copy the additional terraform code into that directory: ``` cp server_interface.tf hpc-slurm/primary cp providers.tf hpc-slurm/primary # overwrite the existing providers.tf +cp versions.tf hpc-slurm/primary ``` To 
# Enable OS login for all VMs -------------------------------------------------
# Manage only the single `enable-oslogin` key. The authoritative
# google_compute_project_metadata resource owns ALL project-level metadata and
# removes any key (including project-wide SSH keys) that it does not declare.
resource "google_compute_project_metadata_item" "default" {
  key   = "enable-oslogin"
  value = "TRUE"
}
# Docker container ------------------------------------------------------------

# 1. Create the Artifact Registry Repository
resource "google_artifact_registry_repository" "my_repo" {
  location      = "us-central1"
  repository_id = "my-docker-repo"
  description   = "Docker repository for my local images"
  format        = "DOCKER"
}

locals {
  # SHA1 fingerprint of every file under the invest_processes source tree.
  # fileset() returns paths relative to its first argument, whereas filesha1()
  # resolves relative paths against the working directory. Anchor each path to
  # path.module so the hash is computed over the intended files regardless of
  # where terraform is invoked from.
  app_dir_sha1 = sha1(join("", [
    for f in fileset(path.module, "../../../invest_processes/**") :
    filesha1("${path.module}/${f}")
  ]))
}

# Roles granted to the Cloud Run service account --------------------------------

# Allow Cloud Run to find VMs and use OS Login
resource "google_project_iam_member" "cloud_run_sa_os_login" {
  project = var.project_id
  role    = "roles/compute.osLogin"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Allow opening IAP tunnels (the server reaches the login VM with
# `gcloud compute ssh --tunnel-through-iap`)
resource "google_project_iam_member" "cloud_run_sa_iap_tunneling" {
  project = var.project_id
  role    = "roles/iap.tunnelResourceAccessor"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

resource "google_project_iam_member" "cloud_run_sa_compute_viewer" {
  project = var.project_id
  role    = "roles/compute.viewer"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

resource "google_project_iam_member" "cloud_run_sa_service_account_user" {
  project = var.project_id
  role    = "roles/iam.serviceAccountUser"
  member  = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}
# Create a Storage Bucket to hold the zipped source code.
# Bucket names share one global namespace across all of Cloud Storage, so the
# name is prefixed with the project id instead of using a fixed string that
# another project may already own.
resource "google_storage_bucket" "source_bucket" {
  name                        = "${var.project_id}-source"
  location                    = "us-central1"
  uniform_bucket_level_access = true
}

# Zip the local source code
data "archive_file" "source_zip" {
  type        = "zip"
  source_dir  = "${path.module}/../../../invest_processes"
  output_path = "${path.module}/invest_processes.zip"
}

# Upload the zipped source code to the bucket
# Include the hash in the name to trigger a new upload when the source code changes
resource "google_storage_bucket_object" "source_object" {
  name   = "source-${data.archive_file.source_zip.output_md5}.zip"
  bucket = google_storage_bucket.source_bucket.name
  source = data.archive_file.source_zip.output_path
}

# Run a gcloud command to start a Cloud Build from the uploaded source code
# There is currently not a working way to create a Cloud Build directly in terraform
# https://github.com/hashicorp/terraform-provider-google/issues/23057
resource "terraform_data" "manual_build_submission" {
  triggers_replace = [
    google_storage_bucket_object.source_object.id
  ]

  provisioner "local-exec" {
    command = <