Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions invest_processes/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Image for the pygeoapi server that submits InVEST jobs to the slurm cluster.
#
# Layers are ordered from least to most frequently changing (system packages,
# conda environment, application source) so that editing the source does not
# force gcloud and the conda environment to be reinstalled.

# The base image tag is a build argument so that CI can pin a specific
# micromamba release; it defaults to `latest`, matching the previous behavior.
ARG MICROMAMBA_TAG=latest
FROM mambaorg/micromamba:${MICROMAMBA_TAG}

# Install system dependencies and gcloud (requires root).
# The signing key is downloaded to a file with `curl -f` instead of being piped
# into gpg, so a failed download aborts the build rather than being masked by
# the exit status of the last command in a pipeline.
# NOTE(review): `gcloud compute ssh` presumably only needs the ssh client;
# confirm whether openssh-server can be replaced by openssh-client.
USER root
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        gnupg \
        openssh-server && \
    curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg \
        -o /tmp/cloud.google.asc && \
    gpg --batch --no-tty --dearmor \
        -o /usr/share/keyrings/cloud.google.gpg /tmp/cloud.google.asc && \
    rm /tmp/cloud.google.asc && \
    echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list && \
    apt-get update && \
    apt-get install -y --no-install-recommends google-cloud-cli && \
    rm -rf /var/lib/apt/lists/* && \
    gcloud --version

# Everything below runs as the unprivileged user provided by the base image,
# so the server does not run as root.
USER $MAMBA_USER

# Create the conda environment and install the python dependencies that do not
# depend on the application source.
RUN micromamba create -y -n env python=3.12 gdal natcap.invest==3.17.2 && \
    micromamba run -n env pip install --no-cache-dir pygeoapi rasterio==1.5 && \
    micromamba run -n env pygeoapi --version && \
    micromamba clean --all --yes

# Copy over the source code
COPY --chown=$MAMBA_USER:$MAMBA_USER . /app/invest_processes
WORKDIR /app/invest_processes

# Paths are relative to WORKDIR
ENV PYGEOAPI_CONFIG=pygeoapi-config.yml \
    PYGEOAPI_OPENAPI=openapi.yml

# setuptools_scm reads the pretend version from an environment variable named
# after the *normalized* distribution name (upper case, `-` replaced by `_`).
# The package version has to be supplied this way when no git metadata is
# available in the build context.
ARG SETUPTOOLS_SCM_PRETEND_VERSION_FOR_INVEST_PROCESSES=1.0.14

# Install the application and generate the OpenAPI document it serves
RUN micromamba run -n env pip install --no-cache-dir . && \
    micromamba run -n env pygeoapi openapi generate "$PYGEOAPI_CONFIG" --output-file "$PYGEOAPI_OPENAPI"

# Cloud Run services must listen on the port specified by the PORT environment variable
# The default is 8080, but can be configured in the service settings
ENV PORT=5000
EXPOSE 5000

# launch the pygeoapi server, running in the mamba environment
CMD ["micromamba", "run", "-n", "env", "pygeoapi", "serve"]
1 change: 0 additions & 1 deletion invest_processes/pygeoapi-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ server:

logging:
level: DEBUG
logfile: pygeoapi.log

metadata:
identification:
Expand Down
49 changes: 31 additions & 18 deletions invest_processes/src/invest_processes/slurm_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,26 @@

LOGGER = logging.getLogger(__name__)
BUCKET_NAME = 'invest-compute-workspaces'
STORAGE_CLIENT = storage.Client()
BUCKET = STORAGE_CLIENT.bucket(BUCKET_NAME)

WORKSPACE_ROOT = 'workspaces'
os.makedirs(WORKSPACE_ROOT, exist_ok=True)


def run_over_ssh(command, **run_kwargs):
    """Run a command on the slurm login node via ``gcloud compute ssh``.

    Args:
        command (list): the remote command and its arguments, one list
            item per argument. Each item is converted to ``str`` and
            shell-quoted, so arguments containing spaces or quotes reach
            the remote program as a single argument.
        **run_kwargs: optional keyword arguments forwarded to
            ``subprocess.run``. ``capture_output`` and ``text`` default
            to ``True`` when not given.

    Returns:
        The ``subprocess.CompletedProcess`` for the ``gcloud`` invocation.
        A non-zero exit status is not raised here unless the caller passes
        ``check=True``; callers should inspect ``returncode``.
    """
    # local import so this block does not depend on the module import list
    import shlex

    vm_name = 'hpcslurm-slurm-login-001'
    run_kwargs.setdefault('capture_output', True)
    run_kwargs.setdefault('text', True)

    # Everything after '--' is handed to the remote shell as one string, so
    # each argument must be quoted or it would be re-split on whitespace.
    remote_command = shlex.join(str(arg) for arg in command)
    return subprocess.run(
        [
            'gcloud', 'compute', 'ssh', vm_name,
            '--project', 'sdss-sdss-invest-compute',
            '--zone', 'us-central1-a',
            '--tunnel-through-iap',
            '--quiet',  # disable interactive prompts
            '--',
            remote_command
        ],
        **run_kwargs)


def upload_directory_to_bucket(dir_path):
"""Upload everything in a given directory to the GCP bucket.

Expand All @@ -36,6 +49,8 @@ def upload_directory_to_bucket(dir_path):
Returns:
None
"""
STORAGE_CLIENT = storage.Client()
BUCKET = STORAGE_CLIENT.bucket(BUCKET_NAME)
dir_path = Path(dir_path)
for path in dir_path.rglob('*'):
if not path.is_file():
Expand Down Expand Up @@ -568,22 +583,20 @@ def submit_slurm_job(self, processor, data_dict):
})

# Submit the job
try:
args = [
'sbatch', '--parsable',
'--comment', f'{job_metadata}', # custom metadata
'--chdir', workspace_dir,
'--output', 'stdout.log', # relative to the slurm workspace dir
'--error', 'stderr.log',
script_path]
LOGGER.info(
f'Submitting slurm job with the following command:\n{args}')
result = subprocess.run(
args, capture_output=True, text=True, check=True)
LOGGER.info(f'stdout from sbatch: {result.stdout}')

except subprocess.CalledProcessError as e:
raise RuntimeError('Error when submitting slurm job') from e
args = [
'sbatch', '--parsable',
'--comment', f'{job_metadata}', # custom metadata
'--chdir', workspace_dir,
'--output', 'stdout.log', # relative to the slurm workspace dir
'--error', 'stderr.log',
str(script_path)]
LOGGER.info(
f'Submitting slurm job with the following command:\n{args}')
result = run_over_ssh(args, capture_output=True, text=True)
LOGGER.info(f'stdout from sbatch: {result.stdout}')
LOGGER.debug(f'stderr from sbatch: {result.stderr}')
if result.returncode != 0:
raise RuntimeError(f'Submitting slurm job returned non-zero exit code {result.returncode}')

job_id = result.stdout.strip()
LOGGER.info(f"Job submitted successfully with ID: {job_id}")
Expand Down
1 change: 1 addition & 0 deletions slurm_cluster_config/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Copy the additional terraform code into that directory:
```
cp server_interface.tf hpc-slurm/primary
cp providers.tf hpc-slurm/primary # overwrite the existing providers.tf
cp versions.tf hpc-slurm/primary
```

To deploy:
Expand Down
2 changes: 1 addition & 1 deletion slurm_cluster_config/hpc-slurm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ deployment_groups:
# the module by Filestore modules.
# https://cloud.google.com/vpc/docs/configure-private-services-access#permissions
- id: private_service_access
source: community/modules/network/private-service-access
source: modules/network/private-service-access
use: [network]
settings:
deletion_policy: "ABANDON"
Expand Down
186 changes: 125 additions & 61 deletions slurm_cluster_config/server_interface.tf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ resource "google_project_service" "enable_services" {
for_each = toset([
"apigateway.googleapis.com",
"apikeys.googleapis.com",
"cloudbuild.googleapis.com",
"cloudresourcemanager.googleapis.com",
"run.googleapis.com",
"secretmanager.googleapis.com",
Expand All @@ -21,6 +22,30 @@ resource "google_project_service" "enable_services" {
disable_on_destroy = false
}

# Enable OS login for all VMs -------------------------------------------------
# Sets the project-wide `enable-oslogin` metadata key to "TRUE".
# NOTE(review): the provider documents google_compute_project_metadata as
# authoritative for all project-level metadata. Confirm that no other
# project metadata keys exist that this resource would remove, or consider
# google_compute_project_metadata_item to manage just this one key.
resource "google_compute_project_metadata" "default" {
metadata = {
enable-oslogin = "TRUE"
}
}

# Docker container ------------------------------------------------------------

# 1. Create the Artifact Registry Repository
# Docker-format repository that receives the pygeoapi server image produced by
# the Cloud Build submission and is referenced by the Cloud Run service image.
# NOTE(review): the location is hard-coded here while other resources in this
# file use var.region; confirm whether they are meant to stay in sync.
resource "google_artifact_registry_repository" "my_repo" {
location = "us-central1"
repository_id = "my-docker-repo"
description = "Docker repository for my local images"
format = "DOCKER"
}

locals {
  # Fingerprint of the application source tree: the SHA1 of the concatenated
  # per-file SHA1 digests of every file under invest_processes. It is used as
  # the docker image tag, so the tag changes whenever any source file changes.
  #
  # fileset() returns paths relative to its first argument (path.module),
  # whereas filesha1() resolves relative paths against the working directory.
  # Each path is therefore re-anchored on path.module so the hash is computed
  # correctly regardless of where terraform is invoked from.
  app_dir_sha1 = sha1(join("", [
    for f in fileset(path.module, "../../../invest_processes/**") :
    filesha1("${path.module}/${f}")
  ]))
}


# Cloud Run Service -----------------------------------------------------------
#
Expand All @@ -36,51 +61,114 @@ resource "google_service_account" "cloud_run_sa" {
display_name = "Service Account for Cloud Run"
}

# Grant it access to the nginx Secret
resource "google_secret_manager_secret_iam_member" "cloud_run_secret_access" {
project = var.project_id
secret_id = google_secret_manager_secret.nginx_config.secret_id
role = "roles/secretmanager.secretAccessor"
member = "serviceAccount:${google_service_account.cloud_run_sa.email}"
# Allow Cloud Run to find VMs and use OS Login
resource "google_project_iam_member" "cloud_run_sa_os_login" {
project = var.project_id
role = "roles/compute.osLogin"
member = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Create the Cloud Run Service
resource "google_cloud_run_v2_service" "proxy" {
name = "cloud-run-proxy"
location = var.region
ingress = "INGRESS_TRAFFIC_ALL" # Accessible from Gateway (Public)
# Grants the Cloud Run service account the IAP tunnel accessor role on the
# project. The server reaches the slurm login node with
# `gcloud compute ssh ... --tunnel-through-iap`.
resource "google_project_iam_member" "cloud_run_sa_iap_tunneling" {
project = var.project_id
role = "roles/iap.tunnelResourceAccessor"
member = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Grants the Cloud Run service account read-only access to Compute Engine
# resources in the project.
# NOTE(review): presumably needed so `gcloud compute ssh` can look up the
# login VM by name; confirm this is the narrowest role that works.
resource "google_project_iam_member" "cloud_run_sa_compute_viewer" {
project = var.project_id
role = "roles/compute.viewer"
member = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}

# Grants the Cloud Run service account the Service Account User role.
# NOTE(review): this is bound at project level, which covers every service
# account in the project; confirm whether a binding on only the VM's service
# account would be sufficient.
resource "google_project_iam_member" "cloud_run_sa_service_account_user" {
project = var.project_id
role = "roles/iam.serviceAccountUser"
member = "serviceAccount:${google_service_account.cloud_run_sa.email}"
}


# Create a Storage Bucket to hold the zipped source code
# The zip uploaded here is the input to the Cloud Build submission.
# NOTE(review): Cloud Storage bucket names share one global namespace, so a
# fixed name like this may already be taken; consider deriving it from
# var.project_id.
resource "google_storage_bucket" "source_bucket" {
name = "my-project-source"
location = "us-central1"
uniform_bucket_level_access = true
}

# Zip the local source code
# Archives the whole invest_processes directory into a zip written next to
# this module.
# NOTE(review): everything under source_dir is included; confirm whether
# local artifacts (caches, workspaces) should be filtered with `excludes`.
data "archive_file" "source_zip" {
type = "zip"
source_dir = "${path.module}/../../../invest_processes"
output_path = "${path.module}/invest_processes.zip"
}

# Upload the zipped source code to the bucket
# Include the hash in the name to trigger a new upload when the source code changes
# NOTE(review): the object name is keyed on the archive's MD5 while the docker
# image tag is keyed on local.app_dir_sha1; both derive from the same source
# directory, but confirm they cannot drift apart.
resource "google_storage_bucket_object" "source_object" {
name = "source-${data.archive_file.source_zip.output_md5}.zip"
bucket = google_storage_bucket.source_bucket.name
source = data.archive_file.source_zip.output_path
}

# Run a gcloud command to start a Cloud Build from the uploaded source code
# There is currently not a working way to create a Cloud Build directly in terraform
# https://github.com/hashicorp/terraform-provider-google/issues/23057
#
# The image tag is assembled from the same repository attributes (location,
# project, repository_id) and the same source hash that the Cloud Run service
# uses for its `image`, so the service always pulls exactly what was built here.
resource "terraform_data" "manual_build_submission" {
  # Rebuild when a new source archive is uploaded, and also when the image tag
  # itself changes, so a tag the Cloud Run service expects is never left unbuilt.
  triggers_replace = [
    google_storage_bucket_object.source_object.id,
    local.app_dir_sha1,
  ]

  provisioner "local-exec" {
    command = <<-EOT
      gcloud builds submit gs://${google_storage_bucket.source_bucket.name}/${google_storage_bucket_object.source_object.name} \
        --tag ${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev/${google_artifact_registry_repository.my_repo.project}/${google_artifact_registry_repository.my_repo.repository_id}/pygeoapi-server:${local.app_dir_sha1} \
        --project ${var.project_id} \
        --region us-central1
    EOT
  }
}

# Create a Cloud Run service from the docker image that we built
resource "google_cloud_run_v2_service" "pygeoapi_service" {
provider = google-beta
name = "pygeoapi-service"
location = "us-central1"
deletion_protection = false

template {
service_account = google_service_account.cloud_run_sa.email
scaling {
min_instance_count = 1
}

template {
containers {
image = "nginx:alpine"
# Point to the Artifact Registry image that we built
image = "${google_artifact_registry_repository.my_repo.location}-docker.pkg.dev/${google_artifact_registry_repository.my_repo.project}/${google_artifact_registry_repository.my_repo.repository_id}/pygeoapi-server:${local.app_dir_sha1}"

# Mount the config volume
volume_mounts {
name = "nginx-conf"
mount_path = "/etc/nginx/conf.d"
ports {
container_port = 5000
}
}

annotations = {
# this causes terraform to redeploy the service whenever the secret changes
force-update-key = google_secret_manager_secret_version.nginx_config_data.name
}
# Define environment variables
env {
name = "HOME"
value = "/tmp"
}

# Create a volume containing the config defined below
volumes {
name = "nginx-conf"
secret {
secret = google_secret_manager_secret.nginx_config.secret_id
items {
version = "latest"
path = "default.conf"
startup_probe {
tcp_socket {
port = 5000
}
}

resources {
limits = {
"cpu" = "2"
"memory" = "8Gi"
}
}
}

service_account = google_service_account.cloud_run_sa.email

# Enable VPC egress so that this service can reach the internal server
vpc_access {
network_interfaces {
Expand All @@ -90,42 +178,16 @@ resource "google_cloud_run_v2_service" "proxy" {
egress = "ALL_TRAFFIC"
}
}
}

# Define the nginx config
# Though the contents are not really secret, storing the config data
# as a Secret is a convenient way to make it accessible as a volume
# in the Cloud Run service.
resource "google_secret_manager_secret" "nginx_config" {
secret_id = "proxy-nginx-config"
replication {
auto {}
}
# Ensure the build finishes before Cloud Run tries to pull the image
depends_on = [terraform_data.manual_build_submission]
}

resource "google_secret_manager_secret_version" "nginx_config_data" {
secret = google_secret_manager_secret.nginx_config.id

# This config listens on 8080 and proxies to the internal server
# Cloud Run listens on port 8080 by default
# No trailing slash on the URL tells it to pass the full path along
# TODO: get the interal server IP dynamically in terraform
secret_data = <<EOF
server {
listen 8080;
location / {
proxy_pass http://10.0.0.3:5000;
}
}
EOF
}


# Allow the Gateway SA to invoke the Cloud Run service
resource "google_cloud_run_v2_service_iam_binding" "invoker" {
project = var.project_id
location = var.region
name = google_cloud_run_v2_service.proxy.name
name = google_cloud_run_v2_service.pygeoapi_service.name
role = "roles/run.invoker"

members = [
Expand Down Expand Up @@ -194,7 +256,7 @@ resource "google_api_gateway_api_config" "api_cfg" {
contents = base64encode(
templatefile(
"../../bundled-openapi.yml",
{ backend_url = google_cloud_run_v2_service.proxy.uri }
{ backend_url = google_cloud_run_v2_service.pygeoapi_service.uri }
)
)
}
Expand Down Expand Up @@ -292,6 +354,7 @@ resource "google_compute_url_map" "default" {
variable "domain_name" {
description = "The load balancer domain name, e.g. 'compute.naturalcapitalalliance.org'"
type = string
default = "compute.naturalcapitalalliance.org"
}

# Create a Google-managed SSL certificate for the load balancer
Expand Down Expand Up @@ -377,6 +440,7 @@ resource "google_project_iam_member" "github_actions_uploader_binding" {
variable "github_repo" {
description = "The GitHub repo name, for example 'natcap/invest-compute'"
type = string
default = "natcap/invest-compute"
}

# Create Workload Identity Pool and Provider to authenticate GHA workflows
Expand Down
Loading
Loading