Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions terraform/files/node-problem-detector/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -457,8 +457,12 @@ settings:

cp ./config/*.py /host/tmp

chroot /host bash -c "chmod +x /tmp/check_gpu_setup.py"
chroot /host bash -c "/tmp/check_gpu_setup.py --slurm --$healthcheck_type > /tmp/oke-latest-${healthcheck_type}-healthcheck.log 2>&1"
if [ ! -e "/host/tmp/uv" ]; then
cp /usr/bin/uv /host/tmp/uv
chmod +x /host/tmp/uv
fi

EXIT_CODE=$(nsenter -n/host/proc/1/ns/net chroot /host bash -c "/tmp/uv run /tmp/check_gpu_setup.py --dry-run --$healthcheck_type > /tmp/oke-latest-${healthcheck_type}-healthcheck.log 2>&1")

ERROR_MSG=$(cat "$healtcheck_log_file" | grep "Healthcheck::")
UNKNOWN_MSG=$(cat "$healtcheck_log_file" | grep -o 'Skipping.*')
Expand All @@ -467,7 +471,7 @@ settings:
then
echo "${ERROR_MSG#* }"
exit $NONOK
elif [ "$ERROR_MSG" == "" ] && [ "$UNKNOWN_MSG" != "" ]
elif [ ! "${EXIT_CODE}" ] && [ "$UNKNOWN_MSG" != "" ]
then
echo $UNKNOWN_MSG
exit $UNKNOWN
Expand Down Expand Up @@ -521,7 +525,7 @@ logDir:

image:
repository: iad.ocir.io/idxzjcdglx2s/oke-npd
tag: v1.34.0-1
tag: v1.34.0-2
# image.digest -- the image digest. If given it takes precedence over a given tag.
digest: ""
pullPolicy: IfNotPresent
Expand Down
70 changes: 39 additions & 31 deletions terraform/oke-cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ locals {
bastion = var.create_bastion ? { create = "auto" } : { create = "never"}
operator = var.create_operator ? { create = "auto" } : { create = "never"}
int_lb = { create = "auto" }
pub_lb = alltrue([!var.create_vcn, var.pub_lb_sn_id == null, var.pub_lb_sn_cidr == null]) ? { create = "never" } : { create = "auto"}
pub_lb = { create = "auto" }
cp = { create = "auto" }
workers = { create = "auto" }
pods = { create = "auto" }
Expand All @@ -50,89 +50,97 @@ locals {
{
bastion = merge(
var.create_bastion ? { create = "auto" } : { create = "never" },
(var.create_vcn && var.bastion_sn_cidr == null) || (!var.create_vcn && var.bastion_sn_id == null) ?
(var.create_vcn && var.bastion_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 13, netnum = 1 } : {},
var.create_vcn && var.bastion_sn_cidr != null ?
{ cidr = var.bastion_sn_cidr } : {},
var.vcn_id != null && var.bastion_sn_id != null ?
{ id = var.bastion_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.bastion_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "bastion", {})
)
operator = var.create_operator ? merge(
operator = merge(
var.create_operator ? { create = "auto" } : { create = "never" },
(var.create_vcn && var.operator_sn_cidr == null) || (!var.create_vcn && var.operator_sn_id == null) ?
(var.create_vcn && var.operator_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 13, netnum = 2 } : {},
var.create_vcn && var.operator_sn_cidr != null ?
{ cidr = var.operator_sn_cidr } : {},
var.vcn_id != null && var.operator_sn_id != null ?
{ id = var.operator_sn_id } : {}
) : { create = "never" }
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.operator_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "operator", {})
)
int_lb = merge(
{ create = "auto" },
(var.create_vcn && var.int_lb_sn_cidr == null) || (!var.create_vcn && var.int_lb_sn_id == null) ?
(var.create_vcn && var.int_lb_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 11, netnum = 1 } : {},
var.create_vcn && var.int_lb_sn_cidr != null ?
{ cidr = var.int_lb_sn_cidr } : {},
var.vcn_id != null && var.int_lb_sn_id != null ?
{ id = var.int_lb_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.int_lb_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "int_lb", {})
)
pub_lb = merge(
{ create = "auto" },
(var.create_vcn && var.pub_lb_sn_cidr == null) || (!var.create_vcn && var.pub_lb_sn_id == null) ?
(var.create_vcn && var.pub_lb_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 11, netnum = 2 } : {},
var.create_vcn && var.pub_lb_sn_cidr != null ?
{ cidr = var.pub_lb_sn_cidr } : {},
var.vcn_id != null && var.pub_lb_sn_id != null ?
{ id = var.pub_lb_sn_id } : {},
alltrue([!var.create_vcn, var.pub_lb_sn_id == null, var.pub_lb_sn_cidr == null]) ? { create = "never" } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.pub_lb_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "pub_lb", {})
)
cp = merge(
{ create = "auto" },
(var.create_vcn && var.cp_sn_cidr == null) || (!var.create_vcn && var.cp_sn_id == null) ?
(var.create_vcn && var.cp_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 13, netnum = 0 } : {},
var.create_vcn && var.cp_sn_cidr != null ?
{ cidr = var.cp_sn_cidr } : {},
var.vcn_id != null && var.cp_sn_id != null ?
{ id = var.cp_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.cp_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "cp", {})
)
workers = merge(
{ create = "auto" },
(var.create_vcn && var.workers_sn_cidr == null) || (!var.create_vcn && var.workers_sn_id == null) ?
(var.create_vcn && var.workers_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 4, netnum = 2 } : {},
var.create_vcn && var.workers_sn_cidr != null ?
{ cidr = var.workers_sn_cidr } : {},
var.vcn_id != null && var.workers_sn_id != null ?
{ id = var.workers_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.workers_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "workers", {})
)
pods = merge(
{ create = "auto" },
(var.create_vcn && var.pods_sn_cidr == null) || (!var.create_vcn && var.pods_sn_id == null) ?
(var.create_vcn && var.pods_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 2, netnum = 2 } : {},
var.create_vcn && var.pods_sn_cidr != null ?
{ cidr = var.pods_sn_cidr } : {},
var.vcn_id != null && var.pods_sn_id != null ?
{ id = var.pods_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.pods_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "pods", {})
)
},
var.create_fss ? {
fss = merge(
{ create = "always" },
(var.create_vcn && var.fss_sn_cidr == null) || (!var.create_vcn && var.fss_sn_id == null) ?
(var.create_vcn && var.fss_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 11, netnum = 3 } : {},
var.create_vcn && var.fss_sn_cidr != null ?
{ cidr = var.fss_sn_cidr } : {},
var.vcn_id != null && var.fss_sn_id != null ?
{ id = var.fss_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.fss_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "fss", {})
)
} : {},
var.create_lustre ? {
lustre = merge(
{ create = "always" },
(var.create_vcn && var.lustre_sn_cidr == null) || (!var.create_vcn && var.lustre_sn_id == null) ?
(var.create_vcn && var.lustre_sn_cidr == null) || (!var.create_vcn && !var.custom_subnet_ids) ?
{ newbits = 7, netnum = 1 } : {},
var.create_vcn && var.lustre_sn_cidr != null ?
{ cidr = var.lustre_sn_cidr } : {},
var.vcn_id != null && var.lustre_sn_id != null ?
{ id = var.lustre_sn_id } : {}
!var.create_vcn && var.custom_subnet_ids ?
{ id = var.lustre_sn_id, create = "never" } : {},
lookup(var.subnet_advanced_attrs, "lustre", {})
)
} : {}
)
Expand Down
1 change: 1 addition & 0 deletions terraform/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ variableGroups:
# Monitoring
- wildcard_dns_domain
- monitoring_advanced_options
- subnet_advanced_attrs

- title: "Identity"
variables:
Expand Down
4 changes: 4 additions & 0 deletions terraform/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ variable "workers_sn_id" { default = null }
# Optional IDs of existing subnets (null by default). In oke-cluster.tf each one
# is passed as the subnet `id` (with create = "never") when an existing VCN is
# used together with custom subnet IDs.
variable "pods_sn_id" {
  default = null
}
variable "fss_sn_id" {
  default = null
}
variable "lustre_sn_id" {
  default = null
}
# Per-subnet attribute overrides, keyed by subnet name ("bastion", "operator",
# "int_lb", "pub_lb", "cp", "workers", "pods", "fss", "lustre"). In
# oke-cluster.tf the entry for each subnet is fetched with lookup(..., {}) and
# passed as the last argument to merge(), so its keys take precedence over the
# computed attributes. The default empty map applies no overrides.
variable "subnet_advanced_attrs" {
  type    = any
  default = {}
}
variable "networking_advanced_options" {
type = bool
default = false
Expand Down
20 changes: 10 additions & 10 deletions terraform/via-operator-helm-deployments.tf
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,16 @@ module "nginx" {
post_deployment_commands = flatten([
"cat <<'EOF' | kubectl apply -f -",
( var.use_lets_encrypt_prod_endpoint == true ?
split("\n", file("${path.root}/files/cert-manager/cluster-issuer-prod.yaml")) :
split("\n", file("${path.root}/files/cert-manager/cluster-issuer-staging.yaml"))
split("\n", file("${path.module}/files/cert-manager/cluster-issuer-prod.yaml")) :
split("\n", file("${path.module}/files/cert-manager/cluster-issuer-staging.yaml"))
),
"EOF",
"sleep 60" #wait for the LB to be provisioned
])
deployment_extra_args = ["--wait"]

helm_template_values_override = templatefile(
"${path.root}/files/nginx-ingress/values.yaml.tpl",
"${path.module}/files/nginx-ingress/values.yaml.tpl",
{
min_bw = 10,
max_bw = 100,
Expand Down Expand Up @@ -98,7 +98,7 @@ module "kube_prometheus_stack" {

post_deployment_commands = []

helm_template_values_override = templatefile("./files/kube-prometheus/values.yaml.tftpl", { preferred_kubernetes_services = var.preferred_kubernetes_services})
helm_template_values_override = templatefile("${path.module}/files/kube-prometheus/values.yaml.tftpl", { preferred_kubernetes_services = var.preferred_kubernetes_services})

helm_user_values_override = yamlencode(
{
Expand Down Expand Up @@ -143,7 +143,7 @@ module "node_problem_detector" {
deployment_extra_args = ["--force", "--dependency-update", "--history-max 1"]
post_deployment_commands = []

helm_template_values_override = file("${path.root}/files/node-problem-detector/values.yaml")
helm_template_values_override = file("${path.module}/files/node-problem-detector/values.yaml")
helm_user_values_override = ""

depends_on = [module.kube_prometheus_stack]
Expand All @@ -162,7 +162,7 @@ module "nvidia_dcgm_exporter" {

deployment_name = "dcgm-exporter"
namespace = var.monitoring_namespace
helm_chart_path = "${path.root}/files/nvidia-dcgm-exporter"
helm_chart_path = "${path.module}/files/nvidia-dcgm-exporter"
helm_chart_version = var.dcgm_exporter_chart_version

pre_deployment_commands = [
Expand All @@ -172,7 +172,7 @@ module "nvidia_dcgm_exporter" {
deployment_extra_args = ["--force", "--dependency-update", "--history-max 1"]
post_deployment_commands = []

helm_template_values_override = file("${path.root}/files/nvidia-dcgm-exporter/oke-values.yaml")
helm_template_values_override = file("${path.module}/files/nvidia-dcgm-exporter/oke-values.yaml")
helm_user_values_override = ""

depends_on = [module.kube_prometheus_stack]
Expand Down Expand Up @@ -202,7 +202,7 @@ module "amd_device_metrics_exporter" {
deployment_extra_args = ["--force", "--dependency-update", "--history-max 1"]
post_deployment_commands = []

helm_template_values_override = file("${path.root}/files/amd-device-metrics-exporter/values.yaml")
helm_template_values_override = file("${path.module}/files/amd-device-metrics-exporter/values.yaml")
helm_user_values_override = ""

depends_on = [module.kube_prometheus_stack]
Expand Down Expand Up @@ -232,7 +232,7 @@ module "lustre_client" {
post_deployment_commands = var.create_lustre_pv ? flatten([
"cat <<'EOF' | kubectl apply -f -",
split("\n", templatefile(
"${path.root}/files/lustre/lustre-pv.yaml.tpl",
"${path.module}/files/lustre/lustre-pv.yaml.tpl",
{
lustre_storage_size = floor(var.lustre_size_in_tb),
lustre_ip = one(oci_lustre_file_storage_lustre_file_system.lustre.*.management_service_address),
Expand Down Expand Up @@ -261,7 +261,7 @@ module "oke-ons-webhook" {

deployment_name = "oke-ons-webhook"
namespace = var.monitoring_namespace
helm_chart_path = "${path.root}/files/oke-ons-webhook"
helm_chart_path = "${path.module}/files/oke-ons-webhook"
helm_chart_version = var.oke_ons_webhook_chart_version

pre_deployment_commands = [
Expand Down
2 changes: 1 addition & 1 deletion terraform/via-provider-amd-device-metrics-exporter.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ resource "helm_release" "amd_device_metrics_exporter" {
chart = "device-metrics-exporter-charts"
repository = "https://rocm.github.io/device-metrics-exporter"
version = var.amd_device_metrics_exporter_chart_version
values = ["${file("./files/amd-device-metrics-exporter/values.yaml")}"]
values = ["${file("${path.module}/files/amd-device-metrics-exporter/values.yaml")}"]
create_namespace = false
recreate_pods = true
force_update = true
Expand Down
2 changes: 1 addition & 1 deletion terraform/via-provider-kube-prometheus-stack.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ resource "helm_release" "prometheus" {
chart = "kube-prometheus-stack"
repository = "https://prometheus-community.github.io/helm-charts"
version = var.prometheus_stack_chart_version
values = ["${templatefile("./files/kube-prometheus/values.yaml.tftpl", { preferred_kubernetes_services = var.preferred_kubernetes_services})}"]
values = ["${templatefile("${path.module}/files/kube-prometheus/values.yaml.tftpl", { preferred_kubernetes_services = var.preferred_kubernetes_services})}"]
create_namespace = true
recreate_pods = false
force_update = true
Expand Down
2 changes: 1 addition & 1 deletion terraform/via-provider-lustre-client.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ resource "kubectl_manifest" "lustre_pv" {
]

yaml_body = templatefile(
"${path.root}/files/lustre/lustre-pv.yaml.tpl",
"${path.module}/files/lustre/lustre-pv.yaml.tpl",
{
lustre_storage_size = floor(var.lustre_size_in_tb),
lustre_ip = one(oci_lustre_file_storage_lustre_file_system.lustre.*.management_service_address),
Expand Down
6 changes: 3 additions & 3 deletions terraform/via-provider-nginx.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ resource "helm_release" "nginx" {
version = var.nginx_chart_version
values = [
templatefile(
"${path.root}/files/nginx-ingress/values.yaml.tpl",
"${path.module}/files/nginx-ingress/values.yaml.tpl",
{
min_bw = 10,
max_bw = 100,
Expand Down Expand Up @@ -47,8 +47,8 @@ resource "kubectl_manifest" "cluster_issuer" {
]

yaml_body = (var.use_lets_encrypt_prod_endpoint ?
file("${path.root}/files/cert-manager/cluster-issuer-prod.yaml") :
file("${path.root}/files/cert-manager/cluster-issuer-staging.yaml")
file("${path.module}/files/cert-manager/cluster-issuer-prod.yaml") :
file("${path.module}/files/cert-manager/cluster-issuer-staging.yaml")
)
}

Expand Down
2 changes: 1 addition & 1 deletion terraform/via-provider-node-problem-detector.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ resource "helm_release" "node-problem_detector" {
chart = "node-problem-detector"
repository = "oci://ghcr.io/deliveryhero/helm-charts"
version = var.node_problem_detector_chart_version
values = ["${file("./files/node-problem-detector/values.yaml")}"]
values = ["${file("${path.module}/files/node-problem-detector/values.yaml")}"]
create_namespace = true
recreate_pods = true
force_update = true
Expand Down
4 changes: 2 additions & 2 deletions terraform/via-provider-nvidia-dcgm-exporter.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ resource "helm_release" "nvidia_dcgm_exporter" {
depends_on = [helm_release.prometheus]
namespace = var.monitoring_namespace
name = "dcgm-exporter"
chart = "${path.root}/files/nvidia-dcgm-exporter"
chart = "${path.module}/files/nvidia-dcgm-exporter"
version = var.dcgm_exporter_chart_version
values = ["${file("./files/nvidia-dcgm-exporter/oke-values.yaml")}"]
values = ["${file("${path.module}/files/nvidia-dcgm-exporter/oke-values.yaml")}"]
create_namespace = false
recreate_pods = true
force_update = true
Expand Down
2 changes: 1 addition & 1 deletion terraform/via-provider-oke-ons-webhook.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ resource "helm_release" "oke-ons-webhook" {
depends_on = [helm_release.prometheus]
namespace = var.monitoring_namespace
name = "oke-ons-webhook"
chart = "${path.root}/files/oke-ons-webhook"
chart = "${path.module}/files/oke-ons-webhook"
version = var.oke_ons_webhook_chart_version
set = [
{
Expand Down