diff --git a/modules/python/clusterloader2/slo/config/deployment_template.yaml b/modules/python/clusterloader2/slo/config/deployment_template.yaml index 91929229c9..786f80d951 100644 --- a/modules/python/clusterloader2/slo/config/deployment_template.yaml +++ b/modules/python/clusterloader2/slo/config/deployment_template.yaml @@ -3,6 +3,13 @@ {{$cnp_test:= .cnp_test}} {{$ccnp_test:= .ccnp_test}} +{{$EnableNetworkPolicyEnforcementLatencyTest := DefaultParam .EnableNetworkPolicyEnforcementLatencyTest false}} +{{$TargetLabelValue := DefaultParam .TargetLabelValue "enforcement-latency"}} +# Run a server pod for network policy enforcement latency test only on every Nth pod. +# Default run on every pod. +{{$NetPolServerOnEveryNthPod := DefaultParam .NetPolServerOnEveryNthPod 1}} +{{$RunNetPolicyTest := and $EnableNetworkPolicyEnforcementLatencyTest (eq (Mod .Index $NetPolServerOnEveryNthPod) 0)}} + {{$Image := DefaultParam .Image "mcr.microsoft.com/oss/kubernetes/pause:3.6"}} apiVersion: apps/v1 @@ -18,7 +25,7 @@ spec: replicas: {{.Replicas}} selector: matchLabels: - name: {{.Name}} + name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}} strategy: type: RollingUpdate rollingUpdate: @@ -27,15 +34,30 @@ spec: template: metadata: labels: - name: {{.Name}} + name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}} group: {{.Group}} {{if .SvcName}} svc: {{.SvcName}}-{{.Index}} {{end}} restart: {{.deploymentLabel}} +{{if $RunNetPolicyTest}} + net-pol-test: {{$TargetLabelValue}} +{{end}} spec: nodeSelector: slo: "true" +{{if $RunNetPolicyTest}} + hostNetwork: false + containers: + - image: acnpublic.azurecr.io/scaletest/nginx:latest + name: nginx-server + ports: + - containerPort: 80 + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} +{{else}} containers: - env: - name: ENV_VAR @@ -43,11 +65,12 @@ spec: image: {{$Image}} imagePullPolicy: IfNotPresent name: {{.Name}} - ports: + ports: [] resources: requests: cpu: {{$CpuRequest}} memory: {{$MemoryRequest}} +{{end}} # Add 
not-ready/unreachable tolerations for 15 minutes so that node # failure doesn't trigger pod deletion. tolerations: diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 3bdf630356..45d7ed8df6 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -2,6 +2,7 @@ name: load-config # Config options for test type {{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST true}} +{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} {{$CNP_TEST := DefaultParam .CL2_CNP_TEST false}} {{$CCNP_TEST := DefaultParam .CL2_CCNP_TEST false}} @@ -14,7 +15,7 @@ name: load-config {{$groupName := DefaultParam .CL2_GROUP_NAME "service-discovery"}} # TODO(jshr-w): This should eventually use >1 namespace. -{{$namespaces := 1}} +{{$namespaces := DefaultParam .CL2_NO_OF_NAMESPACES 1}} {{$nodes := DefaultParam .CL2_NODES 1000}} {{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} @@ -27,6 +28,7 @@ name: load-config {{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}} {{$CILIUM_METRICS_ENABLED := DefaultParam .CL2_CILIUM_METRICS_ENABLED false}} +{{$SCRAPE_KUBELETS := DefaultParam .CL2_SCRAPE_KUBELETS false}} {{$SCRAPE_CONTAINERD := DefaultParam .CL2_SCRAPE_CONTAINERD false}} # Service test @@ -75,7 +77,7 @@ tuningSets: timeLimit: {{$deletionTime}}s steps: - - name: Log - namespaces={{$namespaces}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, bigDeploymentsPerNamespace={{$bigDeploymentsPerNamespace}}, smallDeploymentsPerNamespace={{$smallDeploymentsPerNamespace}}, bigGroupSize={{$BIG_GROUP_SIZE}}, smallGroupSize={{$SMALL_GROUP_SIZE}}, repeats={{$repeats}}, $saturationTime={{$saturationTime}}, $deletionTime={{$deletionTime}} + - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, 
nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, bigDeploymentsPerNamespace={{$bigDeploymentsPerNamespace}}, smallDeploymentsPerNamespace={{$smallDeploymentsPerNamespace}}, bigGroupSize={{$BIG_GROUP_SIZE}}, smallGroupSize={{$SMALL_GROUP_SIZE}}, repeats={{$repeats}}, $saturationTime={{$saturationTime}}, $deletionTime={{$deletionTime}} measurements: - Identifier: Dummy Method: Sleep @@ -96,6 +98,20 @@ steps: action: start {{end}} +{{if $NETWORK_TEST}} + - module: + path: /modules/network-policy/net-policy-metrics.yaml + params: + action: start + +{{if $SCRAPE_KUBELETS}} + - module: + path: /modules/npm-measurements.yaml + params: + action: start +{{end}} +{{end}} + {{if $SCRAPE_CONTAINERD}} - module: path: /modules/containerd-measurements.yaml @@ -133,6 +149,15 @@ steps: ccnps: {{$CCNPS}} {{end}} +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + setup: true + run: true + testType: "pod-creation" +{{end}} + - module: path: /modules/reconcile-objects.yaml params: @@ -156,6 +181,26 @@ steps: Group: {{$groupName}} deploymentLabel: start +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + usePolicyCreationMetrics: false + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "pod-creation" + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + run: true + testType: "policy-creation" +{{end}} + - module: path: /modules/reconcile-objects.yaml params: @@ -252,3 +297,23 @@ steps: params: action: gather group: {{$groupName}} + +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + +{{if $SCRAPE_KUBELETS}} + - module: + path: /modules/npm-measurements.yaml + params: + action: gather +{{end}} + + - 
module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "policy-creation" +{{end}} diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml new file mode 100644 index 0000000000..9779d209f3 --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml @@ -0,0 +1,55 @@ +{{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE := DefaultParam .CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE "net-policy-client"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS 100}} +{{$NET_POLICY_ENFORCEMENT_LOAD_COUNT := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT 1000}} +{{$NET_POLICY_ENFORCEMENT_LOAD_QPS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS 10}} +{{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME := DefaultParam .CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME "small-deployment"}} + +{{$setup := DefaultParam .setup false}} +{{$run := DefaultParam .run false}} +{{$complete := DefaultParam .complete false}} +{{$testType := DefaultParam .testType "policy-creation"}} +# Target port needs to match the server container port of target pods that have +# "targetLabelKey: targetLabelValue" label selector. 
+{{$targetPort := 80}} + +steps: +{{if $setup}} +- name: Setup network policy enforcement latency measurement + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: setup + targetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + targetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + baseline: {{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE}} + testClientNodeSelectorValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE}} +{{end}} + +{{if $run}} +- name: "Run pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: run + testType: {{$testType}} + targetPort: {{$targetPort}} + maxTargets: {{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS}} + policyLoadCount: {{$NET_POLICY_ENFORCEMENT_LOAD_COUNT}} + policyLoadQPS: {{$NET_POLICY_ENFORCEMENT_LOAD_QPS}} + policyLoadTargetBaseName: {{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME}} +{{end}} + +{{if $complete}} +- name: "Complete pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: complete + testType: {{$testType}} +{{end}} \ No newline at end of file diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml new file mode 100644 index 0000000000..5be48be8bb --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml @@ -0,0 +1,122 @@ +# Valid actions: "start", "gather" +{{$action := .action}} +{{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}} +{{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}} +{{$useCiliumMetrics := 
DefaultParam .useCiliumMetrics true}} + +# CL2 params +# Negative default values are used to turn thresholds off if not overridden. Thresholds are only enabled with values of zero or higher. +{{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}} +{{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}} +{{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}} +{{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}} +{{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}} + +steps: +- name: "{{$action}}ing network policy metrics" + measurements: + - Identifier: NetworkPolicyEnforcementLatency + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Enforcement Latency" + metricVersion: v1 + unit: s + queries: + # Network policy enforcement metrics gathered from the test clients. 
+ {{if $usePolicyCreationMetrics}} + - name: PolicyCreation - TargetCount + query: sum(policy_enforcement_latency_policy_creation_seconds_count) + - name: PolicyCreation - Perc50 + query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc90 + query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc95 + query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc99 + query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + {{if $usePodCreationMetrics}} + - name: PodCreation - TargetCount + query: sum(pod_creation_reachability_latency_seconds_count) + - name: PodCreation - Perc50 + query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc90 + query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + - name: PodIpAssignedLatency - TargetCount + query: sum(pod_ip_address_assigned_latency_seconds_count) + - name: PodIpAssignedLatency - Perc50 + query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc90 + query: histogram_quantile(0.90, 
sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + + {{if $useCiliumMetrics}} + - Identifier: NetworkPolicyMetrics + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Performance" + metricVersion: v1 + unit: s + queries: + # Cilium agent metrics that are related to network policies. + - name: Number of times a policy import has failed + # To be replaced with the new Cilium metric that counts all policy changes, not just import errors. + # With that, this can be a percentage of failed imports. 
+ # https://github.com/cilium/cilium/pull/23349 + query: sum(cilium_policy_import_errors_total) + threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}} + - name: Failed endpoint regenerations percentage + query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100 + threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}} + - name: Policy regeneration time - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Policy regeneration time - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Time between a policy change and it being fully deployed into the datapath - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Time between a policy change and it being fully deployed into the datapath - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Latency of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Latency of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Duration of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Duration of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Endpoint regeneration latency - Perc50 + query: histogram_quantile(0.50, 
sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Endpoint regeneration latency - Perc99 + query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Number of policies currently loaded + query: avg(cilium_policy) + - name: Number of endpoints labeled by policy enforcement status + query: sum(cilium_policy_endpoint_enforcement_status) + {{end}} \ No newline at end of file diff --git a/modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml b/modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml new file mode 100644 index 0000000000..a1c685b982 --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml @@ -0,0 +1,66 @@ +{{$action := .action}} # start, gather +# Measurement modules for Azure NPM metrics + +steps: + - name: {{$action}} NPM Agent Metrics + measurements: + - Identifier: NPMAgentsAvgCPUUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Avg CPU Usage + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - Identifier: NPMAgentsMaxCPUUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Max CPU Usage + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, 
max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc90 + query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - Identifier: NPMAgentsAvgMemUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Avg Memory Usage + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.50, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - Identifier: NPMAgentsMaxMemUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Max Memory Usage + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.50, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) diff --git a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml index d3e08b0f8e..9c99f01dca 100644 --- 
a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml +++ b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml @@ -14,6 +14,13 @@ {{$smallDeploymentSize := .smallDeploymentSize}} {{$smallDeploymentsPerNamespace := .smallDeploymentsPerNamespace}} +{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} +{{$ENABLE_NETWORKPOLICIES := DefaultParam .CL2_NETWORK_TEST false}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_SERVER_EVERY_NTH_POD := DefaultParam .CL2_NET_POLICY_SERVER_EVERY_NTH_POD 3}} + {{$cnp_test:= .cnp_test}} {{$ccnp_test:= .ccnp_test}} @@ -58,6 +65,10 @@ steps: objectTemplatePath: deployment_template.yaml templateFillMap: Replicas: {{$smallDeploymentSize}} + EnableNetworkPolicyEnforcementLatencyTest: {{$NETWORK_TEST}} + TargetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + TargetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + NetPolServerOnEveryNthPod: {{$NET_POLICY_SERVER_EVERY_NTH_POD}} {{if or $cnp_test $ccnp_test}} cnp_test: {{$cnp_test}} ccnp_test: {{$ccnp_test}} diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index daa8867db2..15d8cddaee 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -23,12 +23,12 @@ } # TODO: Remove aks once CL2 update provider name to be azure -def calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, cnp_test, ccnp_test): +def calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, network_test, cnp_test, ccnp_test): throughput 
= 100 nodes_per_namespace = min(node_count, DEFAULT_NODES_PER_NAMESPACE) pods_per_node = DEFAULT_PODS_PER_NODE - if service_test: + if service_test or network_test: pods_per_node = max_pods if cnp_test or ccnp_test: @@ -52,8 +52,12 @@ def configure_clusterloader2( operation_timeout, provider, cilium_enabled, + scrape_kubelets, scrape_containerd, service_test, + network_test, + no_of_namespaces, + total_network_policies, cnp_test, ccnp_test, num_cnps, @@ -62,18 +66,19 @@ def configure_clusterloader2( override_file): steps = node_count // node_per_step - throughput, nodes_per_namespace, pods_per_node, cpu_request = calculate_config(cpu_per_node, node_per_step, max_pods, provider, service_test, cnp_test, ccnp_test) + throughput, nodes_per_namespace, pods_per_node, cpu_request = calculate_config(cpu_per_node, node_per_step, max_pods, provider, service_test, network_test, cnp_test, ccnp_test) with open(override_file, 'w', encoding='utf-8') as file: file.write(f"CL2_NODES: {node_count}\n") file.write(f"CL2_LOAD_TEST_THROUGHPUT: {throughput}\n") file.write(f"CL2_NODES_PER_NAMESPACE: {nodes_per_namespace}\n") file.write(f"CL2_NODES_PER_STEP: {node_per_step}\n") file.write(f"CL2_PODS_PER_NODE: {pods_per_node}\n") file.write(f"CL2_DEPLOYMENT_SIZE: {pods_per_node}\n") file.write(f"CL2_LATENCY_POD_CPU: {cpu_request}\n") file.write(f"CL2_REPEATS: {repeats}\n") file.write(f"CL2_STEPS: {steps}\n") + file.write(f"CL2_NO_OF_NAMESPACES: {no_of_namespaces}\n") file.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n") file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0\n") @@ -86,6 +91,9 @@ def configure_clusterloader2( file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n") file.write("CONTAINERD_SCRAPE_INTERVAL: 5m\n") + if scrape_kubelets: + file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n") + if cilium_enabled: 
file.write("CL2_CILIUM_METRICS_ENABLED: true\n") file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n") @@ -97,6 +106,26 @@ def configure_clusterloader2( else: file.write("CL2_SERVICE_TEST: false\n") + if network_test: + file.write("CL2_NETWORK_TEST: true\n") + file.write("CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST: true\n") + file.write("CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: true\n") + file.write("CL2_PROMETHEUS_SCRAPE_KUBE_PROXY: true\n") + file.write("CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD: 30s\n") + file.write("CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES: false\n") + file.write("CL2_NETWORK_LATENCY_THRESHOLD: 0s\n") + file.write("CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION: 1s\n") + file.write("CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY: true\n") + file.write("CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT: 15m\n") + file.write("CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE: false\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY: net-pol-test\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE: enforcement-latency\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE: net-policy-client\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS: 100\n") + file.write(f"CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT: {total_network_policies}\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS: 10\n") + file.write("CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME: small-deployment\n") + if cnp_test: file.write("CL2_CNP_TEST: true\n") file.write(f"CL2_CNPS_PER_NAMESPACE: {num_cnps}\n") @@ -136,11 +165,12 @@ def execute_clusterloader2( cl2_config_file, kubeconfig, provider, + scrape_kubelets, scrape_containerd ): run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True, - scrape_containerd=scrape_containerd) + scrape_kubelets=scrape_kubelets, 
scrape_containerd=scrape_containerd) def collect_clusterloader2( cpu_per_node, @@ -152,6 +182,7 @@ def collect_clusterloader2( run_id, run_url, service_test, + network_test, cnp_test, ccnp_test, result_file, @@ -168,7 +199,7 @@ def collect_clusterloader2( else: raise Exception(f"No testsuites found in the report! Raw data: {details}") - _, _, pods_per_node, _ = calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, cnp_test, ccnp_test) + _, _, pods_per_node, _ = calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, network_test, cnp_test, ccnp_test) pod_count = node_count * pods_per_node # TODO: Expose optional parameter to include test details @@ -238,10 +269,16 @@ def main(): parser_configure.add_argument("provider", type=str, help="Cloud provider name") parser_configure.add_argument("cilium_enabled", type=str2bool, choices=[True, False], default=False, help="Whether cilium is enabled. Must be either True or False") + parser_configure.add_argument("scrape_kubelets", type=str2bool, choices=[True, False], default=False, + help="Whether to scrape kubelet metrics. Must be either True or False") parser_configure.add_argument("scrape_containerd", type=str2bool, choices=[True, False], default=False, help="Whether to scrape containerd metrics. Must be either True or False") parser_configure.add_argument("service_test", type=str2bool, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") + parser_configure.add_argument("network_test", type=str2bool, choices=[True, False], nargs='?', default=False, + help="Whether network test is running. 
Must be either True or False") + parser_configure.add_argument("no_of_namespaces", type=int, nargs='?', default=1, help="Number of namespaces to create") + parser_configure.add_argument("total_network_policies", type=int, nargs='?', default=0, help="Total number of network policies to create") parser_configure.add_argument("cnp_test", type=str2bool, choices=[True, False], nargs='?', default=False, help="Whether cnp test is running. Must be either True or False") parser_configure.add_argument("ccnp_test", type=str2bool, choices=[True, False], nargs='?', default=False, @@ -265,6 +302,8 @@ def main(): parser_execute.add_argument("cl2_config_file", type=str, help="Path to the CL2 config file") parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file") parser_execute.add_argument("provider", type=str, help="Cloud provider name") + parser_execute.add_argument("scrape_kubelets", type=str2bool, choices=[True, False], default=False, + help="Whether to scrape kubelet metrics. Must be either True or False") parser_execute.add_argument("scrape_containerd", type=str2bool, choices=[True, False], default=False, help="Whether to scrape containerd metrics. Must be either True or False") @@ -280,6 +319,8 @@ def main(): parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("service_test", type=str2bool, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") + parser_collect.add_argument("network_test", type=str2bool, choices=[True, False], nargs='?', default=False, + help="Whether network test is running. Must be either True or False") parser_collect.add_argument("cnp_test", type=str2bool, choices=[True, False], nargs='?', default=False, help="Whether cnp test is running. 
Must be either True or False") parser_collect.add_argument("ccnp_test", type=str2bool, choices=[True, False], nargs='?', default=False, @@ -294,17 +335,18 @@ def main(): if args.command == "configure": configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods, args.repeats, args.operation_timeout, args.provider, - args.cilium_enabled, args.scrape_containerd, - args.service_test, args.cnp_test, args.ccnp_test, args.num_cnps, args.num_ccnps, args.dualstack, args.cl2_override_file) + args.cilium_enabled, args.scrape_kubelets, args.scrape_containerd, + args.service_test, args.network_test, args.no_of_namespaces, args.total_network_policies, + args.cnp_test, args.ccnp_test, args.num_cnps, args.num_ccnps, args.dualstack, args.cl2_override_file) elif args.command == "validate": validate_clusterloader2(args.node_count, args.operation_timeout) elif args.command == "execute": execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.cl2_config_file, - args.kubeconfig, args.provider, args.scrape_containerd) + args.kubeconfig, args.provider, args.scrape_kubelets, args.scrape_containerd) elif args.command == "collect": collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, - args.service_test, args.cnp_test, args.ccnp_test, + args.service_test, args.network_test, args.cnp_test, args.ccnp_test, args.result_file, args.test_type, args.start_timestamp) if __name__ == "__main__": diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml new file mode 100644 index 0000000000..ad0f68fa14 --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -0,0 +1,104 @@ +trigger: none +schedules: + - cron: "0 */6 * * *" + displayName: "Every 6 Hours Daily" + branches: + include: + - main + always: 
true + +parameters: + - name: node_count + type: number + default: 24 + - name: node_per_step + type: number + default: 240 + - name: pods_per_node + type: number + default: 40 + - name: repeats + type: number + default: 1 + - name: scale_timeout + type: string + default: "15m" + - name: no_of_namespaces + type: number + default: 10 + - name: total_network_policies + type: number + default: 4800 + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: network-policy-churn + SCENARIO_VERSION: main + OWNER: aks + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: network-churn + matrix: + azure_cilium: + cpu_per_node: 4 + node_count: ${{ parameters.node_count }} + node_per_step: ${{ parameters.node_per_step }} + max_pods: ${{ parameters.pods_per_node }} + repeats: ${{ parameters.repeats }} + scale_timeout: ${{ parameters.scale_timeout }} + no_of_namespaces: ${{ parameters.no_of_namespaces }} + total_network_policies: ${{ parameters.total_network_policies }} + cilium_enabled: True + npm_enabled: False + network_policy: cilium + network_dataplane: cilium + service_test: False + network_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false + - stage: azure_npm_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/azure/clusterloader2:v20241022" + topology: network-churn + matrix: + azure_cni: + cpu_per_node: 4 + node_count: ${{ parameters.node_count }} + node_per_step: ${{ parameters.node_per_step }} + max_pods: ${{ parameters.pods_per_node }} + repeats: ${{ parameters.repeats }} + scale_timeout: ${{ parameters.scale_timeout }} +
no_of_namespaces: ${{ parameters.no_of_namespaces }} + total_network_policies: ${{ parameters.total_network_policies }} + cilium_enabled: False + scrape_kubelets: True + npm_enabled: True + service_test: False + network_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false diff --git a/scenarios/perf-eval/network-policy-churn/kubernetes/azure-npm.yaml b/scenarios/perf-eval/network-policy-churn/kubernetes/azure-npm.yaml new file mode 100644 index 0000000000..551b264179 --- /dev/null +++ b/scenarios/perf-eval/network-policy-churn/kubernetes/azure-npm.yaml @@ -0,0 +1,173 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: azure-npm + namespace: kube-system + labels: + addonmanager.kubernetes.io/mode: EnsureExists +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: azure-npm + namespace: kube-system + labels: + addonmanager.kubernetes.io/mode: EnsureExists +rules: + - apiGroups: + - "" + resources: + - pods + - nodes + - namespaces + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: azure-npm-binding + namespace: kube-system + labels: + addonmanager.kubernetes.io/mode: EnsureExists +subjects: + - kind: ServiceAccount + name: azure-npm + namespace: kube-system +roleRef: + kind: ClusterRole + name: azure-npm + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: azure-npm + namespace: kube-system + labels: + app: azure-npm + addonmanager.kubernetes.io/mode: EnsureExists +spec: + selector: + matchLabels: + k8s-app: azure-npm + template: + metadata: + labels: + k8s-app: azure-npm + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + azure.npm/scrapeable: "" + spec: + priorityClassName:
system-node-critical + tolerations: + - operator: "Exists" + effect: NoExecute + - operator: "Exists" + effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists + containers: + - name: azure-npm + image: mcr.microsoft.com/containernetworking/azure-npm:v1.4.45.3 + resources: {} + securityContext: + privileged: false + capabilities: + add: + - NET_ADMIN + readOnlyRootFilesystem: true + env: + - name: HOSTNAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: NPM_CONFIG + value: /etc/azure-npm/azure-npm.json + volumeMounts: + - name: log + mountPath: /var/log + - name: xtables-lock + mountPath: /run/xtables.lock + - name: protocols + mountPath: /etc/protocols + - name: azure-npm-config + mountPath: /etc/azure-npm + - name: tmp + mountPath: /tmp + hostNetwork: true + hostUsers: true + nodeSelector: + kubernetes.io/os: linux + volumes: + - name: log + hostPath: + path: /var/log + type: Directory + - name: xtables-lock + hostPath: + path: /run/xtables.lock + type: File + - name: protocols + hostPath: + path: /etc/protocols + type: File + - name: azure-npm-config + configMap: + name: azure-npm-config + - name: tmp + emptyDir: {} + serviceAccountName: azure-npm +--- +apiVersion: v1 +kind: Service +metadata: + name: npm-metrics-cluster-service + namespace: kube-system + labels: + app: npm-metrics +spec: + selector: + k8s-app: azure-npm + ports: + - port: 9000 + targetPort: 10091 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: azure-npm-config + namespace: kube-system +data: + azure-npm.json: | + { + "ResyncPeriodInMinutes": 15, + "ListeningPort": 10091, + "ListeningAddress": "0.0.0.0", + "ApplyIntervalInMilliseconds": 500, + "ApplyMaxBatches": 100, + "MaxBatchedACLsPerPod": 30, + "NetPolInvervalInMilliseconds": 500, + "MaxPendingNetPols": 100, + "Toggles": { + "EnablePrometheusMetrics": true, + "EnablePprof": true, + "EnableHTTPDebugAPI": true, + "EnableV2NPM": true, + "PlaceAzureChainFirst": false, + "ApplyInBackground":
true, + "NetPolInBackground": true + }, + "LogLevel": "info" + } diff --git a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..80408f971d --- /dev/null +++ b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars @@ -0,0 +1,70 @@ +scenario_type = "perf-eval" +scenario_name = "network-policy-churn" +deletion_delay = "3h" +owner = "aks" + +network_config_list = [ + { + role = "net" + vnet_name = "net-vnet" + vnet_address_space = "10.0.0.0/9" + subnet = [ + { + name = "net-subnet-1" + address_prefix = "10.0.0.0/16" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_config_list = [ + { + role = "net" + aks_name = "net-pol-test" + dns_prefix = "net" + subnet_name = "net-subnet-1" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + pod_cidr = "10.128.0.0/9" + service_cidr = "192.168.0.0/16" + dns_service_ip = "192.168.0.10" + } + default_node_pool = { + name = "default" + node_count = 5 + auto_scaling_enabled = false + vm_size = "Standard_D16ds_v5" + os_disk_type = "Managed" + only_critical_addons_enabled = false + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "promnodepool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D64_v3" + max_pods = 110 + node_labels = { "prometheus" = "true" } + }, + { + name = "userpool0" + node_count = 240 + min_count = 0 + max_count = 500 + auto_scaling_enabled = false + vm_size = "Standard_D4_v3" + max_pods = 250 + node_labels = { "slo" = "true", + "test-np" = "net-policy-client" } + } + ] + kubernetes_version = "1.32" + } +] + diff --git a/scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json b/scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json new file mode 100644 index 
0000000000..ea27a572c6 --- /dev/null +++ b/scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} diff --git a/steps/engine/clusterloader2/network-policy-scale/execute.yml b/steps/engine/clusterloader2/network-policy-scale/execute.yml index 3f618afe90..11de7f0bf4 100644 --- a/steps/engine/clusterloader2/network-policy-scale/execute.yml +++ b/steps/engine/clusterloader2/network-policy-scale/execute.yml @@ -9,12 +9,6 @@ parameters: type: string steps: - - script: | - echo "Set the start time for test execution" - startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - echo "Start: $startTimestamp" - echo "##vso[task.setvariable variable=SLO_START_TIME]$startTimestamp" - displayName: set up timestamp variable - script: | set -eo pipefail diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index b51b719944..73741ce244 100644 --- a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -17,7 +17,7 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ $CPU_PER_NODE $NODE_COUNT ${MAX_PODS:-0} \ - $REPEATS $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST ${CNP_TEST:-False} \ + $REPEATS $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST ${NETWORK_TEST:-False} ${CNP_TEST:-False} \ ${CCNP_TEST:-False} $TEST_RESULTS_FILE \ $TEST_TYPE $SLO_START_TIME workingDirectory: modules/python diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 09ff52bcd5..ffe5a56049 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -21,11 +21,12 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \ - $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \ - 
$SERVICE_TEST ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml + $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_KUBELETS:-False} ${SCRAPE_CONTAINERD:-False} \ + $SERVICE_TEST ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} \ + ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ - ${HOME}/.kube/config $CLOUD ${SCRAPE_CONTAINERD:-False} + ${HOME}/.kube/config $CLOUD ${SCRAPE_KUBELETS:-False} ${SCRAPE_CONTAINERD:-False} workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: diff --git a/steps/execute-tests.yml b/steps/execute-tests.yml index 84b6e92a2b..ed8a32c752 100644 --- a/steps/execute-tests.yml +++ b/steps/execute-tests.yml @@ -13,6 +13,7 @@ parameters: default: {} steps: + - template: /steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self parameters: cloud: ${{ parameters.cloud }} diff --git a/steps/topology/network-churn/collect-clusterloader2.yml b/steps/topology/network-churn/collect-clusterloader2.yml new file mode 100644 index 0000000000..272b3d1291 --- /dev/null +++ b/steps/topology/network-churn/collect-clusterloader2.yml @@ -0,0 +1,30 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/slo/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} + +- template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + nodes_per_nodepool: 0 + enable_autoscale: "false" + +- script: | + 
run_id=$(Build.BuildId)-$(System.JobId) + echo "Run ID: $run_id" + echo "##vso[task.setvariable variable=RUN_ID]$run_id" + displayName: "Set unique Run ID before publish" diff --git a/steps/topology/network-churn/execute-clusterloader2.yml b/steps/topology/network-churn/execute-clusterloader2.yml new file mode 100644 index 0000000000..d084b2ef03 --- /dev/null +++ b/steps/topology/network-churn/execute-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/slo/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml new file mode 100644 index 0000000000..9840317079 --- /dev/null +++ b/steps/topology/network-churn/validate-resources.yml @@ -0,0 +1,41 @@ +parameters: + - name: cloud + type: string + - name: engine + type: string + - name: regions + type: object + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + + - script: | + #!/bin/bash + set -e + + # Check if NPM_ENABLED environment variable is set to true + if [[ "$NPM_ENABLED" == "True" ]]; then + echo "NPM is enabled, applying Azure NPM configuration..." + + # Path to the NPM configuration file + npm_config_file="$(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/azure-npm.yaml" + + # Check if the file exists + if [[ -f "$npm_config_file" ]]; then + echo "Applying NPM configuration from: $npm_config_file" + kubectl apply -f "$npm_config_file" + echo "Azure NPM configuration applied successfully!" 
+ else + echo "Error: NPM configuration file not found at $npm_config_file" + echo "Looking for file in current directory:" + ls -la "$(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/" + exit 1 + fi + else + echo "NPM is not enabled, skipping Azure NPM configuration." + fi + displayName: 'Apply Azure NPM Configuration (if enabled)' + condition: and(succeeded(), eq('${{ parameters.cloud }}', 'azure')) diff --git a/steps/topology/network-policy-scale/collect-clusterloader2.yml b/steps/topology/network-policy-scale/collect-clusterloader2.yml index 430bf27e1e..12cd440c24 100644 --- a/steps/topology/network-policy-scale/collect-clusterloader2.yml +++ b/steps/topology/network-policy-scale/collect-clusterloader2.yml @@ -15,8 +15,22 @@ steps: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} + - script: | run_id=$(Build.BuildId)-$(System.JobId) echo "Run ID: $run_id" echo "##vso[task.setvariable variable=RUN_ID]$run_id" displayName: "Set unique Run ID before publish" + +- script: | + # Append timestamp to the test results file if the file exists + set -eux + if [ -f "$(TEST_RESULTS_FILE)" ]; then + jq --arg start_timestamp "$START" \ + -c '. + {start_timestamp: $start_timestamp}' "$(TEST_RESULTS_FILE)" > "temp-$START.json" \ + && mv "temp-$START.json" "$(TEST_RESULTS_FILE)" + else + echo "##vso[task.logissue type=warning;]File $(TEST_RESULTS_FILE) does not exist." + fi + displayName: "Add timestamp to Test Results" + condition: always()