From ccf144e2c9de54fe607f1f4f6a21abdcf010cd60 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 2 Apr 2025 09:39:39 +0100 Subject: [PATCH 01/36] Add network churn validation and enhance test execution with timestamps --- jobs/competitive-test.yml | 8 ++ .../slo/config/deployment_template.yaml | 29 ++++- .../slo/config/load-config.yaml | 71 +++++++++- .../net-policy-enforcement-latency.yaml | 55 ++++++++ .../network-policy/net-policy-metrics.yaml | 122 ++++++++++++++++++ .../slo/config/modules/reconcile-objects.yaml | 13 ++ modules/python/clusterloader2/slo/slo.py | 51 +++++++- .../network-churn/cilium-network-churn.yml | 102 +++++++++++++++ steps/collect-telescope-metadata.yml | 4 +- steps/engine/clusterloader2/slo/collect.yml | 3 +- steps/engine/clusterloader2/slo/execute.yml | 3 +- steps/execute-tests.yml | 7 + steps/setup-tests.yml | 14 +- .../network-churn/collect-clusterloader2.yml | 23 ++++ .../network-churn/execute-clusterloader2.yml | 17 +++ .../network-churn/validate-resources.yml | 13 ++ 16 files changed, 514 insertions(+), 21 deletions(-) create mode 100644 modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml create mode 100644 modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml create mode 100644 pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml create mode 100644 steps/topology/network-churn/collect-clusterloader2.yml create mode 100644 steps/topology/network-churn/execute-clusterloader2.yml create mode 100644 steps/topology/network-churn/validate-resources.yml diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index c4872ad11c..dffed3eba7 100644 --- a/jobs/competitive-test.yml +++ b/jobs/competitive-test.yml @@ -33,6 +33,9 @@ parameters: - name: run_id type: string default: '' +- name: run_id_2 # This cluster will be used if 'use_secondary_cluster' is true + type: string + default: '' - name: timeout_in_minutes type: number 
default: 60 # default when not specified is 60 minutes @@ -48,6 +51,9 @@ parameters: - name: ssh_key_enabled type: boolean default: true +- name: use_secondary_cluster # Set it to true if you want to use a secondary cluster(run_id_2) for the test + type: boolean + default: false jobs: - job: ${{ parameters.cloud }} @@ -62,10 +68,12 @@ jobs: cloud: ${{ parameters.cloud }} region: ${{ parameters.regions[0] }} run_id: ${{ parameters.run_id }} + run_id_2: ${{ parameters.run_id_2 }} test_modules_dir: ${{ parameters.test_modules_dir }} retry_attempt_count: ${{ parameters.retry_attempt_count }} credential_type: ${{ parameters.credential_type }} ssh_key_enabled: ${{ parameters.ssh_key_enabled }} + use_secondary_cluster: ${{ parameters.use_secondary_cluster }} - template: /steps/provision-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/modules/python/clusterloader2/slo/config/deployment_template.yaml b/modules/python/clusterloader2/slo/config/deployment_template.yaml index 91929229c9..786f80d951 100644 --- a/modules/python/clusterloader2/slo/config/deployment_template.yaml +++ b/modules/python/clusterloader2/slo/config/deployment_template.yaml @@ -3,6 +3,13 @@ {{$cnp_test:= .cnp_test}} {{$ccnp_test:= .ccnp_test}} +{{$EnableNetworkPolicyEnforcementLatencyTest := DefaultParam .EnableNetworkPolicyEnforcementLatencyTest false}} +{{$TargetLabelValue := DefaultParam .TargetLabelValue "enforcement-latency"}} +# Run a server pod for network policy enforcement latency test only on every Nth pod. +# Default run on every pod. 
+{{$NetPolServerOnEveryNthPod := 1}} +{{$RunNetPolicyTest := and $EnableNetworkPolicyEnforcementLatencyTest (eq (Mod .Index $NetPolServerOnEveryNthPod) 0)}} + {{$Image := DefaultParam .Image "mcr.microsoft.com/oss/kubernetes/pause:3.6"}} apiVersion: apps/v1 @@ -18,7 +25,7 @@ spec: replicas: {{.Replicas}} selector: matchLabels: - name: {{.Name}} + name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}} strategy: type: RollingUpdate rollingUpdate: @@ -27,15 +34,30 @@ spec: template: metadata: labels: - name: {{.Name}} + name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}} group: {{.Group}} {{if .SvcName}} svc: {{.SvcName}}-{{.Index}} {{end}} restart: {{.deploymentLabel}} +{{if $RunNetPolicyTest}} + net-pol-test: {{$TargetLabelValue}} +{{end}} spec: nodeSelector: slo: "true" +{{if $RunNetPolicyTest}} + hostNetwork: false + containers: + - image: acnpublic.azurecr.io/scaletest/nginx:latest + name: nginx-server + ports: + - containerPort: 80 + resources: + requests: + cpu: {{$CpuRequest}} + memory: {{$MemoryRequest}} +{{else}} containers: - env: - name: ENV_VAR @@ -43,11 +65,12 @@ spec: image: {{$Image}} imagePullPolicy: IfNotPresent name: {{.Name}} - ports: + ports: [] resources: requests: cpu: {{$CpuRequest}} memory: {{$MemoryRequest}} +{{end}} # Add not-ready/unreachable tolerations for 15 minutes so that node # failure doesn't trigger pod deletion. 
tolerations: diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 731d744f84..b6d4bb879d 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -1,12 +1,13 @@ name: load-config # Config options for test type -{{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST true}} +{{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST false}} +{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} {{$CNP_TEST := DefaultParam .CL2_CNP_TEST false}} {{$CCNP_TEST := DefaultParam .CL2_CCNP_TEST false}} # Config options for test parameters -{{$nodesPerNamespace := DefaultParam .CL2_NODES_PER_NAMESPACE 100}} +{{$nodesPerNamespace := DefaultParam .CL2_NODES_PER_NAMESPACE 100}} #TODO itia: check if its passed right {{$podsPerNode := DefaultParam .CL2_PODS_PER_NODE 50}} {{$loadTestThroughput := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 100}} {{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 100}} @@ -14,12 +15,12 @@ name: load-config {{$groupName := DefaultParam .CL2_GROUP_NAME "service-discovery"}} # TODO(jshr-w): This should eventually use >1 namespace. -{{$namespaces := 1}} +{{$namespaces := DefaultParam .CL2_NO_OF_NAMESPACES 1}} {{$nodes := DefaultParam .CL2_NODES 1000}} {{$deploymentQPS := DivideFloat $loadTestThroughput $deploymentSize}} {{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} -{{$totalPods := MultiplyInt $namespaces $nodes $podsPerNode}} +{{$totalPods := MultiplyInt $namespaces $nodesPerNamespace $podsPerNode}} #TODO itia: can it break existing tests? 
{{$podsPerNamespace := DivideInt $totalPods $namespaces}} {{$deploymentsPerNamespace := DivideInt $podsPerNamespace $deploymentSize}} @@ -40,7 +41,16 @@ name: load-config {{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}} {{end}} -{{$smallDeploymentPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} +# Use explicit conditional block to assign smallDeploymentPods to maintain backward compatibility +{{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} + +{{$smallDeploymentPods := 0}} +{{if $NETWORK_TEST}} + {{$smallDeploymentPods = $podsPerNamespace}} +{{else}} + {{$smallDeploymentPods = $calculatedPods}} +{{end}} + {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} # CNP & CCNP Test @@ -67,7 +77,7 @@ tuningSets: qps: {{$deploymentQPS}} steps: - - name: Log - namespaces={{$namespaces}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}} + - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}} measurements: - Identifier: Dummy Method: Sleep @@ -88,6 +98,13 @@ steps: action: start {{end}} +{{if $NETWORK_TEST}} + - module: + path: /modules/network-policy/net-policy-metrics.yaml + params: + action: start +{{end}} + {{if $SCRAPE_CONTAINERD}} - module: path: /modules/containerd-measurements.yaml @@ -125,6 +142,15 @@ steps: ccnps: {{$CCNPS}} {{end}} +{{if $NETWORK_TEST}} + - module: + path: 
modules/network-policy/net-policy-enforcement-latency.yaml + params: + setup: true + run: true + testType: "pod-creation" +{{end}} + - module: path: /modules/reconcile-objects.yaml params: @@ -148,6 +174,26 @@ steps: Group: {{$groupName}} deploymentLabel: start +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + usePolicyCreationMetrics: false + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "pod-creation" + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + run: true + testType: "policy-creation" +{{end}} + - module: path: /modules/reconcile-objects.yaml params: @@ -244,3 +290,16 @@ steps: params: action: gather group: {{$groupName}} + +{{if $NETWORK_TEST}} + - module: + path: modules/network-policy/net-policy-metrics.yaml + params: + action: gather + + - module: + path: modules/network-policy/net-policy-enforcement-latency.yaml + params: + complete: true + testType: "policy-creation" +{{end}} diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml new file mode 100644 index 0000000000..9779d209f3 --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-enforcement-latency.yaml @@ -0,0 +1,55 @@ +{{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE := DefaultParam .CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE := DefaultParam 
.CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE "net-policy-client"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS 100}} +{{$NET_POLICY_ENFORCEMENT_LOAD_COUNT := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT 1000}} +{{$NET_POLICY_ENFORCEMENT_LOAD_QPS := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS 10}} +{{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME := DefaultParam .CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME "small-deployment"}} + +{{$setup := DefaultParam .setup false}} +{{$run := DefaultParam .run false}} +{{$complete := DefaultParam .complete false}} +{{$testType := DefaultParam .testType "policy-creation"}} +# Target port needs to match the server container port of target pods that have +# "targetLabelKey: targetLabelValue" label selector. +{{$targetPort := 80}} + +steps: +{{if $setup}} +- name: Setup network policy enforcement latency measurement + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: setup + targetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + targetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + baseline: {{$NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE}} + testClientNodeSelectorValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE}} +{{end}} + +{{if $run}} +- name: "Run pod creation network policy enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: run + testType: {{$testType}} + targetPort: {{$targetPort}} + maxTargets: {{$NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS}} + policyLoadCount: {{$NET_POLICY_ENFORCEMENT_LOAD_COUNT}} + policyLoadQPS: {{$NET_POLICY_ENFORCEMENT_LOAD_QPS}} + policyLoadTargetBaseName: {{$NET_POLICY_ENFORCEMENT_LOAD_TARGET_NAME}} +{{end}} + +{{if $complete}} +- name: "Complete pod creation network policy 
enforcement latency measurement (testType={{$testType}})" + measurements: + - Identifier: NetworkPolicyEnforcement + Method: NetworkPolicyEnforcement + Params: + action: complete + testType: {{$testType}} +{{end}} \ No newline at end of file diff --git a/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml new file mode 100644 index 0000000000..5be48be8bb --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/network-policy/net-policy-metrics.yaml @@ -0,0 +1,122 @@ +# Valid actions: "start", "gather" +{{$action := .action}} +{{$usePolicyCreationMetrics := DefaultParam .usePolicyCreationMetrics true}} +{{$usePodCreationMetrics := DefaultParam .usePodCreationMetrics true}} +{{$useCiliumMetrics := DefaultParam .useCiliumMetrics true}} + +# CL2 params +# Negative default values are used to turn thresholds off if not overridden. Thresholds are only enabled with values of zero or higher. 
+{{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS -1}} +{{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS := DefaultParam .CL2_NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS -1}} +{{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD 0}} +{{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD 0.01}} +{{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_POLICY_REGEN_TIME_99_THRESHOLD -1}} +{{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD := DefaultParam .CL2_CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD -1}} + +steps: +- name: "{{$action}}ing network policy metrics" + measurements: + - Identifier: NetworkPolicyEnforcementLatency + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Enforcement Latency" + metricVersion: v1 + unit: s + queries: + # Network policy enforcement metrics gathered from the test clients. 
+ {{if $usePolicyCreationMetrics}} + - name: PolicyCreation - TargetCount + query: sum(policy_enforcement_latency_policy_creation_seconds_count) + - name: PolicyCreation - Perc50 + query: histogram_quantile(0.5, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc90 + query: histogram_quantile(0.9, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc95 + query: histogram_quantile(0.95, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + - name: PolicyCreation - Perc99 + query: histogram_quantile(0.99, sum(policy_enforcement_latency_policy_creation_seconds_bucket) by (le)) + {{if ge $NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POLICY_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + {{if $usePodCreationMetrics}} + - name: PodCreation - TargetCount + query: sum(pod_creation_reachability_latency_seconds_count) + - name: PodCreation - Perc50 + query: histogram_quantile(0.5, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc90 + query: histogram_quantile(0.9, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + - name: PodCreation - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_creation_reachability_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_CREATION_99_THRESHOLD_SECONDS}} + {{end}} + - name: PodIpAssignedLatency - TargetCount + query: sum(pod_ip_address_assigned_latency_seconds_count) + - name: PodIpAssignedLatency - Perc50 + query: histogram_quantile(0.50, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc90 + query: histogram_quantile(0.90, 
sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc95 + query: histogram_quantile(0.95, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + - name: PodIpAssignedLatency - Perc99 + query: histogram_quantile(0.99, sum(rate(pod_ip_address_assigned_latency_seconds_bucket[%v])) by (le)) + {{if ge $NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS 0}} + threshold: {{$NP_ENFORCE_POD_IP_ASSIGNED_99_THRESHOLD_SECONDS}} + {{end}} + {{end}} + + {{if $useCiliumMetrics}} + - Identifier: NetworkPolicyMetrics + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: "Network Policy Performance" + metricVersion: v1 + unit: s + queries: + # Cilium agent metrics that are related to network policies. + - name: Number of times a policy import has failed + # To be replaced with the new Cilium metric that counts all policy changes, not just import errors. + # With that, this can be a percentage of failed imports. 
+ # https://github.com/cilium/cilium/pull/23349 + query: sum(cilium_policy_import_errors_total) + threshold: {{$CILIUM_POLICY_IMPORTS_ERROR_THRESHOLD}} + - name: Failed endpoint regenerations percentage + query: sum(cilium_endpoint_regenerations_total{outcome="fail"}) / sum(cilium_endpoint_regenerations_total) * 100 + threshold: {{$CILIUM_ENDPOINT_REGEN_FAIL_PERC_THRESHOLD}} + - name: Policy regeneration time - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Policy regeneration time - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_POLICY_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_POLICY_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Time between a policy change and it being fully deployed into the datapath - Perc50 + query: histogram_quantile(0.50, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Time between a policy change and it being fully deployed into the datapath - Perc99 + query: histogram_quantile(0.99, sum(cilium_policy_implementation_delay_bucket) by (le)) + - name: Latency of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Latency of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="latency"}) by (le)) + - name: Duration of policy update trigger - Perc50 + query: histogram_quantile(0.50, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Duration of policy update trigger - Perc99 + query: histogram_quantile(0.99, sum(cilium_triggers_policy_update_call_duration_seconds_bucket{type="duration"}) by (le)) + - name: Endpoint regeneration latency - Perc50 + query: histogram_quantile(0.50, 
sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + - name: Endpoint regeneration latency - Perc99 + query: histogram_quantile(0.99, sum(cilium_endpoint_regeneration_time_stats_seconds_bucket{scope="total"}) by (le)) + {{if ge $CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD 0}} + threshold: {{$CILIUM_ENDPOINT_REGEN_TIME_99_THRESHOLD}} + {{end}} + - name: Number of policies currently loaded + query: avg(cilium_policy) + - name: Number of endpoints labeled by policy enforcement status + query: sum(cilium_policy_endpoint_enforcement_status) + {{end}} \ No newline at end of file diff --git a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml index d3e08b0f8e..0ae97f20c5 100644 --- a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml +++ b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml @@ -14,6 +14,13 @@ {{$smallDeploymentSize := .smallDeploymentSize}} {{$smallDeploymentsPerNamespace := .smallDeploymentsPerNamespace}} +{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} +{{$ENABLE_NETWORKPOLICIES := DefaultParam .CL2_NETWORK_TEST false}} +{{$ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST := DefaultParam .CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST false}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY "net-pol-test"}} +{{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE := DefaultParam .CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE "enforcement-latency"}} +{{$NET_POLICY_SERVER_EVERY_NTH_POD := DefaultParam .CL2_NET_POLICY_SERVER_EVERY_NTH_POD 3}} + {{$cnp_test:= .cnp_test}} {{$ccnp_test:= .ccnp_test}} @@ -35,6 +42,7 @@ steps: - name: {{$actionName}} phases: +{{if not $NETWORK_TEST}} - namespaceRange: min: 1 max: {{$namespaces}} @@ -48,6 +56,7 @@ steps: SvcName: big-service Group: {{.Group}} deploymentLabel: 
{{.deploymentLabel}} +{{end}} - namespaceRange: min: 1 max: {{$namespaces}} @@ -58,6 +67,10 @@ steps: objectTemplatePath: deployment_template.yaml templateFillMap: Replicas: {{$smallDeploymentSize}} + EnableNetworkPolicyEnforcementLatencyTest: {{$NETWORK_TEST}} + TargetLabelKey: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY}} + TargetLabelValue: {{$NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE}} + NetPolServerOnEveryNthPod: {{$NET_POLICY_SERVER_EVERY_NTH_POD}} {{if or $cnp_test $ccnp_test}} cnp_test: {{$cnp_test}} ccnp_test: {{$ccnp_test}} diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index d89972b906..8c169d1e5a 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -23,12 +23,15 @@ } # TODO: Remove aks once CL2 update provider name to be azure -def calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, cnp_test, ccnp_test): +def calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, network_test, cnp_test, ccnp_test): throughput = 100 nodes_per_namespace = min(node_count, DEFAULT_NODES_PER_NAMESPACE) - pods_per_node = DEFAULT_PODS_PER_NODE if service_test: + nodes_per_namespace = 24 #TODO: itia: fix hardcoded value + + pods_per_node = DEFAULT_PODS_PER_NODE + if service_test or network_test: pods_per_node = max_pods if cnp_test or ccnp_test: @@ -54,6 +57,9 @@ def configure_clusterloader2( cilium_enabled, scrape_containerd, service_test, + network_test, + no_of_namespaces, + total_network_policies, cnp_test, ccnp_test, num_cnps, @@ -62,18 +68,20 @@ def configure_clusterloader2( override_file): steps = node_count // node_per_step - throughput, nodes_per_namespace, pods_per_node, cpu_request = calculate_config(cpu_per_node, node_per_step, max_pods, provider, service_test, cnp_test, ccnp_test) + throughput, nodes_per_namespace, pods_per_node, cpu_request = calculate_config(cpu_per_node, node_per_step, max_pods, 
provider, service_test, network_test, cnp_test, ccnp_test) with open(override_file, 'w', encoding='utf-8') as file: file.write(f"CL2_NODES: {node_count}\n") file.write(f"CL2_LOAD_TEST_THROUGHPUT: {throughput}\n") - file.write(f"CL2_NODES_PER_NAMESPACE: {nodes_per_namespace}\n") + file.write(f"CL2_NODES_PER_NAMESPACE: {nodes_per_namespace}\n") #TODO: itia: check if its passed right file.write(f"CL2_NODES_PER_STEP: {node_per_step}\n") + file.write(f"CL2_NODES: {node_count}\n") file.write(f"CL2_PODS_PER_NODE: {pods_per_node}\n") file.write(f"CL2_DEPLOYMENT_SIZE: {pods_per_node}\n") file.write(f"CL2_LATENCY_POD_CPU: {cpu_request}\n") file.write(f"CL2_REPEATS: {repeats}\n") file.write(f"CL2_STEPS: {steps}\n") + file.write(f"CL2_NO_OF_NAMESPACES: {no_of_namespaces}\n") file.write(f"CL2_OPERATION_TIMEOUT: {operation_timeout}\n") file.write("CL2_PROMETHEUS_TOLERATE_MASTER: true\n") file.write("CL2_PROMETHEUS_MEMORY_LIMIT_FACTOR: 100.0\n") @@ -97,6 +105,27 @@ def configure_clusterloader2( else: file.write("CL2_SERVICE_TEST: false\n") + if network_test: + file.write("CL2_NETWORK_TEST: true\n") + file.write("CL2_ENABLE_NETWORK_POLICY_ENFORCEMENT_LATENCY_TEST: true\n") + file.write("CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE: true\n") + file.write("CL2_PROMETHEUS_SCRAPE_KUBE_PROXY: true\n") + file.write("CL2_NETWORK_PROGRAMMING_LATENCY_THRESHOLD: 30s\n") + file.write("CL2_ENABLE_VIOLATIONS_FOR_NETWORK_PROGRAMMING_LATENCIES: false\n") + file.write("CL2_NETWORK_LATENCY_THRESHOLD: 0s\n") + file.write("CL2_PROBE_MEASUREMENTS_PING_SLEEP_DURATION: 1s\n") + file.write("CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY: true\n") + file.write("CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT: 15m\n") + file.write("CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE: false\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY: net-pol-test\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE: enforcement-latency\n") + 
#file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY: test\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE: net-policy-client\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS: 100\n") + file.write(f"CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT: {total_network_policies}\n") + file.write("CL2_NET_POLICY_ENFORCEMENT_LOAD_QPS: 10\n") + file.write("CL2_POLICY_ENFORCEMENT_LOAD_TARGET_NAME: small-deployment\n") + if cnp_test: file.write("CL2_CNP_TEST: true\n") file.write(f"CL2_CNPS_PER_NAMESPACE: {num_cnps}\n") @@ -152,6 +181,7 @@ def collect_clusterloader2( run_id, run_url, service_test, + network_test, cnp_test, ccnp_test, result_file, @@ -168,7 +198,7 @@ def collect_clusterloader2( else: raise Exception(f"No testsuites found in the report! Raw data: {details}") - _, _, pods_per_node, _ = calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, cnp_test, ccnp_test) + _, _, pods_per_node, _ = calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, network_test, cnp_test, ccnp_test) pod_count = node_count * pods_per_node # TODO: Expose optional parameter to include test details @@ -242,6 +272,10 @@ def main(): help="Whether to scrape containerd metrics. Must be either True or False") parser_configure.add_argument("service_test", type=eval, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") + parser_configure.add_argument("network_test", type=eval, choices=[True, False], default=False, + help="Whether network test is running. 
Must be either True or False") + parser_configure.add_argument("no_of_namespaces", type=int, default=1, help="Number of namespaces to create") + parser_configure.add_argument("total_network_policies", type=int, default=0, help="Total number of network policies to create") parser_configure.add_argument("cnp_test", type=eval, choices=[True, False], nargs='?', default=False, help="Whether cnp test is running. Must be either True or False") parser_configure.add_argument("ccnp_test", type=eval, choices=[True, False], nargs='?', default=False, @@ -280,6 +314,8 @@ def main(): parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("service_test", type=eval, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") + parser_collect.add_argument("network_test", type=eval, choices=[True, False], default=False, + help="Whether network test is running. Must be either True or False") parser_collect.add_argument("cnp_test", type=eval, choices=[True, False], nargs='?', default=False, help="Whether cnp test is running. 
Must be either True or False") parser_collect.add_argument("ccnp_test", type=eval, choices=[True, False], nargs='?', default=False, @@ -295,7 +331,8 @@ def main(): configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods, args.repeats, args.operation_timeout, args.provider, args.cilium_enabled, args.scrape_containerd, - args.service_test, args.cnp_test, args.ccnp_test, args.num_cnps, args.num_ccnps, args.dualstack, args.cl2_override_file) + args.service_test, args.network_test, args.no_of_namespaces, args.total_network_policies, + args.cnp_test, args.ccnp_test, args.num_cnps, args.num_ccnps, args.dualstack, args.cl2_override_file) elif args.command == "validate": validate_clusterloader2(args.node_count, args.operation_timeout) elif args.command == "execute": @@ -304,7 +341,7 @@ def main(): elif args.command == "collect": collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, - args.service_test, args.cnp_test, args.ccnp_test, + args.service_test, args.network_test, args.cnp_test, args.ccnp_test, args.result_file, args.test_type, args.start_timestamp) if __name__ == "__main__": diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml new file mode 100644 index 0000000000..c65094fb95 --- /dev/null +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -0,0 +1,102 @@ +trigger: none +schedules: +- cron: "0 */8 * * *" + displayName: "Every 8 Hours Daily" + branches: + include: + - network-policy-churn + always: true + +parameters: + - name: node_count + type: number + default: 240 + - name: node_per_step + type: number + default: 240 + - name: pods_per_node + type: number + default: 40 + - name: repeats + type: number + default: 1 + - name: scale_timeout + type: string + default: "15m" + - name: 
no_of_namespaces + type: number + default: 10 + - name: total_network_policies + type: number + default: 4800 + +variables: + SCENARIO_TYPE: perf-eval + SCENARIO_NAME: network-policy-churn + SCENARIO_VERSION: main + OWNER: aks + +stages: + - stage: azure_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/agrawaliti/clusterloader2:latest" + topology: network-churn + matrix: + azure_cilium: + cpu_per_node: 4 + node_count: ${{ parameters.node_count }} + node_per_step: ${{ parameters.node_per_step }} + max_pods: ${{ parameters.pods_per_node }} + repeats: ${{ parameters.repeats }} + scale_timeout: ${{ parameters.scale_timeout }} + no_of_namespaces: ${{ parameters.no_of_namespaces }} + total_network_policies: ${{ parameters.total_network_policies }} + cilium_enabled: True + network_policy: cilium + network_dataplane: cilium + service_test: False + network_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: false + - stage: azure_npm_eastus2 + dependsOn: [] + jobs: + - template: /jobs/competitive-test.yml + parameters: + cloud: azure + regions: + - eastus2 + engine: clusterloader2 + engine_input: + image: "ghcr.io/agrawaliti/clusterloader2:latest" + topology: network-churn + matrix: + azure_cni: + cpu_per_node: 4 + node_count: ${{ parameters.node_count }} + node_per_step: ${{ parameters.node_per_step }} + max_pods: ${{ parameters.pods_per_node }} + repeats: ${{ parameters.repeats }} + scale_timeout: ${{ parameters.scale_timeout }} + no_of_namespaces: ${{ parameters.no_of_namespaces }} + total_network_policies: ${{ parameters.total_network_policies }} + cilium_enabled: False + service_test: False + network_test: True + cl2_config_file: load-config.yaml + max_parallel: 2 + timeout_in_minutes: 720 + credential_type: service_connection + ssh_key_enabled: 
false + use_secondary_cluster: true \ No newline at end of file diff --git a/steps/collect-telescope-metadata.yml b/steps/collect-telescope-metadata.yml index 76d651bd72..8947b35e3f 100644 --- a/steps/collect-telescope-metadata.yml +++ b/steps/collect-telescope-metadata.yml @@ -86,8 +86,8 @@ steps: # Append run_id to the test results file if the file exists set -eux if [ -f "$(TEST_RESULTS_FILE)" ]; then - jq --arg telescope_run_id $RUN_ID \ - -c '. + {telescope_run_id: $telescope_run_id}' $(TEST_RESULTS_FILE) > temp-$RUN_ID.json \ + jq --arg telescope_run_id $RUN_ID --arg start_timestamp $START \ + -c '. + {telescope_run_id: $telescope_run_id, start_timestamp: $start_timestamp}' $(TEST_RESULTS_FILE) > temp-$RUN_ID.json \ && mv temp-$RUN_ID.json $(TEST_RESULTS_FILE) else echo "##vso[task.logissue type=warning;]File $(TEST_RESULTS_FILE) does not exist." diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index 881b06f73c..e51ceca3a1 100644 --- a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -14,10 +14,11 @@ steps: region: ${{ parameters.region }} - script: | set -eo pipefail + echo "Tests Starting timestamp: $START" PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ $CPU_PER_NODE $NODE_COUNT ${MAX_PODS:-0} \ - $REPEATS $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST ${CNP_TEST:-False} \ + $REPEATS $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST ${NETWORK_TEST:-False} ${CNP_TEST:-False} \ ${CCNP_TEST:-False} $TEST_RESULTS_FILE \ $TEST_TYPE $SLO_START_TIME workingDirectory: modules/python/clusterloader2 diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 6e52ecc826..d66d9183b8 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -22,7 +22,8 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE 
configure \ $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \ $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \ - $SERVICE_TEST ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml + $SERVICE_TEST ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} + ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ ${HOME}/.kube/config $CLOUD ${SCRAPE_CONTAINERD:-False} diff --git a/steps/execute-tests.yml b/steps/execute-tests.yml index 84b6e92a2b..0b9cb47c3f 100644 --- a/steps/execute-tests.yml +++ b/steps/execute-tests.yml @@ -13,6 +13,13 @@ parameters: default: {} steps: +- script: | + echo "Set the start time for test execution" + startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + echo "Start: $startTimestamp" + echo "##vso[task.setvariable variable=START]$startTimestamp" + displayName: Set up start timestamp variable + - template: /steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self parameters: cloud: ${{ parameters.cloud }} diff --git a/steps/setup-tests.yml b/steps/setup-tests.yml index 171e71f188..2a5158bd9c 100644 --- a/steps/setup-tests.yml +++ b/steps/setup-tests.yml @@ -9,6 +9,9 @@ parameters: - name: run_id type: string default: '' +- name: run_id_2 + type: string + default: '' - name: retry_attempt_count type: number default: 3 @@ -16,10 +19,18 @@ parameters: type: string - name: ssh_key_enabled type: boolean +- name: use_secondary_cluster + type: boolean + default: false steps: - script: | - if [ -n "$RUN_ID" ]; then + set -eu + if [ "${{ parameters.use_secondary_cluster }}" == "True" ] && [ -n "${RUN_ID_2:-}" ]; then + echo "Using secondary cluster" + run_id=$RUN_ID_2 + elif [ -n "${RUN_ID:-}" ]; then + 
echo "Using primary cluster" run_id=$RUN_ID else run_id=$(Build.BuildId)-$(System.JobId) @@ -29,6 +40,7 @@ steps: displayName: "Set Run ID" env: RUN_ID: ${{ parameters.run_id }} + RUN_ID_2: ${{ parameters.run_id_2 }} - script: | run_url="$(System.TeamFoundationCollectionUri)$(System.TeamProject)/_build/results?buildId=$(Build.BuildId)&view=logs&j=$(System.JobId)" diff --git a/steps/topology/network-churn/collect-clusterloader2.yml b/steps/topology/network-churn/collect-clusterloader2.yml new file mode 100644 index 0000000000..8b9121ef4b --- /dev/null +++ b/steps/topology/network-churn/collect-clusterloader2.yml @@ -0,0 +1,23 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/slo/collect.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} + +- script: | + run_id=$(Build.BuildId)-$(System.JobId) + echo "Run ID: $run_id" + echo "##vso[task.setvariable variable=RUN_ID]$run_id" + displayName: "Set unique Run ID before publish" \ No newline at end of file diff --git a/steps/topology/network-churn/execute-clusterloader2.yml b/steps/topology/network-churn/execute-clusterloader2.yml new file mode 100644 index 0000000000..9720d82a0c --- /dev/null +++ b/steps/topology/network-churn/execute-clusterloader2.yml @@ -0,0 +1,17 @@ +parameters: +- name: cloud + type: string + default: '' +- name: engine_input + type: object + default: {} +- name: regions + type: object + default: {} + +steps: +- template: /steps/engine/clusterloader2/slo/execute.yml + parameters: + cloud: ${{ parameters.cloud }} + engine_input: ${{ parameters.engine_input }} + region: ${{ parameters.regions[0] }} \ No newline at end of file diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml new file mode 100644 
index 0000000000..e93d1f229d --- /dev/null +++ b/steps/topology/network-churn/validate-resources.yml @@ -0,0 +1,13 @@ +parameters: + - name: cloud + type: string + - name: engine + type: string + - name: regions + type: object + +steps: + - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} \ No newline at end of file From 1456095eb71c68c762525a086dbd34da92305a72 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 2 Apr 2025 09:50:50 +0100 Subject: [PATCH 02/36] Update argument parsing to allow optional parameters for network and namespace configurations --- modules/python/clusterloader2/slo/slo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 8c169d1e5a..0ac78d133e 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -272,10 +272,10 @@ def main(): help="Whether to scrape containerd metrics. Must be either True or False") parser_configure.add_argument("service_test", type=eval, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") - parser_configure.add_argument("network_test", type=eval, choices=[True, False], default=False, + parser_configure.add_argument("network_test", type=eval, choices=[True, False], nargs='?', default=False, help="Whether network test is running. 
Must be either True or False") - parser_configure.add_argument("no_of_namespaces", type=int, default=1, help="Number of namespaces to create") - parser_configure.add_argument("total_network_policies", type=int, default=0, help="Total number of network policies to create") + parser_configure.add_argument("no_of_namespaces", type=int, nargs='?', default=1, help="Number of namespaces to create") + parser_configure.add_argument("total_network_policies", type=int, nargs='?', default=0, help="Total number of network policies to create") parser_configure.add_argument("cnp_test", type=eval, choices=[True, False], nargs='?', default=False, help="Whether cnp test is running. Must be either True or False") parser_configure.add_argument("ccnp_test", type=eval, choices=[True, False], nargs='?', default=False, @@ -314,7 +314,7 @@ def main(): parser_collect.add_argument("run_url", type=str, help="Run URL") parser_collect.add_argument("service_test", type=eval, choices=[True, False], default=False, help="Whether service test is running. Must be either True or False") - parser_collect.add_argument("network_test", type=eval, choices=[True, False], default=False, + parser_collect.add_argument("network_test", type=eval, choices=[True, False], nargs='?', default=False, help="Whether network test is running. Must be either True or False") parser_collect.add_argument("cnp_test", type=eval, choices=[True, False], nargs='?', default=False, help="Whether cnp test is running. 
Must be either True or False") From 6108368084e70f296284351159a4faf0d99124aa Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 2 Apr 2025 09:58:08 +0100 Subject: [PATCH 03/36] Add CL2_CONFIG_FILE parameter to benchmark execution step --- steps/engine/clusterloader2/slo/execute.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index d66d9183b8..20a4e98b56 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -38,4 +38,5 @@ steps: CL2_IMAGE: ${{ parameters.engine_input.image }} CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/config CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results + CL2_CONFIG_FILE: load-config.yaml displayName: "Run Benchmark" From b7e23c08605e4ab5a80a5e6431bb1f61e7211279 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 2 Apr 2025 13:52:09 +0100 Subject: [PATCH 04/36] Refactor network test parameters and update image version for network churn benchmarks --- modules/python/clusterloader2/slo/slo.py | 2 +- .../network-churn/cilium-network-churn.yml | 4 ++-- steps/engine/clusterloader2/slo/execute.yml | 14 +++++++++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 0ac78d133e..83ea15db83 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -27,7 +27,7 @@ def calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, throughput = 100 nodes_per_namespace = min(node_count, DEFAULT_NODES_PER_NAMESPACE) - if service_test: + if network_test: nodes_per_namespace = 24 #TODO: itia: fix hardcoded value pods_per_node = DEFAULT_PODS_PER_NODE diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI 
Benchmark/network-churn/cilium-network-churn.yml index c65094fb95..a713e4431c 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -47,7 +47,7 @@ stages: - eastus2 engine: clusterloader2 engine_input: - image: "ghcr.io/agrawaliti/clusterloader2:latest" + image: "ghcr.io/azure/clusterloader2:v20241022" topology: network-churn matrix: azure_cilium: @@ -79,7 +79,7 @@ stages: - eastus2 engine: clusterloader2 engine_input: - image: "ghcr.io/agrawaliti/clusterloader2:latest" + image: "ghcr.io/azure/clusterloader2:v20241022" topology: network-churn matrix: azure_cni: diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 20a4e98b56..169ffc3f35 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -7,6 +7,15 @@ parameters: default: {} - name: region type: string + - name: no_of_namespaces + type: number + default: 1 + - name: total_network_policies + type: number + default: 0 + - name: network_test + type: boolean + default: false steps: - script: | @@ -22,7 +31,7 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \ $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \ - $SERVICE_TEST ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} + $SERVICE_TEST $NETWORK_TEST $NO_OF_NAMESPACES $TOTAL_NETWORK_POLICIES ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ @@ -39,4 +48,7 @@ steps: CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/config CL2_REPORT_DIR: 
$(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results CL2_CONFIG_FILE: load-config.yaml + NO_OF_NAMESPACES: ${{ parameters.no_of_namespaces }} + TOTAL_NETWORK_POLICIES: ${{ parameters.total_network_policies }} + NETWORK_TEST: ${{ parameters.network_test }} #TODO: itia: Remove these 3 param and use default ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} displayName: "Run Benchmark" From 4dc8f366deb3679af684110dd69452f68098c405 Mon Sep 17 00:00:00 2001 From: Sumanth Reddy CH <38838077+sumanthreddy29@users.noreply.github.com> Date: Thu, 24 Apr 2025 11:37:02 -0400 Subject: [PATCH 05/36] Update AKS CLI module to use fixed version (#602) This pull request updates the Azure AKS CLI preview extension configuration in the Terraform module to specify a version for the extension. * [`modules/terraform/azure/aks-cli/main.tf`](diffhunk://#diff-1c09e32cd63aa3f4a6cfae577b3db448daedf498df32bd8834fd4344f6b86ab4R82-R83): Added the `--version` flag with a value of `14.0.0b2` to ensure a specific version of the `aks-preview` extension is used. --- modules/terraform/azure/aks-cli/main.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/terraform/azure/aks-cli/main.tf b/modules/terraform/azure/aks-cli/main.tf index c10f569cde..df5650bc1f 100644 --- a/modules/terraform/azure/aks-cli/main.tf +++ b/modules/terraform/azure/aks-cli/main.tf @@ -72,6 +72,7 @@ resource "azurerm_role_assignment" "network_contributor" { resource "terraform_data" "aks_cli_preview" { count = var.aks_cli_config.use_aks_preview_cli_extension == true ? 
1 : 0 + # Todo - Update aks-preview extension for newer features provisioner "local-exec" { command = join(" ", [ "az", @@ -79,6 +80,8 @@ resource "terraform_data" "aks_cli_preview" { "add", "-n", "aks-preview", + "--version", + "14.0.0b2", ]) } From e113c299c6ba364f6ee68e44cd2c9cafdc076571 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Fri, 25 Apr 2025 15:02:20 +0100 Subject: [PATCH 06/36] Refactor YAML files to improve formatting and maintain consistency --- .../network-churn/cilium-network-churn.yml | 16 ++++++++-------- steps/execute-tests.yml | 2 +- .../network-churn/collect-clusterloader2.yml | 2 +- .../network-churn/execute-clusterloader2.yml | 2 +- .../network-churn/validate-resources.yml | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index a713e4431c..a40df56b5e 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -1,11 +1,11 @@ trigger: none -schedules: -- cron: "0 */8 * * *" - displayName: "Every 8 Hours Daily" - branches: - include: - - network-policy-churn - always: true +# schedules: +# - cron: "0 */8 * * *" +# displayName: "Every 8 Hours Daily" +# branches: +# include: +# - network-policy-churn +# always: true parameters: - name: node_count @@ -99,4 +99,4 @@ stages: timeout_in_minutes: 720 credential_type: service_connection ssh_key_enabled: false - use_secondary_cluster: true \ No newline at end of file + use_secondary_cluster: true diff --git a/steps/execute-tests.yml b/steps/execute-tests.yml index 0b9cb47c3f..4330d7c33d 100644 --- a/steps/execute-tests.yml +++ b/steps/execute-tests.yml @@ -19,7 +19,7 @@ steps: echo "Start: $startTimestamp" echo "##vso[task.setvariable variable=START]$startTimestamp" displayName: Set up start timestamp variable - + - template: 
/steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self parameters: cloud: ${{ parameters.cloud }} diff --git a/steps/topology/network-churn/collect-clusterloader2.yml b/steps/topology/network-churn/collect-clusterloader2.yml index 8b9121ef4b..5c5105adac 100644 --- a/steps/topology/network-churn/collect-clusterloader2.yml +++ b/steps/topology/network-churn/collect-clusterloader2.yml @@ -20,4 +20,4 @@ steps: run_id=$(Build.BuildId)-$(System.JobId) echo "Run ID: $run_id" echo "##vso[task.setvariable variable=RUN_ID]$run_id" - displayName: "Set unique Run ID before publish" \ No newline at end of file + displayName: "Set unique Run ID before publish" diff --git a/steps/topology/network-churn/execute-clusterloader2.yml b/steps/topology/network-churn/execute-clusterloader2.yml index 9720d82a0c..d084b2ef03 100644 --- a/steps/topology/network-churn/execute-clusterloader2.yml +++ b/steps/topology/network-churn/execute-clusterloader2.yml @@ -14,4 +14,4 @@ steps: parameters: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} - region: ${{ parameters.regions[0] }} \ No newline at end of file + region: ${{ parameters.regions[0] }} diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index e93d1f229d..fe97a40cbc 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -10,4 +10,4 @@ steps: - template: /steps/cloud/${{ parameters.cloud }}/update-kubeconfig.yml parameters: role: net - region: ${{ parameters.regions[0] }} \ No newline at end of file + region: ${{ parameters.regions[0] }} From 1e9ea17f9f97ffd3b49265297666ffaf16c99565 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 29 Apr 2025 16:39:06 +0100 Subject: [PATCH 07/36] finxing typo --- steps/engine/clusterloader2/slo/execute.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 169ffc3f35..0a5daa6874 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -31,7 +31,7 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \ $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \ - $SERVICE_TEST $NETWORK_TEST $NO_OF_NAMESPACES $TOTAL_NETWORK_POLICIES + $SERVICE_TEST $NETWORK_TEST $NO_OF_NAMESPACES $TOTAL_NETWORK_POLICIES \ ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ From 3a2e9f287b6d702d56c0d2c3b0b1357fb1faa64e Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 29 Apr 2025 16:52:33 +0100 Subject: [PATCH 08/36] removing deploymentQPS from load config --- modules/python/clusterloader2/slo/config/load-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index cebc181652..810e1fed54 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -85,7 +85,7 @@ tuningSets: timeLimit: {{$deletionTime}}s steps: - - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}} + - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, 
totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}} measurements: - Identifier: Dummy Method: Sleep From 52657178da66df589cc23a4b2fb641a395158cb5 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 29 Apr 2025 18:00:51 +0100 Subject: [PATCH 09/36] Cleaning code --- .../clusterloader2/slo/config/load-config.yaml | 17 ++++------------- .../slo/config/modules/reconcile-objects.yaml | 2 -- modules/python/clusterloader2/slo/slo.py | 3 +-- steps/engine/clusterloader2/slo/collect.yml | 3 +-- steps/engine/clusterloader2/slo/execute.yml | 15 +-------------- steps/execute-tests.yml | 6 ------ 6 files changed, 7 insertions(+), 39 deletions(-) diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 810e1fed54..2f59737548 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -7,7 +7,7 @@ name: load-config {{$CCNP_TEST := DefaultParam .CL2_CCNP_TEST false}} # Config options for test parameters -{{$nodesPerNamespace := DefaultParam .CL2_NODES_PER_NAMESPACE 100}} #TODO itia: check if its passed right +{{$nodesPerNamespace := DefaultParam .CL2_NODES_PER_NAMESPACE 100}} {{$podsPerNode := DefaultParam .CL2_PODS_PER_NODE 50}} {{$loadTestThroughput := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 100}} {{$deploymentSize := DefaultParam .CL2_DEPLOYMENT_SIZE 100}} @@ -19,7 +19,7 @@ name: load-config {{$nodes := DefaultParam .CL2_NODES 1000}} {{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} -{{$totalPods := MultiplyInt $namespaces $nodesPerNamespace $podsPerNode}} #TODO itia: can it break existing tests? 
+{{$totalPods := MultiplyInt $namespaces $nodesPerNamespace $podsPerNode}} {{$podsPerNamespace := DivideInt $totalPods $namespaces}} {{$deploymentsPerNamespace := DivideInt $podsPerNamespace $deploymentSize}} @@ -40,16 +40,7 @@ name: load-config {{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}} {{end}} -# Use explicit conditional block to assign smallDeploymentPods to maintain backward compatibility -{{$calculatedPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} - -{{$smallDeploymentPods := 0}} -{{if $NETWORK_TEST}} - {{$smallDeploymentPods = $podsPerNamespace}} -{{else}} - {{$smallDeploymentPods = $calculatedPods}} -{{end}} - +{{$smallDeploymentPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}} {{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}} # CNP & CCNP Test @@ -85,7 +76,7 @@ tuningSets: timeLimit: {{$deletionTime}}s steps: - - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}} + - name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, bigDeploymentsPerNamespace={{$bigDeploymentsPerNamespace}}, smallDeploymentsPerNamespace={{$smallDeploymentsPerNamespace}}, bigGroupSize={{$BIG_GROUP_SIZE}}, smallGroupSize={{$SMALL_GROUP_SIZE}}, repeats={{$repeats}}, $saturationTime={{$saturationTime}}, $deletionTime={{$deletionTime}} measurements: - Identifier: Dummy Method: Sleep diff --git a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml index 
0ae97f20c5..9c99f01dca 100644 --- a/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml +++ b/modules/python/clusterloader2/slo/config/modules/reconcile-objects.yaml @@ -42,7 +42,6 @@ steps: - name: {{$actionName}} phases: -{{if not $NETWORK_TEST}} - namespaceRange: min: 1 max: {{$namespaces}} @@ -56,7 +55,6 @@ steps: SvcName: big-service Group: {{.Group}} deploymentLabel: {{.deploymentLabel}} -{{end}} - namespaceRange: min: 1 max: {{$namespaces}} diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 45be4d7507..9b91f81ce5 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -73,7 +73,7 @@ def configure_clusterloader2( with open(override_file, 'w', encoding='utf-8') as file: file.write(f"CL2_NODES: {node_count}\n") file.write(f"CL2_LOAD_TEST_THROUGHPUT: {throughput}\n") - file.write(f"CL2_NODES_PER_NAMESPACE: {nodes_per_namespace}\n") #TODO: itia: check if its passed right + file.write(f"CL2_NODES_PER_NAMESPACE: {nodes_per_namespace}\n") file.write(f"CL2_NODES_PER_STEP: {node_per_step}\n") file.write(f"CL2_NODES: {node_count}\n") file.write(f"CL2_PODS_PER_NODE: {pods_per_node}\n") @@ -119,7 +119,6 @@ def configure_clusterloader2( file.write("CL2_NETWORK_POLICY_ENFORCEMENT_LATENCY_BASELINE: false\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_KEY: net-pol-test\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_TARGET_LABEL_VALUE: enforcement-latency\n") - #file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_KEY: test\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_NODE_LABEL_VALUE: net-policy-client\n") file.write("CL2_NET_POLICY_ENFORCEMENT_LATENCY_MAX_TARGET_PODS_PER_NS: 100\n") file.write(f"CL2_NET_POLICY_ENFORCEMENT_LOAD_COUNT: {total_network_policies}\n") diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index 3344134056..ba03c37dab 100644 --- 
a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -14,8 +14,7 @@ steps: region: ${{ parameters.region }} - script: | set -eo pipefail - echo "Tests Starting timestamp: $START" - + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ $CPU_PER_NODE $NODE_COUNT ${MAX_PODS:-0} \ $REPEATS $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST ${NETWORK_TEST:-False} ${CNP_TEST:-False} \ diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index bb342c464e..f69c71ac0d 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -7,15 +7,6 @@ parameters: default: {} - name: region type: string - - name: no_of_namespaces - type: number - default: 1 - - name: total_network_policies - type: number - default: 0 - - name: network_test - type: boolean - default: false steps: - script: | @@ -31,7 +22,7 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \ $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \ - $SERVICE_TEST $NETWORK_TEST $NO_OF_NAMESPACES $TOTAL_NETWORK_POLICIES \ + $SERVICE_TEST ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} \ ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ @@ -47,8 +38,4 @@ steps: CL2_IMAGE: ${{ parameters.engine_input.image }} CL2_CONFIG_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/config CL2_REPORT_DIR: $(Pipeline.Workspace)/s/modules/python/clusterloader2/slo/results - CL2_CONFIG_FILE: load-config.yaml - NO_OF_NAMESPACES: ${{ parameters.no_of_namespaces }} - TOTAL_NETWORK_POLICIES: ${{ parameters.total_network_policies 
}} - NETWORK_TEST: ${{ parameters.network_test }} #TODO: itia: Remove these 3 param and use default ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} displayName: "Run Benchmark" diff --git a/steps/execute-tests.yml b/steps/execute-tests.yml index 4330d7c33d..ed8a32c752 100644 --- a/steps/execute-tests.yml +++ b/steps/execute-tests.yml @@ -13,12 +13,6 @@ parameters: default: {} steps: -- script: | - echo "Set the start time for test execution" - startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - echo "Start: $startTimestamp" - echo "##vso[task.setvariable variable=START]$startTimestamp" - displayName: Set up start timestamp variable - template: /steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self parameters: From 3c0657efe01d8cfad9c7b7c2fcbcaeba505c5a2e Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 29 Apr 2025 18:05:24 +0100 Subject: [PATCH 10/36] Adding schedule --- .../network-churn/cilium-network-churn.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index a40df56b5e..59a36c9604 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -1,11 +1,11 @@ trigger: none -# schedules: -# - cron: "0 */8 * * *" -# displayName: "Every 8 Hours Daily" -# branches: -# include: -# - network-policy-churn -# always: true +schedules: +- cron: "0 */4 * * *" + displayName: "Every 4 Hours Daily" + branches: + include: + - itia/network-churn + always: true parameters: - name: node_count From 612e5aaff3694304472efbaff8af42dac8d0fd93 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 30 Apr 2025 12:08:25 +0100 Subject: [PATCH 11/36] updating time and fix lint --- .../network-churn/cilium-network-churn.yml | 12 ++++++------ 
steps/engine/clusterloader2/slo/collect.yml | 2 +- steps/execute-tests.yml | 6 ++++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 59a36c9604..530d1ea1ef 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -1,11 +1,11 @@ trigger: none schedules: -- cron: "0 */4 * * *" - displayName: "Every 4 Hours Daily" - branches: - include: - - itia/network-churn - always: true + - cron: "0 */6 * * *" + displayName: "Every 6 Hours Daily" + branches: + include: + - itia/network-churn + always: true parameters: - name: node_count diff --git a/steps/engine/clusterloader2/slo/collect.yml b/steps/engine/clusterloader2/slo/collect.yml index ba03c37dab..73741ce244 100644 --- a/steps/engine/clusterloader2/slo/collect.yml +++ b/steps/engine/clusterloader2/slo/collect.yml @@ -14,7 +14,7 @@ steps: region: ${{ parameters.region }} - script: | set -eo pipefail - + PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE collect \ $CPU_PER_NODE $NODE_COUNT ${MAX_PODS:-0} \ $REPEATS $CL2_REPORT_DIR "$CLOUD_INFO" $RUN_ID $RUN_URL $SERVICE_TEST ${NETWORK_TEST:-False} ${CNP_TEST:-False} \ diff --git a/steps/execute-tests.yml b/steps/execute-tests.yml index ed8a32c752..4330d7c33d 100644 --- a/steps/execute-tests.yml +++ b/steps/execute-tests.yml @@ -13,6 +13,12 @@ parameters: default: {} steps: +- script: | + echo "Set the start time for test execution" + startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + echo "Start: $startTimestamp" + echo "##vso[task.setvariable variable=START]$startTimestamp" + displayName: Set up start timestamp variable - template: /steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self parameters: From 3144df609b1b21f9d49a697025cc8a34e723de6c Mon Sep 17 00:00:00 
2001 From: ItiAgrawal Date: Wed, 7 May 2025 17:54:19 +0100 Subject: [PATCH 12/36] Add Azure Terraform configuration for network policy churn scenario --- .../terraform-inputs/azure.tfvars | 70 +++++++++++++++++++ .../terraform-test-inputs/azure.json | 4 ++ 2 files changed, 74 insertions(+) create mode 100644 scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars create mode 100644 scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json diff --git a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars new file mode 100644 index 0000000000..40885702f3 --- /dev/null +++ b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars @@ -0,0 +1,70 @@ +scenario_type = "perf-eval" +scenario_name = "network-policy-churn" +deletion_delay = "3h" +owner = "aks" + +network_config_list = [ + { + role = "slo" + vnet_name = "slo-vnet" + vnet_address_space = "10.0.0.0/9" + subnet = [ + { + name = "slo-subnet-1" + address_prefix = "10.0.0.0/16" + } + ] + network_security_group_name = "" + nic_public_ip_associations = [] + nsr_rules = [] + } +] + +aks_config_list = [ + { + role = "slo" + aks_name = "slo" + dns_prefix = "slo" + subnet_name = "slo-subnet-1" + sku_tier = "Standard" + network_profile = { + network_plugin = "azure" + network_plugin_mode = "overlay" + pod_cidr = "10.128.0.0/9" + service_cidr = "192.168.0.0/16" + dns_service_ip = "192.168.0.10" + } + default_node_pool = { + name = "default" + node_count = 5 + auto_scaling_enabled = false + vm_size = "Standard_D16ds_v5" + os_disk_type = "Managed" + only_critical_addons_enabled = false + temporary_name_for_rotation = "defaulttmp" + } + extra_node_pool = [ + { + name = "prompool" + node_count = 1 + auto_scaling_enabled = false + vm_size = "Standard_D64_v3" + max_pods = 110 + node_labels = { "prometheus" = "true" } + }, + { + name = "userpool0" + node_count = 0 + min_count = 0 + max_count = 500 
+ auto_scaling_enabled = true + vm_size = "Standard_D4_v3" + max_pods = 250 + node_labels = { "slo" = "true", + "test-np" = "net-policy-client" } + } + ] + kubernetes_version = "1.32" + } +] + diff --git a/scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json b/scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json new file mode 100644 index 0000000000..ea27a572c6 --- /dev/null +++ b/scenarios/perf-eval/network-policy-churn/terraform-test-inputs/azure.json @@ -0,0 +1,4 @@ +{ + "run_id" : "123456789", + "region" : "eastus" +} From d28c866855334918bb7e2b8fbd110b0100e30aa7 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Thu, 8 May 2025 14:09:00 +0100 Subject: [PATCH 13/36] Update resource management conditions to check for 'false' instead of 'true' --- steps/cleanup-resources.yml | 2 +- steps/collect-terraform-operation-metadata.yml | 2 +- steps/provision-resources.yml | 4 ++-- steps/ssh/setup-key.yml | 2 +- steps/terraform/run-command.yml | 2 +- steps/terraform/set-input-file.yml | 2 +- steps/terraform/set-input-variables-aws.yml | 2 +- steps/terraform/set-input-variables-azure.yml | 2 +- steps/terraform/set-input-variables-gcp.yml | 2 +- steps/terraform/set-user-data-path.yml | 2 +- steps/terraform/set-working-directory.yml | 2 +- steps/validate-resources.yml | 2 +- 12 files changed, 13 insertions(+), 13 deletions(-) diff --git a/steps/cleanup-resources.yml b/steps/cleanup-resources.yml index 991af9bcc6..6692a79fce 100644 --- a/steps/cleanup-resources.yml +++ b/steps/cleanup-resources.yml @@ -30,7 +30,7 @@ steps: echo "Delete resource group $RUN_ID" az group delete --name $RUN_ID --yes displayName: "Destroy Resource Group" - condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true')) + condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false')) - ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: - template: 
/steps/collect-terraform-operation-metadata.yml diff --git a/steps/collect-terraform-operation-metadata.yml b/steps/collect-terraform-operation-metadata.yml index ec9cb28f64..fd42cea29e 100644 --- a/steps/collect-terraform-operation-metadata.yml +++ b/steps/collect-terraform-operation-metadata.yml @@ -12,7 +12,7 @@ steps: $TERRAFORM_WORKING_DIRECTORY "$TEST_RESULTS_DIR/terraform_operation_metadata.json" "$SCENARIO_TYPE" "$SCENARIO_NAME" echo "##vso[task.setvariable variable=TERRAFORM_OPERATION_METADATA_FILE]$TEST_RESULTS_DIR/terraform_operation_metadata.json" displayName: "Collect Terraform Operation Metadata" - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') workingDirectory: modules/python/terraform env: PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/terraform/extract_terraform_operation_metadata.py diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index cf61a821af..992bf04620 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -58,7 +58,7 @@ steps: echo "Owner: $owner" echo "##vso[task.setvariable variable=OWNER]$owner" displayName: "Get Deletion Due Time and Owner" - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: region: ${{ parameters.regions[0] }} @@ -68,7 +68,7 @@ steps: az group create --name $RUN_ID --location $region \ --tags "run_id=$RUN_ID" "scenario=${SCENARIO_TYPE}-${SCENARIO_NAME}" "owner=${OWNER}" "creation_date=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" "deletion_due_time=${DELETION_DUE_TIME}" "SkipAKSCluster=1" displayName: "Create Resource Group" - condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true')) + condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false')) env: region: ${{ parameters.regions[0] }} diff --git a/steps/ssh/setup-key.yml 
b/steps/ssh/setup-key.yml index 2e62598c17..2f33467d2e 100644 --- a/steps/ssh/setup-key.yml +++ b/steps/ssh/setup-key.yml @@ -28,4 +28,4 @@ steps: cat $SSH_KEY_PATH echo "##vso[task.setvariable variable=SSH_KEY_PATH;]${SSH_KEY_PATH}" displayName: "Download SSH Key" - condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') diff --git a/steps/terraform/run-command.yml b/steps/terraform/run-command.yml index 6614d25bcd..8df15c3317 100644 --- a/steps/terraform/run-command.yml +++ b/steps/terraform/run-command.yml @@ -81,7 +81,7 @@ steps: terraform ${{ parameters.command }} ${{ parameters.arguments }} fi displayName: "Run Terraform ${{ parameters.command }} Command" - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') retryCountOnTaskFailure: ${{ parameters.retry_attempt_count }} env: REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-file.yml b/steps/terraform/set-input-file.yml index af3ac24570..75218fd508 100644 --- a/steps/terraform/set-input-file.yml +++ b/steps/terraform/set-input-file.yml @@ -29,7 +29,7 @@ steps: echo "##vso[task.setvariable variable=REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input File' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-variables-aws.yml b/steps/terraform/set-input-variables-aws.yml index f0fc1a7e01..3465e0d6e9 100644 --- a/steps/terraform/set-input-variables-aws.yml +++ b/steps/terraform/set-input-variables-aws.yml @@ -48,7 +48,7 @@ steps: echo "##vso[task.setvariable variable=TERRAFORM_REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input Variables' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') 
+ condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-variables-azure.yml b/steps/terraform/set-input-variables-azure.yml index 2b91aabd48..7f1a830f72 100644 --- a/steps/terraform/set-input-variables-azure.yml +++ b/steps/terraform/set-input-variables-azure.yml @@ -68,7 +68,7 @@ steps: echo "##vso[task.setvariable variable=TERRAFORM_REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input Variables' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-variables-gcp.yml b/steps/terraform/set-input-variables-gcp.yml index 4879640cb0..2b8e24def4 100644 --- a/steps/terraform/set-input-variables-gcp.yml +++ b/steps/terraform/set-input-variables-gcp.yml @@ -39,7 +39,7 @@ steps: echo "##vso[task.setvariable variable=TERRAFORM_REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input Variables' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-user-data-path.yml b/steps/terraform/set-user-data-path.yml index 49f931a0b8..81c7a837a2 100644 --- a/steps/terraform/set-user-data-path.yml +++ b/steps/terraform/set-user-data-path.yml @@ -20,6 +20,6 @@ steps: fi displayName: 'Set Terraform User Data Path' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: USER_DATA_PATH: ${{ parameters.user_data_path }} diff --git a/steps/terraform/set-working-directory.yml b/steps/terraform/set-working-directory.yml index d42f4673a5..f044ad7185 100644 --- 
a/steps/terraform/set-working-directory.yml +++ b/steps/terraform/set-working-directory.yml @@ -16,7 +16,7 @@ steps: echo "Terraform Working Directory: $terraform_working_directory" displayName: 'Set Terraform Working Directory' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') env: CLOUD: ${{ parameters.cloud }} MODULES_DIR: ${{ parameters.modules_dir }} diff --git a/steps/validate-resources.yml b/steps/validate-resources.yml index b6d54c5e46..a5f1b2834f 100644 --- a/steps/validate-resources.yml +++ b/steps/validate-resources.yml @@ -16,7 +16,7 @@ steps: exit 1 fi displayName: "Validate OWNER info" - condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') + condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') - template: /steps/topology/${{ parameters.topology }}/validate-resources.yml@self parameters: cloud: ${{ parameters.cloud }} From 49e02611d562dcb8333871090cf04b800d96a2ee Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Thu, 8 May 2025 14:50:56 +0100 Subject: [PATCH 14/36] Terraform configuration for net role --- .../terraform-inputs/azure.tfvars | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars index 40885702f3..292db83ed2 100644 --- a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars @@ -5,12 +5,12 @@ owner = "aks" network_config_list = [ { - role = "slo" - vnet_name = "slo-vnet" + role = "net" + vnet_name = "net-vnet" vnet_address_space = "10.0.0.0/9" subnet = [ { - name = "slo-subnet-1" + name = "net-subnet-1" address_prefix = "10.0.0.0/16" } ] @@ -22,10 +22,10 @@ network_config_list = [ aks_config_list = [ { - role = "slo" - aks_name = "slo" - dns_prefix = "slo" - subnet_name = "slo-subnet-1" + 
role = "net" + aks_name = "net-pol-test" + dns_prefix = "net" + subnet_name = "net-subnet-1" sku_tier = "Standard" network_profile = { network_plugin = "azure" From 1921bdf68d9f7311daac18ec093b571ea114402a Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Fri, 9 May 2025 15:40:23 +0100 Subject: [PATCH 15/36] Add post-provisioning step for Azure NPM configuration and update resource management conditions --- jobs/competitive-test.yml | 10 +- modules/terraform/azure/aks/main.tf | 1 + modules/terraform/azure/aks/variables.tf | 7 + .../network-churn/cilium-network-churn.yml | 1 + scripts/azure-npm.yaml | 173 ++++++++++++++++++ steps/cleanup-resources.yml | 2 +- .../collect-terraform-operation-metadata.yml | 2 +- steps/post-provisioning-config.yml | 25 +++ steps/provision-resources.yml | 4 +- steps/ssh/setup-key.yml | 2 +- steps/terraform/run-command.yml | 2 +- steps/terraform/set-input-file.yml | 2 +- steps/terraform/set-input-variables-aws.yml | 2 +- steps/terraform/set-input-variables-azure.yml | 4 +- steps/terraform/set-input-variables-gcp.yml | 2 +- steps/terraform/set-user-data-path.yml | 2 +- steps/terraform/set-working-directory.yml | 2 +- steps/validate-resources.yml | 2 +- 18 files changed, 226 insertions(+), 19 deletions(-) create mode 100644 scripts/azure-npm.yaml create mode 100644 steps/post-provisioning-config.yml diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index dffed3eba7..441887b069 100644 --- a/jobs/competitive-test.yml +++ b/jobs/competitive-test.yml @@ -33,9 +33,6 @@ parameters: - name: run_id type: string default: '' -- name: run_id_2 # This cluster will be used if 'use_secondary_cluster' is true - type: string - default: '' - name: timeout_in_minutes type: number default: 60 # default when not specified is 60 minutes @@ -51,9 +48,6 @@ parameters: - name: ssh_key_enabled type: boolean default: true -- name: use_secondary_cluster # Set it to true if you want to use a secondary cluster(run_id_2) for the test - type: boolean - 
default: false jobs: - job: ${{ parameters.cloud }} @@ -84,6 +78,10 @@ jobs: terraform_input_varibles: ${{ parameters.terraform_input_varibles }} retry_attempt_count: ${{ parameters.retry_attempt_count }} credential_type: ${{ parameters.credential_type }} + - template: /steps/post-provisioning-config.yml + parameters: + cloud: ${{ parameters.cloud }} + regions: ${{ parameters.regions }} - template: /steps/validate-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/modules/terraform/azure/aks/main.tf b/modules/terraform/azure/aks/main.tf index d802e515de..d28758d007 100644 --- a/modules/terraform/azure/aks/main.tf +++ b/modules/terraform/azure/aks/main.tf @@ -37,6 +37,7 @@ resource "azurerm_kubernetes_cluster" "aks" { network_plugin_mode = var.aks_config.network_profile.network_plugin_mode network_policy = try(coalesce(var.network_policy, var.aks_config.network_profile.network_policy), null) network_data_plane = try(coalesce(var.network_dataplane, var.aks_config.network_profile.network_dataplane), null) + npm_enabled = try(coalesce(var.npm_enabled, var.aks_config.network_profile.npm_enabled), false) outbound_type = var.aks_config.network_profile.outbound_type pod_cidr = var.aks_config.network_profile.pod_cidr service_cidr = var.aks_config.network_profile.service_cidr diff --git a/modules/terraform/azure/aks/variables.tf b/modules/terraform/azure/aks/variables.tf index c4c7eb39c6..678cf11ac2 100644 --- a/modules/terraform/azure/aks/variables.tf +++ b/modules/terraform/azure/aks/variables.tf @@ -54,6 +54,12 @@ variable "network_dataplane" { default = null } +variable "npm_enabled" { + description = "Whether to enable NPM configuration" + type = bool + default = false +} + variable "aks_config" { type = object({ role = string @@ -65,6 +71,7 @@ variable "aks_config" { network_plugin_mode = optional(string, null) network_policy = optional(string, null) network_dataplane = optional(string, null) + npm_enabled = optional(bool, false) outbound_type = 
optional(string, null) pod_cidr = optional(string, null) service_cidr = optional(string, null) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 530d1ea1ef..31535050ec 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -92,6 +92,7 @@ stages: no_of_namespaces: ${{ parameters.no_of_namespaces }} total_network_policies: ${{ parameters.total_nework_policies }} cilium_enabled: False + npm_enabled: True service_test: False network_test: True cl2_config_file: load-config.yaml diff --git a/scripts/azure-npm.yaml b/scripts/azure-npm.yaml new file mode 100644 index 0000000000..d6430ab8b4 --- /dev/null +++ b/scripts/azure-npm.yaml @@ -0,0 +1,173 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: azure-npm + namespace: kube-system + labels: + addonmanager.kubernetes.io/mode: EnsureExists +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: azure-npm + namespace: kube-system + labels: + addonmanager.kubernetes.io/mode: EnsureExists +rules: + - apiGroups: + - "" + resources: + - pods + - nodes + - namespaces + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: azure-npm-binding + namespace: kube-system + labels: + addonmanager.kubernetes.io/mode: EnsureExists +subjects: + - kind: ServiceAccount + name: azure-npm + namespace: kube-system +roleRef: + kind: ClusterRole + name: azure-npm + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: azure-npm + namespace: kube-system + labels: + app: azure-npm + addonmanager.kubernetes.io/mode: EnsureExists +spec: + selector: + matchLabels: + 
k8s-app: azure-npm + template: + metadata: + labels: + k8s-app: azure-npm + annotations: + scheduler.alpha.kubernetes.io/critical-pod: "" + azure.npm/scrapeable: "" + spec: + priorityClassName: system-node-critical + tolerations: + - operator: "Exists" + effect: NoExecute + - operator: "Exists" + effect: NoSchedule + - key: CriticalAddonsOnly + operator: Exists + containers: + - name: azure-npm + image: mcr.microsoft.com/containernetworking/azure-npm:v1.4.45.3 + resources: {} + securityContext: + privileged: false + capabilities: + add: + - NET_ADMIN + readOnlyRootFilesystem: true + env: + - name: HOSTNAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: NPM_CONFIG + value: /etc/azure-npm/azure-npm.json + volumeMounts: + - name: log + mountPath: /var/log + - name: xtables-lock + mountPath: /run/xtables.lock + - name: protocols + mountPath: /etc/protocols + - name: azure-npm-config + mountPath: /etc/azure-npm + - name: tmp + mountPath: /tmp + hostNetwork: true + hostUsers: false + nodeSelector: + kubernetes.io/os: linux + volumes: + - name: log + hostPath: + path: /var/log + type: Directory + - name: xtables-lock + hostPath: + path: /run/xtables.lock + type: File + - name: protocols + hostPath: + path: /etc/protocols + type: File + - name: azure-npm-config + configMap: + name: azure-npm-config + - name: tmp + emptyDir: {} + serviceAccountName: azure-npm +--- +apiVersion: v1 +kind: Service +metadata: + name: npm-metrics-cluster-service + namespace: kube-system + labels: + app: npm-metrics +spec: + selector: + k8s-app: azure-npm + ports: + - port: 9000 + targetPort: 10091 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: azure-npm-config + namespace: kube-system +data: + azure-npm.json: | + { + "ResyncPeriodInMinutes": 15, + "ListeningPort": 10091, + "ListeningAddress": "0.0.0.0", + "ApplyIntervalInMilliseconds": 500, + "ApplyMaxBatches": 100, + "MaxBatchedACLsPerPod": 30, + "NetPolInvervalInMilliseconds": 500, + 
"MaxPendingNetPols": 100, + "Toggles": { + "EnablePrometheusMetrics": true, + "EnablePprof": true, + "EnableHTTPDebugAPI": true, + "EnableV2NPM": true, + "PlaceAzureChainFirst": false, + "ApplyInBackground": true, + "NetPolInBackground": true + }, + "LogLevel": "info" + } \ No newline at end of file diff --git a/steps/cleanup-resources.yml b/steps/cleanup-resources.yml index 6692a79fce..991af9bcc6 100644 --- a/steps/cleanup-resources.yml +++ b/steps/cleanup-resources.yml @@ -30,7 +30,7 @@ steps: echo "Delete resource group $RUN_ID" az group delete --name $RUN_ID --yes displayName: "Destroy Resource Group" - condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false')) + condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true')) - ${{ if eq(variables['Build.SourceBranchName'], 'main') }}: - template: /steps/collect-terraform-operation-metadata.yml diff --git a/steps/collect-terraform-operation-metadata.yml b/steps/collect-terraform-operation-metadata.yml index fd42cea29e..ec9cb28f64 100644 --- a/steps/collect-terraform-operation-metadata.yml +++ b/steps/collect-terraform-operation-metadata.yml @@ -12,7 +12,7 @@ steps: $TERRAFORM_WORKING_DIRECTORY "$TEST_RESULTS_DIR/terraform_operation_metadata.json" "$SCENARIO_TYPE" "$SCENARIO_NAME" echo "##vso[task.setvariable variable=TERRAFORM_OPERATION_METADATA_FILE]$TEST_RESULTS_DIR/terraform_operation_metadata.json" displayName: "Collect Terraform Operation Metadata" - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') workingDirectory: modules/python/terraform env: PYTHON_SCRIPT_FILE: $(Pipeline.Workspace)/s/modules/python/terraform/extract_terraform_operation_metadata.py diff --git a/steps/post-provisioning-config.yml b/steps/post-provisioning-config.yml new file mode 100644 index 0000000000..98e5a8a923 --- /dev/null +++ b/steps/post-provisioning-config.yml @@ -0,0 
+1,25 @@ +parameters: +- name: cloud + type: string +- name: regions + type: object + +steps: +- script: | + set -eu + if [[ "$CLOUD" == "azure" ]]; then + # For Azure, check if npm_enabled is true + if [[ "$NPM_ENABLED" == "true" ]]; then + echo "NPM is enabled, applying kubectl configuration..." + + # Get AKS credentials + aks_name=$(az aks list --resource-group $RUN_ID --query "[0].name" -o tsv) + az aks get-credentials --resource-group $RUN_ID --name $aks_name --overwrite-existing + + # Apply kubectl configuration + kubectl apply -f ../scripts/azure-npm.yaml + echo "NPM configuration applied successfully!" + else + echo "NPM is not enabled, skipping kubectl configuration." + fi + fi \ No newline at end of file diff --git a/steps/provision-resources.yml b/steps/provision-resources.yml index 992bf04620..cf61a821af 100644 --- a/steps/provision-resources.yml +++ b/steps/provision-resources.yml @@ -58,7 +58,7 @@ steps: echo "Owner: $owner" echo "##vso[task.setvariable variable=OWNER]$owner" displayName: "Get Deletion Due Time and Owner" - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: region: ${{ parameters.regions[0] }} @@ -68,7 +68,7 @@ steps: az group create --name $RUN_ID --location $region \ --tags "run_id=$RUN_ID" "scenario=${SCENARIO_TYPE}-${SCENARIO_NAME}" "owner=${OWNER}" "creation_date=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" "deletion_due_time=${DELETION_DUE_TIME}" "SkipAKSCluster=1" displayName: "Create Resource Group" - condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false')) + condition: and(${{ eq(parameters.cloud, 'azure') }}, ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true')) env: region: ${{ parameters.regions[0] }} diff --git a/steps/ssh/setup-key.yml b/steps/ssh/setup-key.yml index 2f33467d2e..2e62598c17 100644 --- a/steps/ssh/setup-key.yml +++ b/steps/ssh/setup-key.yml @@ -28,4 +28,4 @@ steps: cat $SSH_KEY_PATH echo 
"##vso[task.setvariable variable=SSH_KEY_PATH;]${SSH_KEY_PATH}" displayName: "Download SSH Key" - condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') diff --git a/steps/terraform/run-command.yml b/steps/terraform/run-command.yml index 8df15c3317..6614d25bcd 100644 --- a/steps/terraform/run-command.yml +++ b/steps/terraform/run-command.yml @@ -81,7 +81,7 @@ steps: terraform ${{ parameters.command }} ${{ parameters.arguments }} fi displayName: "Run Terraform ${{ parameters.command }} Command" - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') retryCountOnTaskFailure: ${{ parameters.retry_attempt_count }} env: REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-file.yml b/steps/terraform/set-input-file.yml index 75218fd508..af3ac24570 100644 --- a/steps/terraform/set-input-file.yml +++ b/steps/terraform/set-input-file.yml @@ -29,7 +29,7 @@ steps: echo "##vso[task.setvariable variable=REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input File' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-variables-aws.yml b/steps/terraform/set-input-variables-aws.yml index 3465e0d6e9..f0fc1a7e01 100644 --- a/steps/terraform/set-input-variables-aws.yml +++ b/steps/terraform/set-input-variables-aws.yml @@ -48,7 +48,7 @@ steps: echo "##vso[task.setvariable variable=TERRAFORM_REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input Variables' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git 
a/steps/terraform/set-input-variables-azure.yml b/steps/terraform/set-input-variables-azure.yml index 7f1a830f72..52000e5296 100644 --- a/steps/terraform/set-input-variables-azure.yml +++ b/steps/terraform/set-input-variables-azure.yml @@ -37,6 +37,7 @@ steps: --arg aks_kubernetes_version "$KUBERNETES_VERSION" \ --arg aks_network_policy "$NETWORK_POLICY" \ --arg aks_network_dataplane "$NETWORK_DATAPLANE" \ + --arg npm_enabled "$NPM_ENABLED" \ --arg k8s_machine_type "$K8S_MACHINE_TYPE" \ --arg k8s_os_disk_type "$K8S_OS_DISK_TYPE" \ --argjson aks_custom_headers "$AKS_CUSTOM_HEADERS" \ @@ -49,6 +50,7 @@ steps: aks_kubernetes_version: $aks_kubernetes_version, aks_network_policy: $aks_network_policy, aks_network_dataplane: $aks_network_dataplane, + npm_enabled: $npm_enabled, k8s_machine_type: $k8s_machine_type, k8s_os_disk_type: $k8s_os_disk_type, aks_custom_headers: $aks_custom_headers, @@ -68,7 +70,7 @@ steps: echo "##vso[task.setvariable variable=TERRAFORM_REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input Variables' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-input-variables-gcp.yml b/steps/terraform/set-input-variables-gcp.yml index 2b8e24def4..4879640cb0 100644 --- a/steps/terraform/set-input-variables-gcp.yml +++ b/steps/terraform/set-input-variables-gcp.yml @@ -39,7 +39,7 @@ steps: echo "##vso[task.setvariable variable=TERRAFORM_REGIONAL_CONFIG]$regional_config_str" displayName: 'Set Terraform Input Variables' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: CLOUD: ${{ parameters.cloud }} REGIONS: ${{ convertToJson(parameters.regions) }} diff --git a/steps/terraform/set-user-data-path.yml b/steps/terraform/set-user-data-path.yml index 81c7a837a2..49f931a0b8 
100644 --- a/steps/terraform/set-user-data-path.yml +++ b/steps/terraform/set-user-data-path.yml @@ -20,6 +20,6 @@ steps: fi displayName: 'Set Terraform User Data Path' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: USER_DATA_PATH: ${{ parameters.user_data_path }} diff --git a/steps/terraform/set-working-directory.yml b/steps/terraform/set-working-directory.yml index f044ad7185..d42f4673a5 100644 --- a/steps/terraform/set-working-directory.yml +++ b/steps/terraform/set-working-directory.yml @@ -16,7 +16,7 @@ steps: echo "Terraform Working Directory: $terraform_working_directory" displayName: 'Set Terraform Working Directory' - condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: ne(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') env: CLOUD: ${{ parameters.cloud }} MODULES_DIR: ${{ parameters.modules_dir }} diff --git a/steps/validate-resources.yml b/steps/validate-resources.yml index a5f1b2834f..b6d54c5e46 100644 --- a/steps/validate-resources.yml +++ b/steps/validate-resources.yml @@ -16,7 +16,7 @@ steps: exit 1 fi displayName: "Validate OWNER info" - condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'false') + condition: eq(variables['SKIP_RESOURCE_MANAGEMENT'], 'true') - template: /steps/topology/${{ parameters.topology }}/validate-resources.yml@self parameters: cloud: ${{ parameters.cloud }} From 02b6218200015605f07b55aa25e080169c42e17e Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Fri, 9 May 2025 15:46:05 +0100 Subject: [PATCH 16/36] Remove unused parameters and simplify run ID handling in setup-tests and network-churn configurations --- jobs/competitive-test.yml | 2 -- .../network-churn/cilium-network-churn.yml | 1 - steps/setup-tests.yml | 14 +------------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index 441887b069..ca7a265fb9 100644 --- a/jobs/competitive-test.yml +++ 
b/jobs/competitive-test.yml @@ -62,12 +62,10 @@ jobs: cloud: ${{ parameters.cloud }} region: ${{ parameters.regions[0] }} run_id: ${{ parameters.run_id }} - run_id_2: ${{ parameters.run_id_2 }} test_modules_dir: ${{ parameters.test_modules_dir }} retry_attempt_count: ${{ parameters.retry_attempt_count }} credential_type: ${{ parameters.credential_type }} ssh_key_enabled: ${{ parameters.ssh_key_enabled }} - use_secondary_cluster: ${{ parameters.use_secondary_cluster }} - template: /steps/provision-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 31535050ec..fcb1ba786f 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -100,4 +100,3 @@ stages: timeout_in_minutes: 720 credential_type: service_connection ssh_key_enabled: false - use_secondary_cluster: true diff --git a/steps/setup-tests.yml b/steps/setup-tests.yml index 2a5158bd9c..171e71f188 100644 --- a/steps/setup-tests.yml +++ b/steps/setup-tests.yml @@ -9,9 +9,6 @@ parameters: - name: run_id type: string default: '' -- name: run_id_2 - type: string - default: '' - name: retry_attempt_count type: number default: 3 @@ -19,18 +16,10 @@ parameters: type: string - name: ssh_key_enabled type: boolean -- name: use_secondary_cluster - type: boolean - default: false steps: - script: | - set -eu - if [ "${{ parameters.use_secondary_cluster }}" == "True" ] && [ -n "${RUN_ID_2:-}" ]; then - echo "Using secondary cluster" - run_id=$RUN_ID_2 - elif [ -n "${RUN_ID:-}" ]; then - echo "Using primary cluster" + if [ -n "$RUN_ID" ]; then run_id=$RUN_ID else run_id=$(Build.BuildId)-$(System.JobId) @@ -40,7 +29,6 @@ steps: displayName: "Set Run ID" env: RUN_ID: ${{ parameters.run_id }} - RUN_ID_2: ${{ parameters.run_id_2 }} - script: | 
run_url="$(System.TeamFoundationCollectionUri)$(System.TeamProject)/_build/results?buildId=$(Build.BuildId)&view=logs&j=$(System.JobId)" From 5133c998bdfb2d9da1015a07db6866db593d71e2 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Fri, 9 May 2025 15:50:14 +0100 Subject: [PATCH 17/36] Remove npm_enabled configuration from AKS resource --- modules/terraform/azure/aks/main.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/terraform/azure/aks/main.tf b/modules/terraform/azure/aks/main.tf index d28758d007..d802e515de 100644 --- a/modules/terraform/azure/aks/main.tf +++ b/modules/terraform/azure/aks/main.tf @@ -37,7 +37,6 @@ resource "azurerm_kubernetes_cluster" "aks" { network_plugin_mode = var.aks_config.network_profile.network_plugin_mode network_policy = try(coalesce(var.network_policy, var.aks_config.network_profile.network_policy), null) network_data_plane = try(coalesce(var.network_dataplane, var.aks_config.network_profile.network_dataplane), null) - npm_enabled = try(coalesce(var.npm_enabled, var.aks_config.network_profile.npm_enabled), false) outbound_type = var.aks_config.network_profile.outbound_type pod_cidr = var.aks_config.network_profile.pod_cidr service_cidr = var.aks_config.network_profile.service_cidr From f9e0579b1d3371a76c02853da822e7841cd0a9ae Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Fri, 9 May 2025 16:01:40 +0100 Subject: [PATCH 18/36] Remove npm_enabled variable and its references from AKS configuration --- modules/terraform/azure/aks/variables.tf | 7 ------- 1 file changed, 7 deletions(-) diff --git a/modules/terraform/azure/aks/variables.tf b/modules/terraform/azure/aks/variables.tf index 678cf11ac2..c4c7eb39c6 100644 --- a/modules/terraform/azure/aks/variables.tf +++ b/modules/terraform/azure/aks/variables.tf @@ -54,12 +54,6 @@ variable "network_dataplane" { default = null } -variable "npm_enabled" { - description = "Whether to enable NPM configuration" - type = bool - default = false -} - variable "aks_config" { type = object({ 
role = string @@ -71,7 +65,6 @@ variable "aks_config" { network_plugin_mode = optional(string, null) network_policy = optional(string, null) network_dataplane = optional(string, null) - npm_enabled = optional(bool, false) outbound_type = optional(string, null) pod_cidr = optional(string, null) service_cidr = optional(string, null) From 52489356ad6bc2ee324d074bdf396f5ee0c99b16 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Fri, 9 May 2025 16:08:39 +0100 Subject: [PATCH 19/36] Remove npm_enabled argument from Terraform input variables --- steps/terraform/set-input-variables-azure.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/steps/terraform/set-input-variables-azure.yml b/steps/terraform/set-input-variables-azure.yml index 52000e5296..2b91aabd48 100644 --- a/steps/terraform/set-input-variables-azure.yml +++ b/steps/terraform/set-input-variables-azure.yml @@ -37,7 +37,6 @@ steps: --arg aks_kubernetes_version "$KUBERNETES_VERSION" \ --arg aks_network_policy "$NETWORK_POLICY" \ --arg aks_network_dataplane "$NETWORK_DATAPLANE" \ - --arg npm_enabled "$NPM_ENABLED" \ --arg k8s_machine_type "$K8S_MACHINE_TYPE" \ --arg k8s_os_disk_type "$K8S_OS_DISK_TYPE" \ --argjson aks_custom_headers "$AKS_CUSTOM_HEADERS" \ @@ -50,7 +49,6 @@ steps: aks_kubernetes_version: $aks_kubernetes_version, aks_network_policy: $aks_network_policy, aks_network_dataplane: $aks_network_dataplane, - npm_enabled: $npm_enabled, k8s_machine_type: $k8s_machine_type, k8s_os_disk_type: $k8s_os_disk_type, aks_custom_headers: $aks_custom_headers, From bc80b8fedbd3286173fa8d2c2907851071b74c94 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 13:31:59 +0100 Subject: [PATCH 20/36] Add Azure NPM configuration step with conditional execution --- .../network-churn/validate-resources.yml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index fe97a40cbc..fc32bcea14 
100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -11,3 +11,31 @@ steps: parameters: role: net region: ${{ parameters.regions[0] }} + + - script: | + #!/bin/bash + set -e + + # Check if NPM_ENABLED environment variable is set to true + if [[ "$NPM_ENABLED" == "true" ]]; then + echo "NPM is enabled, applying Azure NPM configuration..." + + # Path to the NPM configuration file + npm_config_file="$(Pipeline.Workspace)/s/scripts/azure_npm.yml" + + # Check if the file exists + if [[ -f "$npm_config_file" ]]; then + echo "Applying NPM configuration from: $npm_config_file" + kubectl apply -f "$npm_config_file" + echo "Azure NPM configuration applied successfully!" + else + echo "Error: NPM configuration file not found at $npm_config_file" + echo "Looking for file in current directory:" + ls -la "$(Pipeline.Workspace)/s/scripts/" + exit 1 + fi + else + echo "NPM is not enabled, skipping Azure NPM configuration." + fi + displayName: 'Apply Azure NPM Configuration (if enabled)' + condition: and(succeeded(), eq('${{ parameters.cloud }}', 'azure')) \ No newline at end of file From 7f645e8b0b5a7e14999b4c4968d1dd11cc2ab7ef Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 13:33:10 +0100 Subject: [PATCH 21/36] Set npm_enabled to False in network-churn configuration --- .../CNI Benchmark/network-churn/cilium-network-churn.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index fcb1ba786f..32ad854ed0 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -60,6 +60,7 @@ stages: no_of_namespaces: ${{ parameters.no_of_namespaces }} total_network_policies: ${{ parameters.total_nework_policies }} cilium_enabled: True + npm_enabled: False 
network_policy: cilium network_dataplane: cilium service_test: False From 79e3863f913fa33f05d2165d5479e1aff4df1a7f Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 13:34:41 +0100 Subject: [PATCH 22/36] Remove post-provisioning configuration step and clean up credential handling --- jobs/competitive-test.yml | 6 +----- steps/post-provisioning-config.yml | 25 ------------------------- 2 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 steps/post-provisioning-config.yml diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index ca7a265fb9..beaa9662b2 100644 --- a/jobs/competitive-test.yml +++ b/jobs/competitive-test.yml @@ -75,11 +75,7 @@ jobs: terraform_arguments: ${{ parameters.terraform_arguments }} terraform_input_varibles: ${{ parameters.terraform_input_varibles }} retry_attempt_count: ${{ parameters.retry_attempt_count }} - credential_type: ${{ parameters.credential_type }} - - template: /steps/post-provisioning-config.yml - parameters: - cloud: ${{ parameters.cloud }} - regions: ${{ parameters.regions }} + credential_type: ${{ parameters.credential_type }} - template: /steps/validate-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/steps/post-provisioning-config.yml b/steps/post-provisioning-config.yml deleted file mode 100644 index 98e5a8a923..0000000000 --- a/steps/post-provisioning-config.yml +++ /dev/null @@ -1,25 +0,0 @@ -parameters: -- name: cloud - type: string -- name: regions - type: object - -steps: -- script: | - set -eu - if [[ "$CLOUD" == "azure" ]]; then - # For Azure, check if npm_enabled is true - if [[ "$NPM_ENABLED" == "true" ]]; then - echo "NPM is enabled, applying kubectl configuration..." 
- - # Get AKS credentials - aks_name=$(az aks list --resource-group $RUN_ID --query "[0].name" -o tsv) - az aks get-credentials --resource-group $RUN_ID --name $aks_name --overwrite-existing - - # Apply kubectl configuration - kubectl apply -f ../scripts/azure-npm.yaml - echo "NPM configuration applied successfully!" - else - echo "NPM is not enabled, skipping kubectl configuration." - fi - fi \ No newline at end of file From 67a992bed6c819ebed078383b345a455cd5c671c Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 13:50:26 +0100 Subject: [PATCH 23/36] Add scale-cluster template to network-churn steps for resource validation --- steps/topology/network-churn/collect-clusterloader2.yml | 7 +++++++ steps/topology/network-churn/validate-resources.yml | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/steps/topology/network-churn/collect-clusterloader2.yml b/steps/topology/network-churn/collect-clusterloader2.yml index 5c5105adac..272b3d1291 100644 --- a/steps/topology/network-churn/collect-clusterloader2.yml +++ b/steps/topology/network-churn/collect-clusterloader2.yml @@ -16,6 +16,13 @@ steps: engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} +- template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + nodes_per_nodepool: 0 + enable_autoscale: "false" + - script: | run_id=$(Build.BuildId)-$(System.JobId) echo "Run ID: $run_id" diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index fc32bcea14..e904a7ceaa 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -12,6 +12,13 @@ steps: role: net region: ${{ parameters.regions[0] }} + - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + nodes_per_nodepool: 240 + enable_autoscale: 
"false" + - script: | #!/bin/bash set -e From de1230191e3b4da3ac3073354a9968adb45c69f8 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 14:00:45 +0100 Subject: [PATCH 24/36] debug steps --- steps/topology/network-churn/validate-resources.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index e904a7ceaa..f0e6c641e2 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -16,15 +16,21 @@ steps: parameters: role: net region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 240 + nodes_per_nodepool: 0 # fix it to 240 enable_autoscale: "false" + - script: | + echo "Dumping all environment variables:" + env | sort + echo "NPM_ENABLED value: '$NPM_ENABLED'" + displayName: 'Debug Environment Variables' + - script: | #!/bin/bash set -e # Check if NPM_ENABLED environment variable is set to true - if [[ "$NPM_ENABLED" == "true" ]]; then + if [[ "$NPM_ENABLED" == "true" ]] || [[ "$NPM_ENABLED" == "True" ]] || [[ "$NPM_ENABLED" == true ]]; then echo "NPM is enabled, applying Azure NPM configuration..." 
# Path to the NPM configuration file From 18e17d16f8e6528e27695d095c73d27ed57db3b3 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 14:55:01 +0100 Subject: [PATCH 25/36] Refactor NPM_ENABLED check and update Azure NPM configuration file path --- .../network-churn/validate-resources.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index f0e6c641e2..7c0bd3def5 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -12,12 +12,12 @@ steps: role: net region: ${{ parameters.regions[0] }} - - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml - parameters: - role: net - region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 0 # fix it to 240 - enable_autoscale: "false" + # - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + # parameters: + # role: net + # region: ${{ parameters.regions[0] }} + # nodes_per_nodepool: 240 # fix it to 240 + # enable_autoscale: "false" - script: | echo "Dumping all environment variables:" @@ -30,11 +30,11 @@ steps: set -e # Check if NPM_ENABLED environment variable is set to true - if [[ "$NPM_ENABLED" == "true" ]] || [[ "$NPM_ENABLED" == "True" ]] || [[ "$NPM_ENABLED" == true ]]; then + if [[ "$NPM_ENABLED" == "True" ]]; then echo "NPM is enabled, applying Azure NPM configuration..." 
# Path to the NPM configuration file - npm_config_file="$(Pipeline.Workspace)/s/scripts/azure_npm.yml" + npm_config_file="$(Pipeline.Workspace)/s/scripts/azure-npm.yaml" # Check if the file exists if [[ -f "$npm_config_file" ]]; then From 45593d148606b06fec70a6338ab4cbc6c0caa3a2 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 15:29:10 +0100 Subject: [PATCH 26/36] Restore scale-cluster template and remove debug environment variables step --- .../network-churn/validate-resources.yml | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index 7c0bd3def5..9c79ae9e5c 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -12,18 +12,12 @@ steps: role: net region: ${{ parameters.regions[0] }} - # - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml - # parameters: - # role: net - # region: ${{ parameters.regions[0] }} - # nodes_per_nodepool: 240 # fix it to 240 - # enable_autoscale: "false" - - - script: | - echo "Dumping all environment variables:" - env | sort - echo "NPM_ENABLED value: '$NPM_ENABLED'" - displayName: 'Debug Environment Variables' + - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml + parameters: + role: net + region: ${{ parameters.regions[0] }} + nodes_per_nodepool: 240 + enable_autoscale: "false" - script: | #!/bin/bash From 456f726ed89f33b04558a1b42b2abe81158cb739 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 13 May 2025 18:05:16 +0100 Subject: [PATCH 27/36] Rename "prompool" to "promnodepool" in extra node pool configuration --- .../network-policy-churn/terraform-inputs/azure.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars 
index 292db83ed2..03db090e69 100644 --- a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars @@ -45,7 +45,7 @@ aks_config_list = [ } extra_node_pool = [ { - name = "prompool" + name = "promnodepool" node_count = 1 auto_scaling_enabled = false vm_size = "Standard_D64_v3" From c0316fcc1a701ec8f560eaee69c0e582c15c84dd Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 14 May 2025 10:06:12 +0100 Subject: [PATCH 28/36] Update default node_count parameter to 24 in network-churn configuration --- .../CNI Benchmark/network-churn/cilium-network-churn.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 32ad854ed0..2511bc7daa 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -10,7 +10,7 @@ schedules: parameters: - name: node_count type: number - default: 240 + default: 24 - name: node_per_step type: number default: 240 From a0c6e8dc42a1eea6eaacf0f2760265300ec6e373 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 14 May 2025 10:07:24 +0100 Subject: [PATCH 29/36] Remove hardcoded nodes_per_namespace value for network test in calculate_config function --- modules/python/clusterloader2/slo/slo.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 9b91f81ce5..5d83a5342a 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -27,9 +27,6 @@ def calculate_config(cpu_per_node, node_count, max_pods, provider, service_test, throughput = 100 nodes_per_namespace = min(node_count, DEFAULT_NODES_PER_NAMESPACE) - if network_test: - nodes_per_namespace = 24 #TODO: itia: 
fix hardcoded value - pods_per_node = DEFAULT_PODS_PER_NODE if service_test or network_test: pods_per_node = max_pods From 5ce88134d10259f4169d77dee8443786f5b1102d Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 14 May 2025 16:19:25 +0100 Subject: [PATCH 30/36] Comment out the schedules section in cilium-network-churn configuration --- .../network-churn/cilium-network-churn.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 2511bc7daa..4c8757c246 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -1,11 +1,11 @@ trigger: none -schedules: - - cron: "0 */6 * * *" - displayName: "Every 6 Hours Daily" - branches: - include: - - itia/network-churn - always: true +# schedules: +# - cron: "0 */6 * * *" +# displayName: "Every 6 Hours Daily" +# branches: +# include: +# - itia/network-churn +# always: true parameters: - name: node_count From 3fdfb57852f25305a89b8bacc7d491249a341ce8 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Wed, 14 May 2025 17:25:09 +0100 Subject: [PATCH 31/36] Update load-config to enable service test and fix total pods calculation --- jobs/competitive-test.yml | 2 +- modules/python/clusterloader2/slo/config/load-config.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/jobs/competitive-test.yml b/jobs/competitive-test.yml index beaa9662b2..c4872ad11c 100644 --- a/jobs/competitive-test.yml +++ b/jobs/competitive-test.yml @@ -75,7 +75,7 @@ jobs: terraform_arguments: ${{ parameters.terraform_arguments }} terraform_input_varibles: ${{ parameters.terraform_input_varibles }} retry_attempt_count: ${{ parameters.retry_attempt_count }} - credential_type: ${{ parameters.credential_type }} + credential_type: ${{ 
parameters.credential_type }} - template: /steps/validate-resources.yml parameters: cloud: ${{ parameters.cloud }} diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 2f59737548..64dd73eb23 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -1,7 +1,7 @@ name: load-config # Config options for test type -{{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST false}} +{{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST true}} {{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}} {{$CNP_TEST := DefaultParam .CL2_CNP_TEST false}} {{$CCNP_TEST := DefaultParam .CL2_CCNP_TEST false}} @@ -19,7 +19,7 @@ name: load-config {{$nodes := DefaultParam .CL2_NODES 1000}} {{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}} -{{$totalPods := MultiplyInt $namespaces $nodesPerNamespace $podsPerNode}} +{{$totalPods := MultiplyInt $namespaces $nodes $podsPerNode}} {{$podsPerNamespace := DivideInt $totalPods $namespaces}} {{$deploymentsPerNamespace := DivideInt $podsPerNamespace $deploymentSize}} From adc8831245ffe11d7fe833b631fb2c8de0bfed14 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Thu, 15 May 2025 15:47:20 +0100 Subject: [PATCH 32/36] Refactor YAML configurations for clarity and consistency --- .../network-churn/cilium-network-churn.yml | 14 +++++++------- .../terraform-inputs/azure.tfvars | 4 ++-- scripts/azure-npm.yaml | 4 ++-- .../topology/network-churn/validate-resources.yml | 10 +++++----- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 4c8757c246..9f5bdaa6ad 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -1,11 
+1,11 @@ trigger: none -# schedules: -# - cron: "0 */6 * * *" -# displayName: "Every 6 Hours Daily" -# branches: -# include: -# - itia/network-churn -# always: true +schedules: + - cron: "0 */6 * * *" + displayName: "Every 6 Hours Daily" + branches: + include: + - main + always: true parameters: - name: node_count diff --git a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars index 03db090e69..8484e92af2 100644 --- a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars @@ -60,8 +60,8 @@ aks_config_list = [ auto_scaling_enabled = true vm_size = "Standard_D4_v3" max_pods = 250 - node_labels = { "slo" = "true", - "test-np" = "net-policy-client" } + node_labels = { "slo" = "true", + "test-np" = "net-policy-client" } } ] kubernetes_version = "1.32" diff --git a/scripts/azure-npm.yaml b/scripts/azure-npm.yaml index d6430ab8b4..551b264179 100644 --- a/scripts/azure-npm.yaml +++ b/scripts/azure-npm.yaml @@ -85,7 +85,7 @@ spec: privileged: false capabilities: add: - - NET_ADMIN + - NET_ADMIN readOnlyRootFilesystem: true env: - name: HOSTNAME @@ -170,4 +170,4 @@ data: "NetPolInBackground": true }, "LogLevel": "info" - } \ No newline at end of file + } diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index 9c79ae9e5c..3c31bb76ce 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -16,20 +16,20 @@ steps: parameters: role: net region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 240 + nodes_per_nodepool: 240 enable_autoscale: "false" - script: | #!/bin/bash set -e - + # Check if NPM_ENABLED environment variable is set to true if [[ "$NPM_ENABLED" == "True" ]]; then echo "NPM is enabled, applying Azure NPM configuration..." 
- + # Path to the NPM configuration file npm_config_file="$(Pipeline.Workspace)/s/scripts/azure-npm.yaml" - + # Check if the file exists if [[ -f "$npm_config_file" ]]; then echo "Applying NPM configuration from: $npm_config_file" @@ -45,4 +45,4 @@ steps: echo "NPM is not enabled, skipping Azure NPM configuration." fi displayName: 'Apply Azure NPM Configuration (if enabled)' - condition: and(succeeded(), eq('${{ parameters.cloud }}', 'azure')) \ No newline at end of file + condition: and(succeeded(), eq('${{ parameters.cloud }}', 'azure')) From f02e1713dd024e193dcb0e88034e04bd2f0459d8 Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Mon, 19 May 2025 17:59:09 +0100 Subject: [PATCH 33/36] Refactor test execution scripts to remove redundant timestamp setup and add timestamp appending to test results --- steps/collect-telescope-metadata.yml | 4 ++-- .../network-policy-scale/execute.yml | 6 ------ steps/execute-tests.yml | 6 ------ .../collect-clusterloader2.yml | 14 ++++++++++++++ 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/steps/collect-telescope-metadata.yml b/steps/collect-telescope-metadata.yml index e85c86288a..4fd3fd5872 100644 --- a/steps/collect-telescope-metadata.yml +++ b/steps/collect-telescope-metadata.yml @@ -90,8 +90,8 @@ steps: # Append run_id to the test results file if the file exists set -eux if [ -f "$(TEST_RESULTS_FILE)" ]; then - jq --arg telescope_run_id $RUN_ID --arg start_timestamp $START \ - -c '. + {telescope_run_id: $telescope_run_id, start_timestamp: $start_timestamp}' $(TEST_RESULTS_FILE) > temp-$RUN_ID.json \ + jq --arg telescope_run_id $RUN_ID \ + -c '. + {telescope_run_id: $telescope_run_id}' $(TEST_RESULTS_FILE) > temp-$RUN_ID.json \ && mv temp-$RUN_ID.json $(TEST_RESULTS_FILE) else echo "##vso[task.logissue type=warning;]File $(TEST_RESULTS_FILE) does not exist." 
diff --git a/steps/engine/clusterloader2/network-policy-scale/execute.yml b/steps/engine/clusterloader2/network-policy-scale/execute.yml index 3f618afe90..11de7f0bf4 100644 --- a/steps/engine/clusterloader2/network-policy-scale/execute.yml +++ b/steps/engine/clusterloader2/network-policy-scale/execute.yml @@ -9,12 +9,6 @@ parameters: type: string steps: - - script: | - echo "Set the start time for test execution" - startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - echo "Start: $startTimestamp" - echo "##vso[task.setvariable variable=SLO_START_TIME]$startTimestamp" - displayName: set up timestamp variable - script: | set -eo pipefail diff --git a/steps/execute-tests.yml b/steps/execute-tests.yml index 4330d7c33d..ed8a32c752 100644 --- a/steps/execute-tests.yml +++ b/steps/execute-tests.yml @@ -13,12 +13,6 @@ parameters: default: {} steps: -- script: | - echo "Set the start time for test execution" - startTimestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") - echo "Start: $startTimestamp" - echo "##vso[task.setvariable variable=START]$startTimestamp" - displayName: Set up start timestamp variable - template: /steps/topology/${{ parameters.topology }}/execute-${{ parameters.engine }}.yml@self parameters: diff --git a/steps/topology/network-policy-scale/collect-clusterloader2.yml b/steps/topology/network-policy-scale/collect-clusterloader2.yml index 430bf27e1e..12cd440c24 100644 --- a/steps/topology/network-policy-scale/collect-clusterloader2.yml +++ b/steps/topology/network-policy-scale/collect-clusterloader2.yml @@ -15,8 +15,22 @@ steps: cloud: ${{ parameters.cloud }} engine_input: ${{ parameters.engine_input }} region: ${{ parameters.regions[0] }} + - script: | run_id=$(Build.BuildId)-$(System.JobId) echo "Run ID: $run_id" echo "##vso[task.setvariable variable=RUN_ID]$run_id" displayName: "Set unique Run ID before publish" + +- script: | + # Append timestamp to the test results file if the file exists + set -eux + if [ -f "$(TEST_RESULTS_FILE)" ]; then + jq --arg 
start_timestamp $START \ + -c '. + {start_timestamp: $start_timestamp}' $(TEST_RESULTS_FILE) > temp-$START.json \ + && mv temp-$START.json $(TEST_RESULTS_FILE) + else + echo "##vso[task.logissue type=warning;]File $(TEST_RESULTS_FILE) does not exist." + fi + displayName: "Add timestamp to Test Results" + condition: always() From 2188ec3375b8f7731fd13476a7ce13f26000679d Mon Sep 17 00:00:00 2001 From: ItiAgrawal Date: Tue, 20 May 2025 17:59:59 +0100 Subject: [PATCH 34/36] Update Azure Terraform inputs and validate resources configuration for network policy churn scenario --- .../network-policy-churn/terraform-inputs/azure.tfvars | 4 ++-- steps/topology/network-churn/validate-resources.yml | 7 ------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars index 8484e92af2..80408f971d 100644 --- a/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars +++ b/scenarios/perf-eval/network-policy-churn/terraform-inputs/azure.tfvars @@ -54,10 +54,10 @@ aks_config_list = [ }, { name = "userpool0" - node_count = 0 + node_count = 240 min_count = 0 max_count = 500 - auto_scaling_enabled = true + auto_scaling_enabled = false vm_size = "Standard_D4_v3" max_pods = 250 node_labels = { "slo" = "true", diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index 3c31bb76ce..1ce5244edd 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -12,13 +12,6 @@ steps: role: net region: ${{ parameters.regions[0] }} - - template: /steps/engine/clusterloader2/cilium/scale-cluster.yml - parameters: - role: net - region: ${{ parameters.regions[0] }} - nodes_per_nodepool: 240 - enable_autoscale: "false" - - script: | #!/bin/bash set -e From a8054256e7fe7aedc503681da49fcbfb6b116cb7 Mon Sep 17 00:00:00 
2001 From: ItiAgrawal Date: Tue, 20 May 2025 18:15:46 +0100 Subject: [PATCH 35/36] Refactor Azure NPM configuration paths and migrate azure-npm.yaml to scenario-specific directory --- .../perf-eval/network-policy-churn/kubernetes}/azure-npm.yaml | 0 steps/topology/network-churn/validate-resources.yml | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename {scripts => scenarios/perf-eval/network-policy-churn/kubernetes}/azure-npm.yaml (100%) diff --git a/scripts/azure-npm.yaml b/scenarios/perf-eval/network-policy-churn/kubernetes/azure-npm.yaml similarity index 100% rename from scripts/azure-npm.yaml rename to scenarios/perf-eval/network-policy-churn/kubernetes/azure-npm.yaml diff --git a/steps/topology/network-churn/validate-resources.yml b/steps/topology/network-churn/validate-resources.yml index 1ce5244edd..9840317079 100644 --- a/steps/topology/network-churn/validate-resources.yml +++ b/steps/topology/network-churn/validate-resources.yml @@ -21,7 +21,7 @@ steps: echo "NPM is enabled, applying Azure NPM configuration..." 
# Path to the NPM configuration file - npm_config_file="$(Pipeline.Workspace)/s/scripts/azure-npm.yaml" + npm_config_file="$(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/azure-npm.yaml" # Check if the file exists if [[ -f "$npm_config_file" ]]; then @@ -31,7 +31,7 @@ steps: else echo "Error: NPM configuration file not found at $npm_config_file" echo "Looking for file in current directory:" - ls -la "$(Pipeline.Workspace)/s/scripts/" + ls -la "$(Pipeline.Workspace)/s/scenarios/$(SCENARIO_TYPE)/$(SCENARIO_NAME)/kubernetes/" exit 1 fi else From 239c7bcc54a36104375ac441df4227dc8d258de8 Mon Sep 17 00:00:00 2001 From: Iti Agrawal Date: Wed, 21 May 2025 14:15:54 +0100 Subject: [PATCH 36/36] Add functionality to scrape kubelets metrics and capture npm metrics in case of network test (#610) Add functionality to scrape kubelets metrics and capture npm metrics in case of network test (#610) --- .../slo/config/load-config.yaml | 15 +++++ .../slo/config/modules/npm-measurements.yaml | 66 +++++++++++++++++++ modules/python/clusterloader2/slo/slo.py | 15 ++++- .../network-churn/cilium-network-churn.yml | 1 + steps/engine/clusterloader2/slo/execute.yml | 4 +- 5 files changed, 96 insertions(+), 5 deletions(-) create mode 100644 modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml diff --git a/modules/python/clusterloader2/slo/config/load-config.yaml b/modules/python/clusterloader2/slo/config/load-config.yaml index 64dd73eb23..45d7ed8df6 100644 --- a/modules/python/clusterloader2/slo/config/load-config.yaml +++ b/modules/python/clusterloader2/slo/config/load-config.yaml @@ -28,6 +28,7 @@ name: load-config {{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}} {{$CILIUM_METRICS_ENABLED := DefaultParam .CL2_CILIUM_METRICS_ENABLED false}} +{{$SCRAPE_KUBELETS := DefaultParam .CL2_SCRAPE_KUBELETS false}} {{$SCRAPE_CONTAINERD := DefaultParam .CL2_SCRAPE_CONTAINERD false}} # Service test @@ -102,6 +103,13 
@@ steps: path: /modules/network-policy/net-policy-metrics.yaml params: action: start + +{{if $SCRAPE_KUBELETS}} + - module: + path: /modules/npm-measurements.yaml + params: + action: start +{{end}} {{end}} {{if $SCRAPE_CONTAINERD}} @@ -296,6 +304,13 @@ steps: params: action: gather +{{if $SCRAPE_KUBELETS}} + - module: + path: /modules/npm-measurements.yaml + params: + action: gather +{{end}} + - module: path: modules/network-policy/net-policy-enforcement-latency.yaml params: diff --git a/modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml b/modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml new file mode 100644 index 0000000000..a1c685b982 --- /dev/null +++ b/modules/python/clusterloader2/slo/config/modules/npm-measurements.yaml @@ -0,0 +1,66 @@ +{{$action := .action}} # start, gather +# Measurement modules for Azure NPM metrics + +steps: + - name: {{$action}} NPM Agent Metrics + measurements: + - Identifier: NPMAgentsAvgCPUUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Avg CPU Usage + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc90 + query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc50 + query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - Identifier: NPMAgentsMaxCPUUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Max CPU Usage + metricVersion: v1 + unit: cpu + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc90 + query: 
quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - name: Perc50 + query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:])) + - Identifier: NPMAgentsAvgMemUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Avg Memory Usage + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.50, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - Identifier: NPMAgentsMaxMemUsage + Method: GenericPrometheusQuery + Params: + action: {{$action}} + metricName: NPM Agent Max Memory Usage + metricVersion: v1 + unit: MB + enableViolations: true + queries: + - name: Perc99 + query: quantile(0.99, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc90 + query: quantile(0.90, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) + - name: Perc50 + query: quantile(0.50, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024) diff --git a/modules/python/clusterloader2/slo/slo.py b/modules/python/clusterloader2/slo/slo.py index 5d83a5342a..15d8cddaee 100644 --- a/modules/python/clusterloader2/slo/slo.py +++ b/modules/python/clusterloader2/slo/slo.py @@ -52,6 +52,7 @@ def configure_clusterloader2( operation_timeout, provider, cilium_enabled, + scrape_kubelets, scrape_containerd, service_test, network_test, @@ -91,6 
+92,9 @@ def configure_clusterloader2( file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n") file.write("CONTAINERD_SCRAPE_INTERVAL: 5m\n") + if scrape_kubelets: + file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n") + if cilium_enabled: file.write("CL2_CILIUM_METRICS_ENABLED: true\n") file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n") @@ -161,11 +165,12 @@ def execute_clusterloader2( cl2_config_file, kubeconfig, provider, + scrape_kubelets, scrape_containerd ): run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider, cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True, - scrape_containerd=scrape_containerd) + scrape_kubelets=scrape_kubelets, scrape_containerd=scrape_containerd) def collect_clusterloader2( cpu_per_node, @@ -264,6 +269,8 @@ def main(): parser_configure.add_argument("provider", type=str, help="Cloud provider name") parser_configure.add_argument("cilium_enabled", type=str2bool, choices=[True, False], default=False, help="Whether cilium is enabled. Must be either True or False") + parser_configure.add_argument("scrape_kubelets", type=str2bool, choices=[True, False], default=False, + help="Whether to scrape kubelet metrics. Must be either True or False") parser_configure.add_argument("scrape_containerd", type=str2bool, choices=[True, False], default=False, help="Whether to scrape containerd metrics. Must be either True or False") parser_configure.add_argument("service_test", type=str2bool, choices=[True, False], default=False, @@ -295,6 +302,8 @@ def main(): parser_execute.add_argument("cl2_config_file", type=str, help="Path to the CL2 config file") parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file") parser_execute.add_argument("provider", type=str, help="Cloud provider name") + parser_execute.add_argument("scrape_kubelets", type=str2bool, choices=[True, False], default=False, + help="Whether to scrape kubelet metrics. 
Must be either True or False") parser_execute.add_argument("scrape_containerd", type=str2bool, choices=[True, False], default=False, help="Whether to scrape containerd metrics. Must be either True or False") @@ -326,14 +335,14 @@ def main(): if args.command == "configure": configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods, args.repeats, args.operation_timeout, args.provider, - args.cilium_enabled, args.scrape_containerd, + args.cilium_enabled, args.scrape_kubelets, args.scrape_containerd, args.service_test, args.network_test, args.no_of_namespaces, args.total_network_policies, args.cnp_test, args.ccnp_test, args.num_cnps, args.num_ccnps, args.dualstack, args.cl2_override_file) elif args.command == "validate": validate_clusterloader2(args.node_count, args.operation_timeout) elif args.command == "execute": execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.cl2_config_file, - args.kubeconfig, args.provider, args.scrape_containerd) + args.kubeconfig, args.provider, args.scrape_kubelets, args.scrape_containerd) elif args.command == "collect": collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.repeats, args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url, diff --git a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml index 9f5bdaa6ad..ad0f68fa14 100644 --- a/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml +++ b/pipelines/perf-eval/CNI Benchmark/network-churn/cilium-network-churn.yml @@ -93,6 +93,7 @@ stages: no_of_namespaces: ${{ parameters.no_of_namespaces }} total_network_policies: ${{ parameters.total_nework_policies }} cilium_enabled: False + scrape_kubelets: True npm_enabled: True service_test: False network_test: True diff --git a/steps/engine/clusterloader2/slo/execute.yml b/steps/engine/clusterloader2/slo/execute.yml index 
f69c71ac0d..ffe5a56049 100644 --- a/steps/engine/clusterloader2/slo/execute.yml +++ b/steps/engine/clusterloader2/slo/execute.yml @@ -21,12 +21,12 @@ steps: PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \ $CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \ - $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \ + $REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_KUBELETS:-False} ${SCRAPE_CONTAINERD:-False} \ $SERVICE_TEST ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} \ ${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \ ${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \ - ${HOME}/.kube/config $CLOUD ${SCRAPE_CONTAINERD:-False} + ${HOME}/.kube/config $CLOUD ${SCRAPE_KUBELETS:-False} ${SCRAPE_CONTAINERD:-False} workingDirectory: modules/python env: ${{ if eq(parameters.cloud, 'azure') }}: