Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions modules/python/clusterloader2/slo/config/load-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ name: load-config
{{$podStartupLatencyThreshold := DefaultParam .CL2_POD_STARTUP_LATENCY_THRESHOLD "15s"}}

{{$CILIUM_METRICS_ENABLED := DefaultParam .CL2_CILIUM_METRICS_ENABLED false}}
{{$SCRAPE_KUBELETS := DefaultParam .CL2_SCRAPE_KUBELETS false}}
{{$SCRAPE_CONTAINERD := DefaultParam .CL2_SCRAPE_CONTAINERD false}}

# Service test
Expand Down Expand Up @@ -102,6 +103,13 @@ steps:
path: /modules/network-policy/net-policy-metrics.yaml
params:
action: start

{{if $SCRAPE_KUBELETS}}
- module:
path: /modules/npm-measurements.yaml
Comment thread
agrawaliti marked this conversation as resolved.
params:
action: start
{{end}}
{{end}}

{{if $SCRAPE_CONTAINERD}}
Expand Down Expand Up @@ -296,6 +304,13 @@ steps:
params:
action: gather

{{if $SCRAPE_KUBELETS}}
- module:
path: /modules/npm-measurements.yaml
params:
action: gather
{{end}}

- module:
path: modules/network-policy/net-policy-enforcement-latency.yaml
params:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{{$action := .action}} # action supplied by the including config: start or gather
# Measurement modules for Azure NPM metrics
#
# Declares a single step holding four GenericPrometheusQuery measurements over
# the series in namespace kube-system whose pod label matches azure-npm-.*:
#   - NPMAgentsAvgCPUUsage / NPMAgentsMaxCPUUsage : CPU, unit "cpu"
#   - NPMAgentsAvgMemUsage / NPMAgentsMaxMemUsage : memory, unit "MB"
# Every measurement reports three values (Perc99, Perc90, Perc50), each a
# quantile() taken across all matched series.
# NOTE(review): every measurement sets enableViolations: true, but no query in
# this file defines a threshold - confirm whether thresholds are intended.

steps:
- name: {{$action}} NPM Agent Metrics
measurements:
# Average CPU: per-second rate of container_cpu_usage_seconds_total over a 1m
# window, averaged over a 5m subquery ([5m:]), then the quantile across series.
- Identifier: NPMAgentsAvgCPUUsage
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: NPM Agent Avg CPU Usage
metricVersion: v1
unit: cpu
enableViolations: true
queries:
- name: Perc99
query: quantile(0.99, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:]))
- name: Perc90
query: quantile(0.90, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:]))
- name: Perc50
query: quantile(0.50, avg_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:]))
# Peak CPU: same 1m rate as above, but max_over_time over the 5m subquery
# instead of avg_over_time.
- Identifier: NPMAgentsMaxCPUUsage
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: NPM Agent Max CPU Usage
metricVersion: v1
unit: cpu
enableViolations: true
queries:
- name: Perc99
query: quantile(0.99, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:]))
- name: Perc90
query: quantile(0.90, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:]))
- name: Perc50
query: quantile(0.50, max_over_time(rate(container_cpu_usage_seconds_total{namespace="kube-system", pod=~"azure-npm-.*"}[1m])[5m:]))
# Average memory: container_memory_usage_bytes averaged over a 5m subquery,
# divided by 1024 twice (bytes -> MiB, reported under unit "MB"), then the
# quantile across series.
- Identifier: NPMAgentsAvgMemUsage
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: NPM Agent Avg Memory Usage
metricVersion: v1
unit: MB
enableViolations: true
queries:
- name: Perc99
query: quantile(0.99, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024)
- name: Perc90
query: quantile(0.90, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024)
- name: Perc50
query: quantile(0.50, avg_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024)
# Peak memory: same conversion as above, but max_over_time over the 5m
# subquery instead of avg_over_time.
- Identifier: NPMAgentsMaxMemUsage
Method: GenericPrometheusQuery
Params:
action: {{$action}}
metricName: NPM Agent Max Memory Usage
metricVersion: v1
unit: MB
enableViolations: true
queries:
- name: Perc99
query: quantile(0.99, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024)
- name: Perc90
query: quantile(0.90, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024)
- name: Perc50
query: quantile(0.50, max_over_time(container_memory_usage_bytes{namespace="kube-system", pod=~"azure-npm-.*"}[5m:]) / 1024 / 1024)
15 changes: 12 additions & 3 deletions modules/python/clusterloader2/slo/slo.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def configure_clusterloader2(
operation_timeout,
provider,
cilium_enabled,
scrape_kubelets,
scrape_containerd,
service_test,
network_test,
Expand Down Expand Up @@ -91,6 +92,9 @@ def configure_clusterloader2(
file.write(f"CL2_SCRAPE_CONTAINERD: {str(scrape_containerd).lower()}\n")
file.write("CONTAINERD_SCRAPE_INTERVAL: 5m\n")

if scrape_kubelets:
file.write(f"CL2_SCRAPE_KUBELETS: {str(scrape_kubelets).lower()}\n")

if cilium_enabled:
file.write("CL2_CILIUM_METRICS_ENABLED: true\n")
file.write("CL2_PROMETHEUS_SCRAPE_CILIUM_OPERATOR: true\n")
Expand Down Expand Up @@ -161,11 +165,12 @@ def execute_clusterloader2(
cl2_config_file,
kubeconfig,
provider,
scrape_kubelets,
scrape_containerd
):
run_cl2_command(kubeconfig, cl2_image, cl2_config_dir, cl2_report_dir, provider,
cl2_config_file=cl2_config_file, overrides=True, enable_prometheus=True,
scrape_containerd=scrape_containerd)
scrape_kubelets=scrape_kubelets, scrape_containerd=scrape_containerd)

def collect_clusterloader2(
cpu_per_node,
Expand Down Expand Up @@ -264,6 +269,8 @@ def main():
parser_configure.add_argument("provider", type=str, help="Cloud provider name")
parser_configure.add_argument("cilium_enabled", type=str2bool, choices=[True, False], default=False,
help="Whether cilium is enabled. Must be either True or False")
parser_configure.add_argument("scrape_kubelets", type=str2bool, choices=[True, False], default=False,
help="Whether to scrape kubelet metrics. Must be either True or False")
parser_configure.add_argument("scrape_containerd", type=str2bool, choices=[True, False], default=False,
help="Whether to scrape containerd metrics. Must be either True or False")
parser_configure.add_argument("service_test", type=str2bool, choices=[True, False], default=False,
Expand Down Expand Up @@ -295,6 +302,8 @@ def main():
parser_execute.add_argument("cl2_config_file", type=str, help="Path to the CL2 config file")
parser_execute.add_argument("kubeconfig", type=str, help="Path to the kubeconfig file")
parser_execute.add_argument("provider", type=str, help="Cloud provider name")
parser_execute.add_argument("scrape_kubelets", type=str2bool, choices=[True, False], default=False,
help="Whether to scrape kubelet metrics. Must be either True or False")
parser_execute.add_argument("scrape_containerd", type=str2bool, choices=[True, False], default=False,
help="Whether to scrape containerd metrics. Must be either True or False")

Expand Down Expand Up @@ -326,14 +335,14 @@ def main():
if args.command == "configure":
configure_clusterloader2(args.cpu_per_node, args.node_count, args.node_per_step, args.max_pods,
args.repeats, args.operation_timeout, args.provider,
args.cilium_enabled, args.scrape_containerd,
args.cilium_enabled, args.scrape_kubelets, args.scrape_containerd,
args.service_test, args.network_test, args.no_of_namespaces, args.total_network_policies,
args.cnp_test, args.ccnp_test, args.num_cnps, args.num_ccnps, args.dualstack, args.cl2_override_file)
elif args.command == "validate":
validate_clusterloader2(args.node_count, args.operation_timeout)
elif args.command == "execute":
execute_clusterloader2(args.cl2_image, args.cl2_config_dir, args.cl2_report_dir, args.cl2_config_file,
args.kubeconfig, args.provider, args.scrape_containerd)
args.kubeconfig, args.provider, args.scrape_kubelets, args.scrape_containerd)
elif args.command == "collect":
collect_clusterloader2(args.cpu_per_node, args.node_count, args.max_pods, args.repeats,
args.cl2_report_dir, args.cloud_info, args.run_id, args.run_url,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ stages:
no_of_namespaces: ${{ parameters.no_of_namespaces }}
total_network_policies: ${{ parameters.total_nework_policies }}
cilium_enabled: False
scrape_kubelets: True
npm_enabled: True
service_test: False
network_test: True
Expand Down
4 changes: 2 additions & 2 deletions steps/engine/clusterloader2/slo/execute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ steps:

PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE configure \
$CPU_PER_NODE $NODE_COUNT $NODE_PER_STEP ${MAX_PODS:-0} \
$REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_CONTAINERD:-False} \
$REPEATS $SCALE_TIMEOUT $CLOUD $CILIUM_ENABLED ${SCRAPE_KUBELETS:-False} ${SCRAPE_CONTAINERD:-False} \
$SERVICE_TEST ${NETWORK_TEST:-False} ${NO_OF_NAMESPACES:-1} ${TOTAL_NETWORK_POLICIES:-0} \
${CNP_TEST:-False} ${CCNP_TEST:-False} ${NUM_CNPS:-0} ${NUM_CCNPS:-0} ${DUALSTACK:-False} ${CL2_CONFIG_DIR}/overrides.yaml
PYTHONPATH=$PYTHONPATH:$(pwd) python3 $PYTHON_SCRIPT_FILE execute \
${CL2_IMAGE} ${CL2_CONFIG_DIR} $CL2_REPORT_DIR $CL2_CONFIG_FILE \
${HOME}/.kube/config $CLOUD ${SCRAPE_CONTAINERD:-False}
${HOME}/.kube/config $CLOUD ${SCRAPE_KUBELETS:-False} ${SCRAPE_CONTAINERD:-False}
workingDirectory: modules/python
env:
${{ if eq(parameters.cloud, 'azure') }}:
Expand Down