Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/labels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

#
# GitHub label definitions for Skyhook (NodeWright).
# Sync to GitHub with: make labels
#
Expand Down
1 change: 1 addition & 0 deletions agent/go/cmd/agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,3 @@
"modes"
]
}

32 changes: 17 additions & 15 deletions agent/go/internal/interrupts/interrupts.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package interrupts

Expand Down
32 changes: 17 additions & 15 deletions agent/go/internal/interrupts/interrupts_test.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,20 @@
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package interrupts

Expand Down
4 changes: 2 additions & 2 deletions chart/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ All notable changes to this project will be documented in this file.

### Bug Fixes

- *(chart)* Repair immutable Deployment selector on skyhook->nodewright upgrade
- *(chart)* Repair immutable Deployment selector on skyhook->nodewright upgrade

## [chart/v0.17.0] - 2026-06-12

Expand All @@ -34,7 +34,7 @@ fix(chart): agent container path pointing to skyhook not nodewright
- Merge pull request #255 from NVIDIA/chore/chart-bump-v0.16.1

chore(chart): bump to v0.16.1 with operator webhook cert deadlock fix
- Bump chart versions
- Bump chart versions

## [chart/v0.16.1] - 2026-05-26

Expand Down
46 changes: 46 additions & 0 deletions chart/templates/skyhook-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,52 @@ spec:
setting for this Skyhook
type: boolean
type: object
drainConfig:
description: |-
DrainConfig tunes how nodes are drained before running interrupt packages.
If unset, the operator preserves its existing drain behavior.
properties:
deleteEmptyDirData:
default: true
description: |-
DeleteEmptyDirData allows draining pods that use emptyDir volumes.
Defaults to true to preserve the operator's existing behavior.
nullable: true
type: boolean
disableEviction:
default: false
description: |-
DisableEviction bypasses the eviction API and deletes pods directly.
This bypasses PodDisruptionBudgets.
nullable: true
type: boolean
force:
default: true
description: |-
Force allows draining pods not managed by a controller.
Defaults to true to preserve the operator's existing behavior.
nullable: true
type: boolean
gracePeriod:
description: |-
GracePeriod overrides the grace period used on pod eviction/delete.
Unset uses each pod's own terminationGracePeriodSeconds.
nullable: true
type: string
ignoreDaemonSets:
default: true
description: |-
IgnoreDaemonSets skips DaemonSet-managed pods during drain.
Defaults to true to preserve the operator's existing behavior.
nullable: true
type: boolean
timeout:
description: |-
Timeout bounds how long the operator waits for a node to drain.
Zero or unset means no timeout.
nullable: true
type: string
type: object
interruptionBudget:
description: InterruptionBudget configures how many nodes that match
node selectors are allowed to be interrupted at once.
Expand Down
69 changes: 69 additions & 0 deletions docs/interrupt_flow.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,75 @@ The interrupt flow is managed by the `ProcessInterrupt` and `EnsureNodeIsReadyFo
- Ensure the node is ready before proceeding with package operations
- Handle the timing and sequencing of all stages

## Drain Configuration

Interrupt-enabled Skyhooks can tune drain behavior with `spec.drainConfig`.
Unset fields preserve the operator's existing behavior:

```yaml
apiVersion: skyhook.nvidia.com/v1alpha1
kind: Skyhook
metadata:
name: gpu-mode-switch
spec:
drainConfig:
disableEviction: false
deleteEmptyDirData: true
force: true
ignoreDaemonSets: true
timeout: 10m
gracePeriod: 30s
```

The fields map to Kubernetes drain behavior:

- `disableEviction`: when `true`, pods are deleted directly instead of evicted. This bypasses PodDisruptionBudgets. The default is `false`, so the eviction API is used.
- `deleteEmptyDirData`: when `false`, pods with `emptyDir` volumes block drain. The default is `true`.
- `force`: when `false`, pods without a managing controller block drain. The default is `true`.
- `ignoreDaemonSets`: when `true`, DaemonSet-managed pods are skipped during drain. The default is `true`.
- `timeout`: bounds how long a node may spend draining. Unset or zero means no timeout. When the timeout expires, the node is marked `erroring` and package stages do not proceed on that node.
- `gracePeriod`: overrides the grace period used for eviction or direct deletion. Unset uses each pod's own `terminationGracePeriodSeconds`.

The operator also skips pods that are already terminating, pods that tolerate
the `node.kubernetes.io/unschedulable` taint, mirror/static pods, and pods in
`kube-system`. These exclusions are not user-configurable.

Compared to earlier releases, the default drain filter now follows Kubernetes
matching more closely: the unschedulable toleration check uses Kubernetes
`ToleratesTaint` semantics, DaemonSet pods are identified from the controller
owner reference, and already-terminating or mirror/static pods are ignored.

`podNonInterruptLabels` remains a pre-drain barrier. Matching pods must finish
or move away before the operator starts the configurable drain step.

### Recovering From a Drain Timeout

When `spec.drainConfig.timeout` expires, the operator records a `DrainTimeout`
warning event, marks the node and Skyhook `erroring`, and leaves the node
cordoned. The operator stops issuing further evict/delete actions while the
blocking condition remains, so package stages do not proceed on that node.

To recover, remove the underlying blocker first, such as a PDB with zero allowed
disruptions, an unmanaged pod when `force: false`, or an `emptyDir` pod when
`deleteEmptyDirData: false`. Then reset the failed rollout metadata:

```bash
kubectl skyhook reset <skyhook-name> --confirm
```

For a single node, use:

```bash
kubectl skyhook node reset <node-name> --skyhook <skyhook-name> --confirm
```

If the blocker clears after the timeout without a reset, a later reconcile can
observe the node as drained and continue from current cluster state. Reset is
still the recommended recovery workflow in production because it explicitly
clears the `erroring` status, drain-start metadata, cordon metadata, and batch
state before retrying. If the blocker is still present after reset, the drain
will time out again.

## Best Practices

- Always test interrupt-enabled packages in non-production environments first
Expand Down
138 changes: 138 additions & 0 deletions k8s-tests/chainsaw/skyhook/drain-config/chainsaw-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
metadata:
name: drain-config
labels:
pool: interrupt
spec:
Comment thread
coderabbitai[bot] marked this conversation as resolved.
timeouts:
assert: 240s
exec: 90s
catch:
- get:
apiVersion: v1
kind: Node
selector: skyhook.nvidia.com/test-node=skyhooke2e
format: yaml
- get:
apiVersion: skyhook.nvidia.com/v1alpha1
kind: Skyhook
name: drain-config-disable-eviction
namespace: skyhook
format: yaml
- get:
apiVersion: skyhook.nvidia.com/v1alpha1
kind: Skyhook
name: drain-config-timeout
namespace: skyhook
format: yaml
- get:
apiVersion: v1
kind: Pod
namespace: skyhook
selector: app in (drain-config-pdb,drain-config-blocker)
format: yaml
steps:
- name: disable-eviction-bypasses-pdb
description: Verify disableEviction deletes through a zero-disruption PDB and lets the interrupt complete
try:
- script:
content: |
../skyhook-cli reset drain-config-disable-eviction --confirm 2>/dev/null || true
kubectl -n skyhook delete skyhook drain-config-disable-eviction --ignore-not-found --wait=false
kubectl -n skyhook delete deployment drain-config-pdb --ignore-not-found --wait=false
kubectl -n skyhook delete pdb drain-config-pdb --ignore-not-found
kubectl -n skyhook delete configmap drain-config-original-pod --ignore-not-found
kubectl -n skyhook delete pod -l app=drain-config-pdb --ignore-not-found --wait=false
- apply:
file: disable-eviction.yaml
- script:
content: |
kubectl -n skyhook rollout status deployment/drain-config-pdb --timeout=60s
pod="$(kubectl -n skyhook get pod -l app=drain-config-pdb -o jsonpath='{.items[0].metadata.name}')"
kubectl -n skyhook wait --for=condition=Ready "pod/${pod}" --timeout=30s
kubectl -n skyhook create configmap drain-config-original-pod --from-literal=name="${pod}" --dry-run=client -o yaml | kubectl apply -f -
kubectl -n skyhook wait --for=jsonpath='{.status.disruptionsAllowed}'=0 pdb/drain-config-pdb --timeout=60s
- apply:
file: disable-eviction-skyhook.yaml
- script:
content: |
kubectl -n skyhook wait --for=condition=Ready skyhook/drain-config-disable-eviction --timeout=180s
- script:
content: |
pod="$(kubectl -n skyhook get configmap drain-config-original-pod -o jsonpath='{.data.name}')"
if kubectl -n skyhook get pod "${pod}" >/dev/null 2>&1; then
echo "expected original PDB-protected pod ${pod} to be deleted directly"
exit 1
fi
finally:
- script:
content: |
../skyhook-cli reset drain-config-disable-eviction --confirm 2>/dev/null || true
kubectl -n skyhook delete skyhook drain-config-disable-eviction --ignore-not-found --wait=false
kubectl -n skyhook delete deployment drain-config-pdb --ignore-not-found --wait=false
kubectl -n skyhook delete pdb drain-config-pdb --ignore-not-found
kubectl -n skyhook delete configmap drain-config-original-pod --ignore-not-found
kubectl -n skyhook delete pod -l app=drain-config-pdb --ignore-not-found --wait=false
kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o name | while read -r node; do kubectl uncordon "${node}" || true; done
- name: timeout-surfaces-erroring
description: Verify timeout marks the node erroring and emits a drain event when drain cannot proceed
try:
- script:
content: |
../skyhook-cli reset drain-config-timeout --confirm 2>/dev/null || true
kubectl -n skyhook delete skyhook drain-config-timeout --ignore-not-found --wait=false
kubectl -n skyhook delete pod drain-config-blocker --ignore-not-found --wait=false
- apply:
file: timeout.yaml
- wait:
apiVersion: v1
kind: Pod
name: drain-config-blocker
namespace: skyhook
timeout: 30s
for:
condition:
name: Ready
value: 'true'
- script:
content: |
kubectl -n skyhook wait --for=jsonpath='{.status.status}'=erroring skyhook/drain-config-timeout --timeout=180s
- assert:
file: timeout-assert.yaml
- script:
content: |
for _ in $(seq 1 30); do
events="$(kubectl get events -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.reason}{"\t"}{.action}{"\t"}{.involvedObject.kind}{"\t"}{.involvedObject.name}{"\t"}{.regarding.kind}{"\t"}{.regarding.name}{"\t"}{.message}{"\t"}{.note}{"\n"}{end}' 2>/dev/null || true)"
if printf "%s\n" "${events}" | grep -F "Drain" | grep -F "drain-config-timeout" | grep -F "drain timed out"; then
exit 0
fi
sleep 2
done
printf "%s\n" "${events}"
echo "expected drain timeout event for drain-config-timeout"
exit 1
Comment thread
coderabbitai[bot] marked this conversation as resolved.
finally:
- script:
content: |
../skyhook-cli reset drain-config-timeout --confirm 2>/dev/null || true
kubectl -n skyhook delete skyhook drain-config-timeout --ignore-not-found --wait=false
kubectl -n skyhook delete pod drain-config-blocker --ignore-not-found --wait=false
kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o name | while read -r node; do kubectl uncordon "${node}" || true; done
Loading
Loading