diff --git a/.github/actions/setup/action.yaml b/.github/actions/setup/action.yaml new file mode 100644 index 0000000..e52c75d --- /dev/null +++ b/.github/actions/setup/action.yaml @@ -0,0 +1,13 @@ +name: "Setup" +description: "install dependencies" +runs: + using: composite + steps: + - uses: docker/setup-qemu-action@v3 + with: + platforms: amd64,arm64 + - uses: docker/setup-buildx-action@v3 + - uses: actions/setup-go@v6 + with: + go-version-file: go.mod + cache: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f262e58..793a94f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,22 +1,16 @@ name: CI on: - push: - branches: [ main ] pull_request: - branches: [ main ] - release: - types: [ created ] + branches: [main] jobs: - build-lint: + ci: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 - - uses: actions/setup-go@v6 - with: - go-version: '1.25' - - name: CI - run: make ci - - name: Build - run: make build + - uses: actions/checkout@v6 + - uses: ./.github/actions/setup + - name: CI + run: make ci + - name: Build + run: make build diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 0000000..5afcc4d --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,40 @@ +name: e2e + +on: + push: + branches: ["feature/*"] + # pull_request: + # branches: [main] + +jobs: + e2e: + permissions: + contents: read + timeout-minutes: 60 + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: ./.github/actions/setup + + - name: Install kubectl + run: | + set -euo pipefail + ARCH=amd64 + VERSION=v1.30.2 + curl -sSL -o kubectl https://dl.k8s.io/release/${VERSION}/bin/linux/${ARCH}/kubectl + sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl + + - name: Install kind + run: | + set -euo pipefail + KIND_VERSION=v0.31.0 + curl -sSL -o kind https://kind.sigs.k8s.io/dl/${KIND_VERSION}/kind-linux-amd64 + sudo install -o root -g root -m 0755 kind /usr/local/bin/kind + + - name: Run limit end-to-end tests + env: + GOCACHE: /tmp/go-build + run: make e2e + + - name: Delete cluster + run: /usr/local/bin/kind delete cluster diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 21f6f0e..aee2457 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,7 +3,7 @@ name: Release on: push: tags: - - 'v*.*.*' + - "v*.*.*" permissions: contents: write @@ -14,52 +14,42 @@ env: ONLINE_REGISTER_USER: ${{ github.actor }} ONLINE_REGISTER_PASSWORD: ${{ secrets.PACKAGE_TOKEN }} - jobs: release: if: github.repository == 'kcrow-io/plugins' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v6 - - uses: actions/setup-go@v6 - with: - go-version: '1.25' - - - name: Set up QEMU - uses: docker/setup-qemu-action@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v4 - - - name: Login to online register - uses: docker/login-action@v4 - with: - username: ${{ env.ONLINE_REGISTER_USER }} - password: ${{ env.ONLINE_REGISTER_PASSWORD }} - registry: ${{ env.ONLINE_REGISTER }} - - - name: Build and Push Image - uses: docker/build-push-action@v7 - id: docker_build_release - with: - context: . - file: Dockerfile - push: true - provenance: false - platforms: ${{ env.BUILD_PLATFORM }} - tags: | - ${{ env.ONLINE_REGISTER }}/${{ github.repository }}:${{ github.ref_name }}, - ${{ env.ONLINE_REGISTER }}/${{ github.repository }}:latest - build-args: | - GIT_COMMIT_VERSION=${{ env.commitver }} - GIT_COMMIT_TIME=${{ env.committime }} - VERSION=${{ github.ref_name }} - - - name: Run GoReleaser - uses: goreleaser/goreleaser-action@v7 - with: - distribution: goreleaser - version: '~> v2' - args: release --clean - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file + - uses: actions/checkout@v6 + - uses: ./.github/actions/setup + - name: Login to online register + uses: docker/login-action@v3 + with: + username: ${{ env.ONLINE_REGISTER_USER }} + password: ${{ env.ONLINE_REGISTER_PASSWORD }} + registry: ${{ env.ONLINE_REGISTER }} + + - name: Build and Push Image + uses: docker/build-push-action@v6 + id: docker_build_release + with: + context: . + file: Dockerfile + push: true + provenance: false + platforms: ${{ env.BUILD_PLATFORM }} + tags: | + ${{ env.ONLINE_REGISTER }}/${{ github.repository }}:${{ github.ref_name }}, + ${{ env.ONLINE_REGISTER }}/${{ github.repository }}:latest + build-args: | + GIT_COMMIT_VERSION=${{ env.commitver }} + GIT_COMMIT_TIME=${{ env.committime }} + VERSION=${{ github.ref_name }} + + - name: Run GoReleaser + uses: goreleaser/goreleaser-action@v6 + with: + distribution: goreleaser + version: "~> v2" + args: release --clean + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index dce0f41..73b2a2d 100644 --- a/.gitignore +++ b/.gitignore @@ -99,5 +99,7 @@ test/e2edebugLog.txt .plan* .claude* +plans/ + out/ bin/ \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 6f04b5f..5639fc3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,289 +1,139 @@ ## Project Overview -This is a Go-based NRI (Node Resource Interface) plugins collection for extending containerd's container runtime capabilities. The project provides specialized plugins that can modify container behavior at runtime through the containerd NRI framework. +This is a collection of NRI (Node Resource Interface) plugins for containerd, written in Go. The plugins extend containerd's container runtime capabilities for resource management, including memory limits, I/O throttling, and resource limit escaping. -## Development Commands +## Build Commands -### Building ```bash -# Build all plugins for all platforms (linux/amd64, linux/arm64) +# Build all plugins (formats and tidies first) make build -# Build specific plugin for specific platform -GOOS=linux GOARCH=amd64 go build -o bin/linux/amd64/limit ./cmd/limit +# Format code +make fmt -# Build with race detection (requires CGO) -RACE=1 make build +# Tidy dependencies +make tidy -# Build without optimization (for debugging) -NOOPT=1 make build +# Run linter +make lint -# Build without stripping symbols -NOSTRIP=1 make build +# Run all unit tests +make test + +# Run specific test +go test ./pkg/plugins/memory/... + +# Run single test function +go test -run TestFunctionName ./path/to/package + +# Run e2e tests (requires docker, kind, kubectl) +make e2e + +# Avoid test caching during iteration +GOFLAGS='-count=1' make e2e + +# Run vulnerability scan +make govulncheck + +# Run full CI suite (lint + test + govulncheck) +make ci # Build container image make image ``` -### Testing and Linting +## Build Options + ```bash -# Run all linting, tests, and vulnerability checks -make ci +# Enable race detection (requires CGO) +RACE=1 make build -# Run Go linting (uses golangci-lint) -make lint +# Keep debug symbols (no stripping) +NOSTRIP=1 make build -# Run tests -make test +# Disable optimizations (for debugging) +NOOPT=1 make build -# Run tests with module mode -go test -mod=mod ./... +# Build for specific platform +BUILD_PLATFORMS=linux/arm64 make build +``` -# Run tests for specific package -go test -mod=mod ./pkg/plugins/limit/... +## Architecture -# Run tests with race detection -RACE=1 go test -mod=mod ./... +### Plugin System -# Run tests with verbose output -go test -v -mod=mod ./... +All plugins implement the NRI protocol via `github.com/containerd/nri/pkg/stub`. The common pattern: -# Run vulnerability detection -make govulncheck -``` +1. Each plugin implements the `Pluginer` interface (`pkg/plugins/pluginer.go`) +2. Plugin main.go calls `plugins.RunStub()` with the plugin instance +3. Plugins communicate with containerd via the NRI socket +4. Configuration is loaded from JSON files via the `Configer` interface -### Cross-compilation -The build system supports cross-compilation for: -- `linux/amd64` -- `linux/arm64` +### Directory Structure -Binaries are output to `bin/{platform}/` directories. +- `cmd/` - Plugin binaries (memory, limit, escape, override, example) +- `pkg/plugins/` - Plugin implementations +- `pkg/cgroup/` - cgroup v2 utilities +- `pkg/containerd/` - containerd client and watcher +- `pkg/pool/` - Worker pool for concurrent operations +- `pkg/log/` - Logging utilities +- `integration/` - KinD-based e2e tests -Build flags are controlled via `Makefile.defs`: -- `RACE=1`: Enable race detection (requires CGO) -- `NOOPT=1`: Disable optimizations for debugging -- `NOSTRIP=1`: Keep debug symbols -- `LOCKDEBUG=1`: Enable lock debugging +### Plugin Execution Order -## Architecture +Plugins are named with numeric prefixes that determine execution order: +- `06-memory` (START_NUM=06 in Makefile) +- `07-limit` +- `08-escape` -### Plugin System -- **Common Interface**: All plugins implement the `Pluginer` interface in `pkg/plugins/pluginer.go:22` -- **NRI Integration**: Uses containerd's NRI stub for plugin lifecycle management via `plugins.RunStub()` -- **Configuration**: Plugins support JSON-based configuration via the `Configer` interface (pkg/plugins/pluginer.go:17) -- **Annotation-based Control**: Plugins respond to container annotations with prefix `io.kcrow.` (defined in pkg/plugins/pluginer.go:13) - -### Plugin Lifecycle -1. Each plugin's `main.go` creates a new plugin instance (e.g., `limitplugin.New()`) -2. `plugins.RunStub()` initializes the NRI stub and starts the plugin -3. The plugin registers with containerd's NRI framework -4. Container lifecycle events trigger plugin callbacks (CreateContainer, UpdateContainer, etc.) -5. Plugins read configuration from `/opt/nri/conf/.conf` or `/etc/nri/conf.d/.conf` - -### Available Plugins - -1. **Override Plugin** (`cmd/override/`, `pkg/plugins/override/`) - - Overrides container configurations according to OCI spec config files - - Handles rlimit settings, hooks, and runtime parameters - -2. **Escape Plugin** (`cmd/escape/`, `pkg/plugins/escape/`) - - Allows containers to escape resource limits (CPU, memory) - - Controlled via annotation: `io.kcrow.escape: cpu,memory` - -3. **Memory Plugin** (`cmd/memory/`, `pkg/plugins/memory/`) - - Automatically sets `memory.high` to a percentage of container's memory limit - - Supports namespace filtering (include/exclude lists) - - Configurable high percentage (default: 80%) - -4. **Limit Plugin** (`cmd/limit/`, `pkg/plugins/limit/`) - - Monitors container disk usage and automatically applies I/O bandwidth limits - - Also monitors memory cache/RSS ratio and clears cache when needed - - **I/O Limiting**: Applies limits when disk usage exceeds `max_disk_bytes` threshold - - **Memory Management**: Clears cache when: - - Container memory usage/limit ratio exceeds `pods-usage-percent` (default: 80%) - - Cache is greater than `min-cache-bytes` (default: 512MB) - - Cache/RSS ratio exceeds `cache-rss-ratio` (default: 10) - - Supports both cgroup v1 and v2 - - Uses containerd client to watch container stats at configurable intervals - - Configuration in `pkg/plugins/limit/config.go:82` - -### Key Directories -- `cmd/`: Main entry points for each plugin (one subdirectory per plugin) -- `pkg/plugins/`: Plugin implementations - - `pkg/plugins/pluginer.go`: Core interfaces (`Pluginer`, `Configer`) - - `pkg/plugins/limit/`, `memory/`, `escape/`, `override/`: Individual plugin implementations -- `pkg/containerd/`: Containerd client and watcher utilities -- `pkg/cgroup/`: Cgroup utilities for v1/v2 detection and path normalization -- `pkg/log/`: Structured logging setup -- `deploy/`: Kubernetes deployment manifests -- `docs/`: Plugin-specific documentation - -### Cgroup Handling -The project includes sophisticated cgroup path handling in `pkg/cgroup/`: -- **Version Detection**: Automatically detects cgroup v1 vs v2 (cached for performance) -- **Systemd Path Conversion**: Converts systemd-style paths (with colons) to filesystem paths -- **Path Normalization**: Handles both systemd and cgroupfs drivers -- **PID-based Path Resolution**: Reads actual cgroup paths from `/proc/[pid]/cgroup` for reliability in Kubernetes - -### Build Configuration -- **GoReleaser**: Creates DEB/RPM packages, installs to `/opt/nri/bin` - - Configuration in `.goreleaser.yml` - - Builds 4 plugins: override, escape, memory, limit - - Supports linux/amd64 and linux/arm64 -- **Multi-stage Dockerfile**: Supports multi-arch builds -- **Makefile system**: Uses `Makefile.defs` for build configuration with support for CGO, race detection, and debug builds - - `START_NUM` in Makefile controls plugin numbering (default: 06) - - Plugins are numbered sequentially (06-memory, 07-limit, 08-escape) - - Build output: `bin/{platform}/{number}-{plugin-name}` - -### Module Management -```bash -# Download dependencies -make download +### Key Conventions -# Tidy go.mod and go.sum -make tidy +- **Annotation prefix**: `io.kcrow.` (e.g., `io.kcrow.escape=cpu,memory`) +- **Config location**: `/opt/nri/conf/` or `/etc/nri/conf.d/` +- **Binary location**: `/opt/nri/plugins/` +- **Config format**: JSON files named `.conf` -# Format code -make fmt -``` +## Plugin Implementations -## Installation and Deployment +### memory +Automatically sets `memory.high` to a percentage of container's memory limit for better memory management. -### Package Installation -```bash -# Debian/Ubuntu -sudo dpkg -i nri-plugins_*.deb +### escape +Allows container's main process to escape resource limits based on annotation `io.kcrow.escape: cpu,memory`. -# RHEL/CentOS -sudo rpm -ivh nri-plugins_*.rpm -``` +### limit +Monitors container disk usage and applies I/O bandwidth limits when thresholds are exceeded. Also monitors memory cache/RSS ratio and clears cache under memory pressure. -### Containerd Configuration -```bash -# Enable NRI in containerd -sudo mkdir -p /etc/containerd/conf.d -echo 'disabled_plugins = []' | sudo tee /etc/containerd/conf.d/enable-nri.toml -sudo systemctl restart containerd -``` +### override +Overrides container configurations according to ocispec config file, including rlimit settings and hooks. -### Kubernetes Deployment -Use the DaemonSet in `deploy/daemonset.yml` for cluster-wide deployment. +## Testing -## Testing Plugins +### Unit Tests +Located in `pkg/` subdirectories. Run with `make test` or `go test ./...`. -```bash -# Test escape plugin -sudo ctr run --rm --runtime io.containerd.runc.v2 \ - --annotation io.kcrow.escape=cpu,memory \ - docker.io/library/alpine:latest test - -# Test memory plugin (requires container with memory limit) -sudo ctr run --rm --runtime io.containerd.runc.v2 \ - --memory 1073741824 \ - docker.io/library/alpine:latest test-memory - -# Test limit plugin (check that I/O limits are applied when disk usage exceeds threshold) -# Note: This requires the container to actually use disk space beyond the configured threshold -sudo ctr run --rm --runtime io.containerd.runc.v2 \ - docker.io/library/alpine:latest test-limit sh -c "dd if=/dev/zero of=/tmp/testfile bs=1M count=5000" -``` +### E2E Tests +Located in `integration/`. These provision a KinD cluster, install plugins, and run stress workloads. -## Code Patterns - -### Plugin Implementation -1. Implement the `Pluginer` interface (requires `Name() string` method) -2. Optionally implement the `Configer` interface for configuration support -3. Use `plugins.RunStub()` to start the NRI stub in `main()` -4. Handle container lifecycle events through NRI callbacks (CreateContainer, UpdateContainer, RemoveContainer, etc.) -5. Use structured logging with logrus via `pkg/log` package - -Example plugin structure: -```go -type MyPlugin struct { - stub stub.Stub - config *Config -} - -func (p *MyPlugin) Name() string { - return "my-plugin" -} - -func main() { - if err := plugins.RunStub(New()); err != nil { - log.G(context.TODO()).WithError(err).Fatal("Failed to run plugin") - os.Exit(1) - } -} -``` +**Prerequisites**: docker, kind, kubectl in PATH -### Configuration Management -- Plugins use JSON-based configuration files -- Configuration files are typically located in `/opt/nri/conf/` or `/etc/nri/conf.d/` -- Implement `ReadFrom(io.Reader)` and `WriteTo(io.Writer)` from the `Configer` interface -- Use `Parse()` method to parse and validate configuration -- Configuration is read during plugin initialization - -**Limit Plugin Configuration Structure** (`pkg/plugins/limit/config.go`): -```json -{ - "disabled": false, - "containerd_config_path": "/etc/containerd/config.toml", - "watch_interval": 60, - "io": { - "max_disk_bytes": 10737418240, - "bps_limit": 4194304, - "iops_limit": 10 - }, - "memory": { - "pods-usage-percent": 80, - "cache-rss-ratio": 10.0, - "min-cache-bytes": 536870912 - }, - "log_path": "/var/log/nri-limit.log" -} -``` +The e2e tests verify: +- Disk I/O throttling (looks for "Applied io limit" in logs) +- Memory cache clearing (looks for "memory exceeds" in logs) + +## Development Notes + +- **CGO**: Disabled by default (`CGO_ENABLED=0`) for static binaries +- **Cross-compilation**: Supports linux/amd64 and linux/arm64 +- **Linting**: Uses golangci-lint v2 (imported as Go module dependency) +- **Build tags**: `osusergo` (always), `lockdebug` (if LOCKDEBUG set) +- **Binary stripping**: Enabled by default (use NOSTRIP=1 to disable) + +## Adding a New Plugin -Key configuration fields: -- `io.max_disk_bytes`: Disk usage threshold for applying I/O limits (0 = disabled) -- `io.bps_limit`: Bandwidth limit in bytes per second -- `io.iops_limit`: IOPS limit -- `memory.pods-usage-percent`: Memory usage/limit percentage threshold (0 = disabled) -- `memory.cache-rss-ratio`: Minimum cache/RSS ratio to trigger cache clearing -- `memory.min-cache-bytes`: Minimum cache size before clearing - -### Annotation Processing -- All plugin annotations use the prefix `io.kcrow.` (defined in `pkg/plugins/pluginer.go:13`) -- Annotations are comma-separated values (separator defined in `pkg/plugins/pluginer.go:14`) -- Example: `io.kcrow.escape: cpu,memory` - -### Cgroup Operations -When working with cgroups: -1. Use `cgroup.DetectCgroupVersion()` to detect v1 vs v2 (cached, safe for concurrent use) -2. Use `cgroup.NormalizeCgroupPath()` to handle systemd vs cgroupfs paths -3. Use `cgroup.GetCgroupPathFromPid()` to get the actual cgroup path from a process PID -4. Be aware that Kubernetes adds additional parent slices (e.g., `kubelet.slice/kubepods-burstable.slice/`) - -### Containerd Integration -The `pkg/containerd/` package provides: -- **Containerd Client**: Connect to containerd socket and interact with containers -- **Config Parsing**: Parse containerd config to get root directory and socket path (`containerd.ParseContainerdConfig()`) -- **Container Watcher**: Watch container stats (disk usage, memory usage) at configurable intervals - - Watcher runs in background goroutine, polling containers at specified intervals - - Calls custom callback function for each container (e.g., `checkContainer` in limit plugin) - - Used by limit plugin to periodically check and apply/remove limits based on usage - -### Limit Plugin Architecture -The limit plugin (`pkg/plugins/limit/`) has two main monitoring functions: - -1. **I/O Limiting** (`limitblkio` in `limit_linux.go:94`): - - Monitors disk usage via containerd snapshot service - - Applies I/O limits (BPS and IOPS) when usage exceeds threshold - - Removes limits when usage drops below threshold - - Tracks limited containers in memory map to avoid redundant operations - -2. **Memory Cache Management** (`clearCache` in `limit_linux.go:170`): - - Monitors container memory usage/limit ratio - - Checks cache/RSS ratio from cgroup memory.stat - - Clears cache when all conditions are met (usage percent, min cache bytes, cache/RSS ratio) - - Uses `memory.force_empty` or similar mechanisms to reclaim cache \ No newline at end of file +1. Create plugin implementation in `pkg/plugins//` +2. Implement the NRI stub interface methods +3. Create `cmd//main.go` that calls `plugins.RunStub()` +4. Add plugin to `BIN_SUBDIRS` in Makefile +5. Create configuration struct implementing `Configer` interface +6. Add documentation in `docs/.md` diff --git a/Dockerfile b/Dockerfile index cec1da5..2cdeba7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ ENV GOPROXY=https://goproxy.cn,direct RUN BUILD_PLATFORMS=$TARGETPLATFORM make build -FROM gcr.io/distroless/static:nonroot +FROM busybox:1.36.1 ARG TARGETPLATFORM diff --git a/Makefile b/Makefile index 6139f40..27301f1 100644 --- a/Makefile +++ b/Makefile @@ -62,3 +62,7 @@ test: ## Runs all tests @go test $(ARGS) ./... ci: lint test govulncheck ## Executes vulnerability scan, lint, test and generates reports + +.PHONY: e2e +e2e: ## Runs KinD-based end-to-end tests (requires docker, kind, kubectl) + @go test -v -tags e2e ./integration/... diff --git a/README.md b/README.md index e7d9329..e4ae8d8 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,25 @@ sudo ctr run --rm --runtime io.containerd.runc.v2 \ docker.io/library/alpine:latest test ``` +## Integration Tests + +The `integration/` suite provisions a disposable KinD cluster, installs the limit plugin, and runs stress workloads to ensure disk I/O throttling and cache eviction paths behave as expected. + +**Prerequisites** + +- Docker engine (with BuildKit enabled) +- [kind](https://kind.sigs.k8s.io/) and `kubectl` binaries in `PATH` + +**Run locally** + +```bash +make e2e +``` + +The tests will build a local limit plugin image, load it into KinD, and assert on plugin logs for I/O throttling (`Applied io limit`) and memory cache clearing (`memory exceeds`). Use `GOFLAGS='-count=1' make e2e` when iterating to avoid test caching. + +These checks also run in CI via `.github/workflows/e2e.yml`. + ## Dependency Management This project uses automated dependency management through GitHub Actions and Dependabot to keep dependencies up-to-date and secure. diff --git a/go.mod b/go.mod index 10db20f..9a56536 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/kcrow-io/plugins -go 1.25.0 +go 1.26 exclude google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013 diff --git a/integration/helpers/exec.go b/integration/helpers/exec.go new file mode 100644 index 0000000..158c2c1 --- /dev/null +++ b/integration/helpers/exec.go @@ -0,0 +1,66 @@ +package helpers + +import ( + "bytes" + "context" + "fmt" + "os/exec" + "strings" + "time" +) + +// Command represents an external command invocation with contextual logging. +type Command struct { + Name string + Args []string + Env []string + Timeout time.Duration + Stdin []byte +} + +// Run executes the configured command and returns stdout/stderr or error. +func (c Command) Run(ctx context.Context) (string, string, error) { + if c.Name == "" { + return "", "", fmt.Errorf("command name is required") + } + + ctxToUse := ctx + var cancel context.CancelFunc + if c.Timeout > 0 { + ctxToUse, cancel = context.WithTimeout(ctx, c.Timeout) + } else { + ctxToUse, cancel = context.WithCancel(ctx) + } + defer cancel() + + cmd := exec.CommandContext(ctxToUse, c.Name, c.Args...) + cmd.Env = append(cmd.Env, c.Env...) + + var stdoutBuf, stderrBuf bytes.Buffer + cmd.Stdout = &stdoutBuf + cmd.Stderr = &stderrBuf + if len(c.Stdin) > 0 { + cmd.Stdin = bytes.NewReader(c.Stdin) + } + + err := cmd.Run() + // When the context was canceled/timeout, surface a more descriptive error. + if ctxToUse.Err() != nil && err == context.DeadlineExceeded { + return stdoutBuf.String(), stderrBuf.String(), fmt.Errorf("%s timed out after %s", c.String(), c.Timeout) + } + + return stdoutBuf.String(), stderrBuf.String(), err +} + +// String returns a shell-like representation of the command. +func (c Command) String() string { + parts := []string{c.Name} + for _, arg := range c.Args { + if strings.ContainsAny(arg, " \t") { + parts = append(parts, fmt.Sprintf("%q", arg)) + continue + } + parts = append(parts, arg) + } + return strings.Join(parts, " ") +} diff --git a/integration/helpers/exec_test.go b/integration/helpers/exec_test.go new file mode 100644 index 0000000..7e2fe83 --- /dev/null +++ b/integration/helpers/exec_test.go @@ -0,0 +1,40 @@ +package helpers + +import ( + "context" + "strings" + "testing" + "time" +) + +func TestCommandRunSuccess(t *testing.T) { + cmd := Command{ + Name: "sh", + Args: []string{"-c", "echo hello"}, + } + stdout, stderr, err := cmd.Run(context.Background()) + if err != nil { + t.Fatalf("expected success, got err: %v (stderr=%s)", err, stderr) + } + if strings.TrimSpace(stdout) != "hello" { + t.Fatalf("unexpected stdout: %q", stdout) + } + if stderr != "" { + t.Fatalf("expected empty stderr, got %q", stderr) + } +} + +func TestCommandRunTimeout(t *testing.T) { + cmd := Command{ + Name: "sh", + Args: []string{"-c", "sleep 2"}, + Timeout: 250 * time.Millisecond, + } + _, _, err := cmd.Run(context.Background()) + if err == nil { + t.Fatalf("expected timeout error") + } + if !strings.Contains(err.Error(), "timed out") && !strings.Contains(err.Error(), "signal: killed") { + t.Fatalf("expected timeout error message, got %v", err) + } +} diff --git a/integration/kindenv/kindenv.go b/integration/kindenv/kindenv.go new file mode 100644 index 0000000..de52a01 --- /dev/null +++ b/integration/kindenv/kindenv.go @@ -0,0 +1,134 @@ +package kindenv + +import ( + "context" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/kcrow-io/plugins/integration/helpers" +) + +// Cluster wraps a KinD cluster and related helpers such as kubeconfig path. +type Cluster struct { + Name string + Kubeconfig string +} + +// CreateCluster provisions a KinD cluster with the provided configuration. +func CreateCluster(ctx context.Context, name, configPath string) (*Cluster, error) { + if name == "" { + name = "nri-limit-e2e" + } + if err := ensureBinary("kind"); err != nil { + return nil, err + } + + config := `kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +networking: + ipFamily: ipv4` + + args := []string{"create", "cluster", "-v7", "--wait", "5m", "--retain", "--name", name, "--config", "-"} + + cmd := helpers.Command{ + Name: "kind", + Args: args, + Timeout: 5 * time.Minute, + Stdin: []byte(config), + } + if _, stderr, err := cmd.Run(ctx); err != nil { + return nil, fmt.Errorf("create cluster: %w (stderr: %s)", err, stderr) + } + + kubeconfig, err := kubeconfigFor(ctx, name) + if err != nil { + return nil, err + } + + return &Cluster{ + Name: name, + Kubeconfig: kubeconfig, + }, nil +} + +// Destroy tears down the cluster. +func (c *Cluster) Destroy(ctx context.Context) error { + if c == nil || c.Name == "" { + return nil + } + cmd := helpers.Command{ + Name: "kind", + Args: []string{"delete", "cluster", "--name", c.Name}, + Timeout: 2 * time.Minute, + } + _, stderr, err := cmd.Run(ctx) + if err != nil { + return fmt.Errorf("delete cluster: %w (stderr: %s)", err, stderr) + } + return nil +} + +// Kubectl invokes kubectl with the cluster kubeconfig. +func (c *Cluster) Kubectl(ctx context.Context, args ...string) (string, string, error) { + if err := ensureBinary("kubectl"); err != nil { + return "", "", err + } + cmd := helpers.Command{ + Name: "kubectl", + Args: append([]string{"--kubeconfig", c.Kubeconfig}, args...), + Timeout: 2 * time.Minute, + } + return cmd.Run(ctx) +} + +func kubeconfigFor(ctx context.Context, name string) (string, error) { + cmd := helpers.Command{ + Name: "kind", + Args: []string{"get", "kubeconfig", "--name", name}, + Timeout: 30 * time.Second, + } + stdout, stderr, err := cmd.Run(ctx) + if err != nil { + return "", fmt.Errorf("get kubeconfig: %w (stderr: %s)", err, stderr) + } + + dir, err := os.MkdirTemp("", "kind-kubeconfig-*") + if err != nil { + return "", fmt.Errorf("create kubeconfig temp dir: %w", err) + } + path := filepath.Join(dir, "config") + if err := os.WriteFile(path, []byte(stdout), 0o600); err != nil { + return "", fmt.Errorf("write kubeconfig: %w", err) + } + return path, nil +} + +func ensureBinary(name string) error { + if _, err := exec.LookPath(name); err != nil { + return fmt.Errorf("%s binary is required in PATH: %w", name, err) + } + return nil +} + +// ApplyManifest applies the provided manifest string against the cluster. +func (c *Cluster) ApplyManifest(ctx context.Context, manifest string) error { + if strings.TrimSpace(manifest) == "" { + return errors.New("manifest cannot be empty") + } + cmd := helpers.Command{ + Name: "kubectl", + Args: []string{"--kubeconfig", c.Kubeconfig, "apply", "-f", "-"}, + Stdin: []byte(manifest), + Timeout: 2 * time.Minute, + } + stdout, stderr, err := cmd.Run(ctx) + if err != nil { + return fmt.Errorf("kubectl apply failed: %w (stderr: %s, stdout: %s)", err, stderr, stdout) + } + return nil +} diff --git a/integration/kindenv/kindenv_test.go b/integration/kindenv/kindenv_test.go new file mode 100644 index 0000000..383329b --- /dev/null +++ b/integration/kindenv/kindenv_test.go @@ -0,0 +1,21 @@ +package kindenv + +import ( + "os/exec" + "strings" + "testing" +) + +func TestEnsureBinaryMissing(t *testing.T) { + name := "definitely-missing-binary" + if _, err := exec.LookPath(name); err == nil { + t.Skipf("%s unexpectedly present in PATH", name) + } + err := ensureBinary(name) + if err == nil { + t.Fatalf("expected error when %s binary missing", name) + } + if !strings.Contains(err.Error(), name) { + t.Fatalf("expected error to mention binary name, got %v", err) + } +} diff --git a/integration/kindenv/kubectl.go b/integration/kindenv/kubectl.go new file mode 100644 index 0000000..1129c4c --- /dev/null +++ b/integration/kindenv/kubectl.go @@ -0,0 +1,37 @@ +package kindenv + +import ( + "context" + "fmt" + "time" +) + +// RolloutStatus waits for a resource rollout to finish. +func (c *Cluster) RolloutStatus(ctx context.Context, resource string, timeout time.Duration) error { + if timeout == 0 { + timeout = 2 * time.Minute + } + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + args := []string{"rollout", "status", resource, "-n", "kube-system", "--timeout", timeout.String()} + if _, stderr, err := c.Kubectl(ctx, args...); err != nil { + return fmt.Errorf("kubectl rollout status failed: %w (stderr: %s)", err, stderr) + } + return nil +} + +// WaitForJobCompletion waits for the named Job to complete. +func (c *Cluster) WaitForJobCompletion(ctx context.Context, namespace, job string, timeout time.Duration) error { + if timeout == 0 { + timeout = 5 * time.Minute + } + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + args := []string{"wait", "--for=condition=complete", fmt.Sprintf("job/%s", job), "-n", namespace, "--timeout", timeout.String()} + if _, stderr, err := c.Kubectl(ctx, args...); err != nil { + return fmt.Errorf("wait for job completion failed: %w (stderr: %s)", err, stderr) + } + return nil +} diff --git a/integration/limit_e2e_test.go b/integration/limit_e2e_test.go new file mode 100644 index 0000000..90189ff --- /dev/null +++ b/integration/limit_e2e_test.go @@ -0,0 +1,287 @@ +//go:build e2e + +package integration + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/kcrow-io/plugins/integration/helpers" + "github.com/kcrow-io/plugins/integration/kindenv" +) + +const ( + diskJobName = "limit-disk-stress" + memoryJobName = "limit-memory-stress" +) + +func TestLimitPluginE2E(t *testing.T) { + ctx := context.Background() + requireCommands(t, "docker", "kind", "kubectl") + + manifestDir := manifestsDir(t) + + // Build binary using make build + binPath, confPath := buildLimitBinary(t, ctx) + + cluster, err := kindenv.CreateCluster(ctx, "limit-e2e", filepath.Join(manifestDir, "kind-config.yaml")) + if err != nil { + t.Fatalf("failed to create KinD cluster: %v", err) + } + t.Cleanup(func() { + cluster.Destroy(context.Background()) + }) + + // Install plugin directly into kind nodes + if err := installPluginIntoNodes(ctx, cluster.Name, binPath, confPath); err != nil { + t.Fatalf("failed to install plugin into nodes: %v", err) + } + + applyJobAndWait(t, ctx, cluster, filepath.Join(manifestDir, "stress-disk-job.yaml"), + map[string]string{"__DISK_JOB_NAME__": diskJobName}, "default", diskJobName) + + if err := waitForPluginLog(ctx, cluster.Name, "Applied io limit", 2*time.Minute); err != nil { + t.Fatalf("io throttling log not observed: %v", err) + } + + applyJobAndWait(t, ctx, cluster, filepath.Join(manifestDir, "stress-memory-job.yaml"), + map[string]string{"__MEMORY_JOB_NAME__": memoryJobName}, "default", memoryJobName) + + if err := waitForPluginLog(ctx, cluster.Name, "memory exceeds", 2*time.Minute); err != nil { + t.Fatalf("memory cache clearing log not observed: %v", err) + } +} + +func requireCommands(t *testing.T, names ...string) { + t.Helper() + for _, name := range names { + if _, err := exec.LookPath(name); err != nil { + t.Skipf("%s not found in PATH: %v", name, err) + } + } +} + +func buildLimitBinary(t *testing.T, ctx context.Context) (string, string) { + t.Helper() + root := repoRoot(t) + + // Run make build + buildCmd := exec.CommandContext(ctx, "make", "build") + buildCmd.Dir = root + buildCmd.Stdout = os.Stdout + buildCmd.Stderr = os.Stderr + if err := buildCmd.Run(); err != nil { + t.Fatalf("failed to build limit binary with make: %v", err) + } + + // The binary is built as bin/linux/amd64/07-limit + binPath := filepath.Join(root, "bin", "linux", "amd64", "07-limit") + if _, err := os.Stat(binPath); err != nil { + t.Fatalf("binary not found at %s: %v", binPath, err) + } + + // The config file is at bin/linux/amd64/limit.conf + confPath := filepath.Join(root, "bin", "linux", "amd64", "limit.conf") + if _, err := os.Stat(confPath); err != nil { + t.Fatalf("config file not found at %s: %v", confPath, err) + } + + return binPath, confPath +} + +func installPluginIntoNodes(ctx context.Context, clusterName, binPath, confPath string) error { + // Get list of nodes in the cluster + cmd := helpers.Command{ + Name: "docker", + Args: []string{"ps", "-q", "--filter", fmt.Sprintf("name=%s-", clusterName)}, + Timeout: 30 * time.Second, + } + stdout, stderr, err := cmd.Run(ctx) + if err != nil { + return fmt.Errorf("failed to list nodes: %w (stderr: %s)", err, stderr) + } + + nodeIDs := strings.Fields(strings.TrimSpace(stdout)) + if len(nodeIDs) == 0 { + return fmt.Errorf("no nodes found for cluster %s", clusterName) + } + + for _, nodeID := range nodeIDs { + // Create directories in the node + createDirsCmd := helpers.Command{ + Name: "docker", + Args: []string{"exec", nodeID, "mkdir", "-p", "/opt/nri/plugins", "/etc/nri/conf.d"}, + Timeout: 30 * time.Second, + } + if _, stderr, err := createDirsCmd.Run(ctx); err != nil { + return fmt.Errorf("failed to create directories in node %s: %w (stderr: %s)", nodeID, err, stderr) + } + + // Copy binary to /opt/nri/plugins/ + copyBinCmd := helpers.Command{ + Name: "docker", + Args: []string{"cp", binPath, fmt.Sprintf("%s:/opt/nri/plugins/06-limit", nodeID)}, + Timeout: 30 * time.Second, + } + if _, stderr, err := copyBinCmd.Run(ctx); err != nil { + return fmt.Errorf("failed to copy binary to node %s: %w (stderr: %s)", nodeID, err, stderr) + } + + // Make binary executable + chmodCmd := helpers.Command{ + Name: "docker", + Args: []string{"exec", nodeID, "chmod", "+x", "/opt/nri/plugins/06-limit"}, + Timeout: 30 * time.Second, + } + if _, stderr, err := chmodCmd.Run(ctx); err != nil { + return fmt.Errorf("failed to chmod binary in node %s: %w (stderr: %s)", nodeID, err, stderr) + } + + // Copy config to /etc/nri/conf.d/ + copyConfCmd := helpers.Command{ + Name: "docker", + Args: []string{"cp", confPath, fmt.Sprintf("%s:/etc/nri/conf.d/limit.conf", nodeID)}, + Timeout: 30 * time.Second, + } + if _, stderr, err := copyConfCmd.Run(ctx); err != nil { + return fmt.Errorf("failed to copy config to node %s: %w (stderr: %s)", nodeID, err, stderr) + } + + // Restart containerd + restartCmd := helpers.Command{ + Name: "docker", + Args: []string{"exec", nodeID, "service", "containerd", "restart"}, + Timeout: 1 * time.Minute, + } + if _, stderr, err := restartCmd.Run(ctx); err != nil { + return fmt.Errorf("failed to restart containerd in node %s: %w (stderr: %s)", nodeID, err, stderr) + } + + // Wait a bit for containerd to fully restart + time.Sleep(5 * time.Second) + } + + return nil +} + +func readManifest(t *testing.T, path string) string { + t.Helper() + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("read manifest %s: %v", path, err) + } + return string(data) +} + +func applyJobAndWait(t *testing.T, ctx context.Context, cluster *kindenv.Cluster, manifestPath string, replacements map[string]string, namespace, jobName string) { + t.Helper() + if _, _, err := cluster.Kubectl(ctx, "delete", "job", jobName, "-n", namespace, "--ignore-not-found"); err != nil { + t.Logf("job cleanup failed (non-fatal): %v", err) + } + manifest := readManifest(t, manifestPath) + for k, v := range replacements { + manifest = strings.ReplaceAll(manifest, k, v) + } + if err := cluster.ApplyManifest(ctx, manifest); err != nil { + t.Fatalf("failed to apply manifest %s: %v", manifestPath, err) + } + if err := cluster.WaitForJobCompletion(ctx, namespace, jobName, 5*time.Minute); err != nil { + t.Fatalf("job %s did not complete: %v", jobName, err) + } +} + +func runOrLog(t *testing.T, ctx context.Context, name string, args ...string) { + t.Helper() + cmd := helpers.Command{Name: name, Args: args} + if _, _, err := cmd.Run(ctx); err != nil { + t.Logf("command %s %v failed: %v", name, args, err) + } +} + +func waitForLogSubstring(ctx context.Context, cluster *kindenv.Cluster, substr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for { + stdout, _, err := cluster.Kubectl(ctx, "-n", "nri-system", "logs", "daemonset/nri-limit") + if err == nil && strings.Contains(stdout, substr) { + return nil + } + if time.Now().After(deadline) { + if err != nil { + return fmt.Errorf("failed to fetch logs before deadline: %w", err) + } + return fmt.Errorf("log substring %q not found before deadline", substr) + } + time.Sleep(5 * time.Second) + } +} + +func waitForPluginLog(ctx context.Context, clusterName, substr string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + + // Get the first node ID + cmd := helpers.Command{ + Name: "docker", + Args: []string{"ps", "-q", "--filter", fmt.Sprintf("name=%s-", clusterName)}, + Timeout: 30 * time.Second, + } + stdout, _, err := cmd.Run(ctx) + if err != nil { + return fmt.Errorf("failed to list nodes: %w", err) + } + + nodeIDs := strings.Fields(strings.TrimSpace(stdout)) + if len(nodeIDs) == 0 { + return fmt.Errorf("no nodes found for cluster %s", clusterName) + } + nodeID := nodeIDs[0] + + for { + // Read the log file from the node + readLogCmd := helpers.Command{ + Name: "docker", + Args: []string{"exec", nodeID, "cat", "/var/log/nri-limit.log"}, + Timeout: 30 * time.Second, + } + logContent, _, err := readLogCmd.Run(ctx) + if err == nil && strings.Contains(logContent, substr) { + return nil + } + + if time.Now().After(deadline) { + if err != nil { + return fmt.Errorf("failed to read log file before deadline: %w", err) + } + return fmt.Errorf("log substring %q not found before deadline", substr) + } + time.Sleep(5 * time.Second) + } +} + +func repoRoot(t *testing.T) string { + t.Helper() + dir, err := os.Getwd() + if err != nil { + t.Fatalf("get working directory: %v", err) + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + t.Fatalf("could not locate go.mod starting from %s", dir) + } + dir = parent + } +} + +func manifestsDir(t *testing.T) string { + t.Helper() + return filepath.Join(repoRoot(t), "integration", "manifests") +} diff --git a/integration/manifests/kind-config.yaml b/integration/manifests/kind-config.yaml new file mode 100644 index 0000000..6cf03f2 --- /dev/null +++ b/integration/manifests/kind-config.yaml @@ -0,0 +1,19 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: + - role: control-plane + kubeadmConfigPatches: + - | + kind: ClusterConfiguration + apiServer: + extraArgs: + "enable-admission-plugins": NodeRestriction + extraMounts: + - hostPath: /var/lib/docker + containerPath: /var/lib/docker + containerdConfigPatches: + - | + [plugins."io.containerd.grpc.v1.cri"] + enable_nri = true + [plugins."io.containerd.grpc.v1.cri".containerd] + snapshotter = "overlayfs" diff --git a/integration/manifests/limit-plugin.yaml b/integration/manifests/limit-plugin.yaml new file mode 100644 index 0000000..6e40aaf --- /dev/null +++ b/integration/manifests/limit-plugin.yaml @@ -0,0 +1,84 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: nri-system +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: limit-config + namespace: nri-system +data: + limit.conf: | + { + "watch_interval": 5, + "io": { + "max_disk_bytes": 10485760, + "bps_limit": 1048576, + "iops_limit": 5 + }, + "memory": { + "pods-usage-percent": 20, + "cache-rss-ratio": 3.5, + "min-cache-bytes": 10485760 + }, + "log_path": "" + } +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nri-limit + namespace: nri-system +spec: + selector: + matchLabels: + app: limit-plugin + template: + metadata: + labels: + app: limit-plugin + spec: + hostPID: true + containers: + - name: limit + image: __PLUGIN_IMAGE__ + imagePullPolicy: Never + securityContext: + privileged: true + command: ["/opt/nri/plugins/limit"] + volumeMounts: + - mountPath: /etc/containerd + name: containerd-config + - mountPath: /run/containerd + name: containerd-socket + - mountPath: /opt/nri/plugins + name: plugin-dir + - mountPath: /var/lib/containerd + name: containerd-root + - mountPath: /etc/nri/conf.d + name: nri-conf + env: + - name: LIMIT_CONFIG_FILE + value: /etc/nri/conf.d/limit.conf + volumes: + - name: containerd-config + hostPath: + path: /etc/containerd + - name: containerd-socket + hostPath: + path: /run/containerd + - name: plugin-dir + hostPath: + path: /opt/nri/plugins + - name: containerd-root + hostPath: + path: /var/lib/containerd + - name: nri-conf + projected: + sources: + - configMap: + name: limit-config + items: + - key: limit.conf + path: limit.conf diff --git a/integration/manifests/stress-disk-job.yaml b/integration/manifests/stress-disk-job.yaml new file mode 100644 index 0000000..809efb2 --- /dev/null +++ b/integration/manifests/stress-disk-job.yaml @@ -0,0 +1,18 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: __DISK_JOB_NAME__ + namespace: default +spec: + template: + spec: + restartPolicy: Never + containers: + - name: stress + image: alpine + command: + - /bin/sh + - -c + - | + dd if=/dev/zero of=/tmp/bigfile bs=1M count=20 && sync && sleep 5 + backoffLimit: 0 diff --git a/integration/manifests/stress-memory-job.yaml b/integration/manifests/stress-memory-job.yaml new file mode 100644 index 0000000..ab40ad1 --- /dev/null +++ b/integration/manifests/stress-memory-job.yaml @@ -0,0 +1,18 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: __MEMORY_JOB_NAME__ + namespace: default +spec: + template: + spec: + restartPolicy: Never + containers: + - name: stress + image: alpine + command: + - /bin/sh + - -c + - | + dd if=/dev/zero of=/tmp/cachefile bs=1M count=200 && sync && sleep 10 + backoffLimit: 0 diff --git a/pkg/containerd/containerd.go b/pkg/containerd/containerd.go index 6b5baeb..34027c6 100644 --- a/pkg/containerd/containerd.go +++ b/pkg/containerd/containerd.go @@ -17,8 +17,8 @@ const ( type containerdConfig struct { Root string `toml:"root"` GRPC struct { - Address string `toml:"address"` - } `toml:"grpc"` + Address string `toml:"address,omit"` + } `toml:"grpc,omit"` } type Cntrd struct { @@ -37,7 +37,9 @@ func ParseContainerdConfig(configPath string) (*Cntrd, error) { if _, err := toml.DecodeFile(configPath, &config); err != nil { return nil, fmt.Errorf("failed to parse containerd config: %w", err) } - + if config.GRPC.Address == "" { + config.GRPC.Address = "/run/containerd/containerd.sock" + } client, err := client.New(config.GRPC.Address, client.WithDefaultNamespace(DefaultNamespace)) if err != nil { return nil, fmt.Errorf("failed to connect to containerd: %w", err)