From 1e3cc47800d7029c89806301da413c7c4d8dc6f1 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:08:30 +0000 Subject: [PATCH 01/14] Broaden app control plane to all workloads --- README.md | 17 +- docs/cluster-guide.md | 12 +- docs/users-guide.md | 7 +- src/api/routes/cluster_agents.zig | 2 + src/api/routes/cluster_agents/app_routes.zig | 23 +- .../routes/cluster_agents/apply_request.zig | 82 +-- .../routes/cluster_agents/deploy_routes.zig | 57 ++- .../routes/cluster_agents/workload_routes.zig | 402 +++++++++++++++ src/lib/command_registry.zig | 4 +- src/lib/completion.zig | 13 +- src/lib/json_helpers.zig | 43 ++ src/manifest/app_snapshot.zig | 213 ++++++++ src/manifest/app_spec.zig | 472 +++++++++++++++--- src/manifest/cli/ops.zig | 67 ++- src/manifest/cli/train.zig | 171 ++++++- src/manifest/loader.zig | 37 +- src/runtime/cli/status_command.zig | 33 +- src/state/store.zig | 6 + src/state/store/training.zig | 24 + src/test_root.zig | 2 + 20 files changed, 1524 insertions(+), 163 deletions(-) create mode 100644 src/api/routes/cluster_agents/workload_routes.zig create mode 100644 src/manifest/app_snapshot.zig diff --git a/README.md b/README.md index a026fd7b..ae597f22 100644 --- a/README.md +++ b/README.md @@ -186,6 +186,7 @@ yoq up --dev watch and hot-restart on changes yoq up --server host:port deploy to a cluster yoq down [-f manifest.toml] stop services from a manifest yoq run-worker run a one-shot worker +yoq run-worker --server host:port yoq init [-f path] scaffold a manifest yoq validate [-f manifest.toml] [-q] validate a manifest ``` @@ -260,13 +261,13 @@ yoq gpu bench [--gpus N] GPU-to-GPU bandwidth benchmark ### training ```text -yoq train start start a training job -yoq train status show training job status -yoq train stop stop a training job -yoq train pause pause a training job -yoq train resume resume a paused job -yoq train scale scale training ranks -yoq train logs [--rank N] show logs for a training rank +yoq train start [--server 
host:port] start a training job +yoq train status [--server host:port] show training job status +yoq train stop [--server host:port] stop a training job +yoq train pause [--server host:port] pause a training job +yoq train resume [--server host:port] resume a paused job +yoq train scale [--server host:port] --gpus scale training ranks +yoq train logs [--server host:port] [--rank N] show logs for a training rank ``` ### diagnostics @@ -290,7 +291,7 @@ Notes: - `--json` is available on `ps`, `images`, `prune`, `version`, `gpu topo`, and `doctor`. - crons defined in the manifest start automatically with `yoq up`. - deployment, metrics, and certificate commands also support `--server host:port`. -- clustered manifest deploys now go through the app-first `/apps/apply` API. the older `/deploy` route remains as a compatibility shim for legacy callers. +- clustered manifest deploys now go through the app-first `/apps/apply` API and carry services, workers, crons, and training definitions in one app snapshot. the older `/deploy` route remains as a compatibility shim for legacy callers. ## current status diff --git a/docs/cluster-guide.md b/docs/cluster-guide.md index 167c9435..3249cf4e 100644 --- a/docs/cluster-guide.md +++ b/docs/cluster-guide.md @@ -136,7 +136,7 @@ port = 3000 yoq up --server ``` -the `--server` flag tells yoq to submit the manifest to the cluster API instead of running locally. under the hood the CLI now sends a canonical app snapshot to `POST /apps/apply`; the older `/deploy` route remains only for compatibility. the scheduler places containers on agents using bin-packing (scores by free CPU + memory). service discovery and load balancing work transparently across nodes via the WireGuard overlay and eBPF. +the `--server` flag tells yoq to submit the manifest to the cluster API instead of running locally. 
under the hood the CLI now sends a canonical app snapshot to `POST /apps/apply`; that snapshot carries services, workers, crons, and training jobs together. the older `/deploy` route remains only for compatibility. the scheduler places containers on agents using bin-packing (scores by free CPU + memory). service discovery and load balancing work transparently across nodes via the WireGuard overlay and eBPF. after deploy, use the app-first day-2 commands: @@ -147,7 +147,7 @@ yoq history --app [name] --server 10.0.0.1:7700 yoq rollback --app [name] --server 10.0.0.1:7700 --release ``` -`yoq apps` shows the latest release summary for every app, `status --app` shows the latest release metadata for one app, `history --app` lists prior releases, and remote `rollback --app ... --release` re-applies a stored app snapshot. +`yoq apps` shows the latest release summary for every app, `status --app` shows the latest release metadata for one app, `history --app` lists prior releases, and remote `rollback --app ... --release` re-applies a stored app snapshot. `yoq run-worker --server ...` and `yoq train ... --server ...` now resolve workers and training jobs from the current app release on the server. 
--- @@ -342,12 +342,20 @@ for app operations, the important write paths are: - `POST /apps/apply` - `POST /apps//rollback` +- `POST /apps//workers//run` +- `POST /apps//training//start` +- `POST /apps//training//stop` +- `POST /apps//training//pause` +- `POST /apps//training//resume` +- `POST /apps//training//scale` the important read paths are: - `GET /apps` - `GET /apps//status` - `GET /apps//history` +- `GET /apps//training//status` +- `GET /apps//training//logs` ### draining a node diff --git a/docs/users-guide.md b/docs/users-guide.md index 5e338983..9b154e27 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -170,6 +170,8 @@ this gives the operator one app-first day-2 model: - `yoq rollback --app [name]` — print the last successful local app snapshot - `yoq rollback --app [name] --server host:port --release ` — re-apply a prior remote app release - `yoq apps` — list app release summaries across all known apps +- `yoq run-worker [--server host:port] ` — run a worker from the current app release +- `yoq train start|status|stop|pause|resume|scale|logs [--server host:port] ` — manage training jobs from the current app release ### dev mode @@ -217,7 +219,7 @@ if the leader changes, agents follow automatically — heartbeat responses inclu ### app-first control plane -cluster manifest deploys now use `POST /apps/apply` as the canonical write path. the older `POST /deploy` route is still accepted as a compatibility shim, but new CLI work targets the app-first route. +cluster manifest deploys now use `POST /apps/apply` as the canonical write path. the app snapshot includes services, workers, crons, and training jobs. the older `POST /deploy` route is still accepted as a compatibility shim, but new CLI work targets the app-first route. 
the cluster API also exposes app-scoped day-2 reads and rollback: @@ -225,6 +227,9 @@ the cluster API also exposes app-scoped day-2 reads and rollback: - `GET /apps//status` — latest app release metadata - `GET /apps//history` — app release history - `POST /apps//rollback` with `{"release_id":"..."}` — re-apply a stored app release snapshot +- `POST /apps//workers//run` — run a worker from the current app release +- `POST /apps//training//start|stop|pause|resume|scale` — manage training jobs for the current app release +- `GET /apps//training//status|logs` — inspect training jobs for the current app release ### rolling upgrades diff --git a/src/api/routes/cluster_agents.zig b/src/api/routes/cluster_agents.zig index 3c634755..d94f9dc6 100644 --- a/src/api/routes/cluster_agents.zig +++ b/src/api/routes/cluster_agents.zig @@ -6,6 +6,7 @@ const testing = std.testing; const cluster_routes = @import("cluster_agents/cluster_routes.zig"); const agent_routes = @import("cluster_agents/agent_routes.zig"); const app_routes = @import("cluster_agents/app_routes.zig"); +const workload_routes = @import("cluster_agents/workload_routes.zig"); const deploy_routes = @import("cluster_agents/deploy_routes.zig"); const writers = @import("cluster_agents/writers.zig"); @@ -37,6 +38,7 @@ pub fn route(request: http.Request, alloc: std.mem.Allocator, ctx: RouteContext) } if (app_routes.route(request, alloc, ctx)) |resp| return resp; + if (workload_routes.route(request, alloc, ctx)) |resp| return resp; if (path.len > "/agents/".len and std.mem.startsWith(u8, path, "/agents/")) { const rest = path["/agents/".len..]; diff --git a/src/api/routes/cluster_agents/app_routes.zig b/src/api/routes/cluster_agents/app_routes.zig index e4f0fc6b..089d8e43 100644 --- a/src/api/routes/cluster_agents/app_routes.zig +++ b/src/api/routes/cluster_agents/app_routes.zig @@ -4,6 +4,7 @@ const sqlite = @import("sqlite"); const cluster_node = @import("../../../cluster/node.zig"); const json_helpers = 
@import("../../../lib/json_helpers.zig"); const apply_release = @import("../../../manifest/apply_release.zig"); +const app_snapshot = @import("../../../manifest/app_snapshot.zig"); const schema = @import("../../../state/schema.zig"); const store = @import("../../../state/store.zig"); const common = @import("../common.zig"); @@ -164,6 +165,7 @@ fn formatAppStatusResponseFromDeployments( alloc, apply_release.reportFromDeployment(latest), if (previous_successful) |dep| apply_release.reportFromDeployment(dep) else null, + app_snapshot.summarize(latest.config_snapshot), ); } @@ -175,6 +177,7 @@ fn formatAppHistoryResponse(alloc: std.mem.Allocator, deployments: []const store try writer.writeByte('['); for (deployments, 0..) |dep, i| { const report = apply_release.reportFromDeployment(dep); + const summary = app_snapshot.summarize(dep.config_snapshot); if (i > 0) try writer.writeByte(','); try writer.writeByte('{'); try json_helpers.writeJsonStringField(writer, "id", report.release_id orelse ""); @@ -189,7 +192,11 @@ fn formatAppHistoryResponse(alloc: std.mem.Allocator, deployments: []const store try writer.writeByte(','); try json_helpers.writeJsonStringField(writer, "manifest_hash", report.manifest_hash); try writer.print(",\"created_at\":{d}", .{report.created_at}); - try writer.print(",\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ + try writer.print(",\"service_count\":{d},\"worker_count\":{d},\"cron_count\":{d},\"training_job_count\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ + summary.service_count, + summary.worker_count, + summary.cron_count, + summary.training_job_count, report.completed_targets, report.failed_targets, report.remainingTargets(), @@ -208,6 +215,7 @@ fn formatAppStatusResponse( alloc: std.mem.Allocator, report: apply_release.ApplyReport, previous_successful: ?apply_release.ApplyReport, + summary: app_snapshot.Summary, ) ![]u8 { var json_buf: std.ArrayList(u8) = .empty; 
errdefer json_buf.deinit(alloc); @@ -223,9 +231,12 @@ fn formatAppStatusResponse( try json_helpers.writeJsonStringField(writer, "status", report.status.toString()); try writer.writeByte(','); try json_helpers.writeJsonStringField(writer, "manifest_hash", report.manifest_hash); - try writer.print(",\"created_at\":{d},\"service_count\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ + try writer.print(",\"created_at\":{d},\"service_count\":{d},\"worker_count\":{d},\"cron_count\":{d},\"training_job_count\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ report.created_at, - report.service_count, + summary.service_count, + summary.worker_count, + summary.cron_count, + summary.training_job_count, report.completed_targets, report.failed_targets, report.remainingTargets(), @@ -390,7 +401,7 @@ test "formatAppStatusResponse summarizes latest release" { .created_at = 200, }; - const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null); + const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot)); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"demo-app\"") != null); @@ -491,7 +502,7 @@ test "formatAppStatusResponse includes structured rollback metadata" { .created_at = 300, }; - const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null); + const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot)); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"trigger\":\"rollback\"") != null); @@ -513,7 +524,7 @@ test "formatAppStatusResponse falls back to rollback metadata inferred from lega .created_at = 400, }; - const json = try formatAppStatusResponse(alloc, 
apply_release.reportFromDeployment(latest), null); + const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot)); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"trigger\":\"rollback\"") != null); diff --git a/src/api/routes/cluster_agents/apply_request.zig b/src/api/routes/cluster_agents/apply_request.zig index bda0730c..798a8d48 100644 --- a/src/api/routes/cluster_agents/apply_request.zig +++ b/src/api/routes/cluster_agents/apply_request.zig @@ -2,6 +2,7 @@ const std = @import("std"); const scheduler = @import("../../../cluster/scheduler.zig"); const volumes_mod = @import("../../../state/volumes.zig"); const json_helpers = @import("../../../lib/json_helpers.zig"); +const app_snapshot = @import("../../../manifest/app_snapshot.zig"); const common = @import("../common.zig"); const extractJsonString = json_helpers.extractJsonString; @@ -10,6 +11,7 @@ const extractJsonArray = json_helpers.extractJsonArray; pub const ApplyRequest = struct { app_name: ?[]const u8, + summary: app_snapshot.Summary, requests: std.ArrayListUnmanaged(scheduler.PlacementRequest) = .empty, pub fn deinit(self: *ApplyRequest, alloc: std.mem.Allocator) void { @@ -36,6 +38,7 @@ pub const ParseError = error{ pub fn parse(alloc: std.mem.Allocator, body: []const u8, require_app_name: bool) ParseError!ApplyRequest { var parsed: ApplyRequest = .{ .app_name = extractJsonString(body, "app_name") orelse extractJsonString(body, "volume_app"), + .summary = app_snapshot.summarize(body), }; errdefer parsed.deinit(alloc); @@ -43,41 +46,46 @@ pub fn parse(alloc: std.mem.Allocator, body: []const u8, require_app_name: bool) return ParseError.MissingAppName; } - const services_json = extractJsonArray(body, "services") orelse return ParseError.MissingServicesArray; + if (extractJsonArray(body, "services")) |services_json| { + var iter = json_helpers.extractJsonObjects(services_json); + while 
(iter.next()) |block| { + const image = extractJsonString(block, "image") orelse continue; + const command = extractCommandString(alloc, block) catch return ParseError.OutOfMemory; + errdefer alloc.free(command); - var iter = json_helpers.extractJsonObjects(services_json); - while (iter.next()) |block| { - const image = extractJsonString(block, "image") orelse continue; - const command = extractCommandString(alloc, block) catch return ParseError.OutOfMemory; - errdefer alloc.free(command); + if (!common.validateClusterInput(image)) { + alloc.free(command); + continue; + } + if (command.len > 0 and !common.validateClusterInput(command)) { + alloc.free(command); + continue; + } - if (!common.validateClusterInput(image)) { - alloc.free(command); - continue; - } - if (command.len > 0 and !common.validateClusterInput(command)) { - alloc.free(command); - continue; + parsed.requests.append(alloc, .{ + .image = image, + .command = command, + .cpu_limit = extractJsonInt(block, "cpu_limit") orelse 1000, + .memory_limit_mb = extractJsonInt(block, "memory_limit_mb") orelse 256, + .gpu_limit = extractJsonInt(block, "gpu_limit") orelse 0, + .gpu_model = extractJsonString(block, "gpu_model"), + .gpu_vram_min_mb = if (extractJsonInt(block, "gpu_vram_min_mb")) |v| @as(u64, @intCast(@max(0, v))) else null, + .required_labels = extractJsonString(block, "required_labels") orelse "", + .gang_world_size = if (extractJsonInt(block, "gang_world_size")) |v| @intCast(@max(0, v)) else 0, + .gpus_per_rank = if (extractJsonInt(block, "gpus_per_rank")) |v| @intCast(@max(1, v)) else 1, + }) catch { + alloc.free(command); + return ParseError.OutOfMemory; + }; } - - parsed.requests.append(alloc, .{ - .image = image, - .command = command, - .cpu_limit = extractJsonInt(block, "cpu_limit") orelse 1000, - .memory_limit_mb = extractJsonInt(block, "memory_limit_mb") orelse 256, - .gpu_limit = extractJsonInt(block, "gpu_limit") orelse 0, - .gpu_model = extractJsonString(block, "gpu_model"), - 
.gpu_vram_min_mb = if (extractJsonInt(block, "gpu_vram_min_mb")) |v| @as(u64, @intCast(@max(0, v))) else null, - .required_labels = extractJsonString(block, "required_labels") orelse "", - .gang_world_size = if (extractJsonInt(block, "gang_world_size")) |v| @intCast(@max(0, v)) else 0, - .gpus_per_rank = if (extractJsonInt(block, "gpus_per_rank")) |v| @intCast(@max(1, v)) else 1, - }) catch { - alloc.free(command); - return ParseError.OutOfMemory; - }; + } else if (parsed.summary.hasAny()) { + return parsed; } - if (parsed.requests.items.len == 0) return ParseError.NoServices; + if (parsed.requests.items.len == 0) { + if (parsed.summary.hasAny()) return parsed; + return ParseError.NoServices; + } return parsed; } @@ -154,3 +162,17 @@ test "parse joins structured command arrays" { try std.testing.expectEqual(@as(usize, 1), parsed.requests.items.len); try std.testing.expectEqualStrings("nginx -g daemon off", parsed.requests.items[0].command); } + +test "parse accepts training-only app apply payloads" { + const alloc = std.testing.allocator; + const json = + \\{"app_name":"demo-app","workers":[],"crons":[],"training_jobs":[{"name":"finetune","image":"trainer:v1","command":["torchrun","train.py"],"gpus":4}],"services":[]} + ; + + var parsed = try parse(alloc, json, true); + defer parsed.deinit(alloc); + + try std.testing.expectEqualStrings("demo-app", parsed.app_name.?); + try std.testing.expectEqual(@as(usize, 0), parsed.requests.items.len); + try std.testing.expectEqual(@as(usize, 1), parsed.summary.training_job_count); +} diff --git a/src/api/routes/cluster_agents/deploy_routes.zig b/src/api/routes/cluster_agents/deploy_routes.zig index a152b729..26c77ca4 100644 --- a/src/api/routes/cluster_agents/deploy_routes.zig +++ b/src/api/routes/cluster_agents/deploy_routes.zig @@ -4,6 +4,7 @@ const scheduler = @import("../../../cluster/scheduler.zig"); const cluster_node = @import("../../../cluster/node.zig"); const json_helpers = @import("../../../lib/json_helpers.zig"); 
const apply_release = @import("../../../manifest/apply_release.zig"); +const app_snapshot = @import("../../../manifest/app_snapshot.zig"); const apply_request = @import("apply_request.zig"); const volumes_mod = @import("../../../state/volumes.zig"); const agent_registry = @import("../../../cluster/registry.zig"); @@ -18,7 +19,7 @@ const ResponseMode = enum { app, }; -const ClusterApplyError = error{ +pub const ClusterApplyError = error{ NotLeader, InternalError, }; @@ -85,7 +86,7 @@ const ClusterReleaseTracker = struct { } }; -const ClusterApplyBackend = struct { +pub const ClusterApplyBackend = struct { alloc: std.mem.Allocator, node: *cluster_node.Node, requests: []scheduler.PlacementRequest, @@ -260,13 +261,16 @@ fn handleApply( parsed.setVolumeConstraints(vol_constraints); - const agents = agent_registry.listAgents(alloc, db) catch return common.internalError(); + const agents = if (parsed.requests.items.len > 0) + agent_registry.listAgents(alloc, db) catch return common.internalError() + else + alloc.alloc(agent_registry.AgentRecord, 0) catch return common.internalError(); defer { for (agents) |a| a.deinit(alloc); alloc.free(agents); } - if (agents.len == 0) { + if (parsed.requests.items.len > 0 and agents.len == 0) { return .{ .status = .bad_request, .body = "{\"error\":\"no agents available\"}", .allocated = false }; } @@ -292,7 +296,7 @@ fn handleApply( const body = switch (response_mode) { .legacy => formatLegacyApplyResponse(alloc, apply_report.placed, apply_report.failed) catch return common.internalError(), - .app => formatAppApplyResponse(alloc, apply_report) catch return common.internalError(), + .app => formatAppApplyResponse(alloc, apply_report, parsed.summary) catch return common.internalError(), }; return .{ .status = .ok, .body = body, .allocated = true }; } @@ -321,7 +325,11 @@ fn formatLegacyApplyResponse(alloc: std.mem.Allocator, placed: usize, failed: us return std.fmt.allocPrint(alloc, "{{\"placed\":{d},\"failed\":{d}}}", .{ placed, failed 
}); } -fn formatAppApplyResponse(alloc: std.mem.Allocator, report: apply_release.ApplyReport) ![]u8 { +fn formatAppApplyResponse( + alloc: std.mem.Allocator, + report: apply_release.ApplyReport, + summary: app_snapshot.Summary, +) ![]u8 { var json_buf: std.ArrayList(u8) = .empty; errdefer json_buf.deinit(alloc); const writer = json_buf.writer(alloc); @@ -334,8 +342,11 @@ fn formatAppApplyResponse(alloc: std.mem.Allocator, report: apply_release.ApplyR try json_helpers.writeJsonStringField(writer, "release_id", report.release_id orelse ""); try writer.writeByte(','); try json_helpers.writeJsonStringField(writer, "status", report.status.toString()); - try writer.print(",\"service_count\":{d},\"placed\":{d},\"failed\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ - report.service_count, + try writer.print(",\"service_count\":{d},\"worker_count\":{d},\"cron_count\":{d},\"training_job_count\":{d},\"placed\":{d},\"failed\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ + summary.service_count, + summary.worker_count, + summary.cron_count, + summary.training_job_count, report.placed, report.failed, report.completed_targets, @@ -366,7 +377,7 @@ test "formatAppApplyResponse includes app release metadata" { .failed = 0, .completed_targets = 2, .failed_targets = 0, - }); + }, .{ .service_count = 2 }); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"demo-app\"") != null); @@ -374,6 +385,7 @@ test "formatAppApplyResponse includes app release metadata" { try std.testing.expect(std.mem.indexOf(u8, json, "\"release_id\":\"abc123def456\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"status\":\"completed\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"service_count\":2") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"worker_count\":0") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"placed\":2") != 
null); try std.testing.expect(std.mem.indexOf(u8, json, "\"failed\":0") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"completed_targets\":2") != null); @@ -397,7 +409,7 @@ test "formatAppApplyResponse includes rollback trigger metadata" { .message = "all placements succeeded", .trigger = .rollback, .source_release_id = "dep-1", - }); + }, .{ .service_count = 2 }); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"trigger\":\"rollback\"") != null); @@ -417,7 +429,7 @@ test "formatAppApplyResponse includes partially failed status" { .completed_targets = 1, .failed_targets = 1, .message = "one or more placements failed", - }); + }, .{ .service_count = 2 }); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"status\":\"partially_failed\"") != null); @@ -433,3 +445,26 @@ test "formatLegacyApplyResponse preserves compact deploy shape" { try std.testing.expectEqualStrings("{\"placed\":1,\"failed\":1}", json); } + +test "formatAppApplyResponse includes non-service workload counts" { + const alloc = std.testing.allocator; + const json = try formatAppApplyResponse(alloc, .{ + .app_name = "demo-app", + .release_id = "dep-4", + .status = .completed, + .service_count = 0, + .placed = 0, + .failed = 0, + .completed_targets = 0, + .failed_targets = 0, + }, .{ + .worker_count = 1, + .cron_count = 2, + .training_job_count = 1, + }); + defer alloc.free(json); + + try std.testing.expect(std.mem.indexOf(u8, json, "\"worker_count\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"cron_count\":2") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"training_job_count\":1") != null); +} diff --git a/src/api/routes/cluster_agents/workload_routes.zig b/src/api/routes/cluster_agents/workload_routes.zig new file mode 100644 index 00000000..d948a2c4 --- /dev/null +++ b/src/api/routes/cluster_agents/workload_routes.zig @@ -0,0 +1,402 @@ +const std = @import("std"); +const scheduler = 
@import("../../../cluster/scheduler.zig"); +const cluster_node = @import("../../../cluster/node.zig"); +const agent_registry = @import("../../../cluster/registry.zig"); +const json_helpers = @import("../../../lib/json_helpers.zig"); +const app_snapshot = @import("../../../manifest/app_snapshot.zig"); +const apply_release = @import("../../../manifest/apply_release.zig"); +const deploy_routes = @import("deploy_routes.zig"); +const store = @import("../../../state/store.zig"); +const common = @import("../common.zig"); +const http = @import("../../http.zig"); + +const Response = common.Response; +const RouteContext = common.RouteContext; + +pub fn route(request: http.Request, alloc: std.mem.Allocator, ctx: RouteContext) ?Response { + if (!std.mem.startsWith(u8, request.path_only, "/apps/")) return null; + + const rest = request.path_only["/apps/".len..]; + if (matchWorkerRun(rest)) |parsed| { + if (!common.validateClusterInput(parsed.app_name) or !common.validateClusterInput(parsed.worker_name)) { + return common.badRequest("invalid app or worker name"); + } + if (request.method != .POST) return common.methodNotAllowed(); + return handleWorkerRun(alloc, parsed.app_name, parsed.worker_name, ctx); + } + + if (matchTrainingAction(rest)) |parsed| { + if (!common.validateClusterInput(parsed.app_name) or !common.validateClusterInput(parsed.job_name)) { + return common.badRequest("invalid app or training job name"); + } + if (parsed.action == TrainingAction.start) { + if (request.method != .POST) return common.methodNotAllowed(); + return handleTrainingStart(alloc, parsed.app_name, parsed.job_name, ctx); + } + if (parsed.action == TrainingAction.status) { + if (request.method != .GET) return common.methodNotAllowed(); + return handleTrainingStatus(alloc, parsed.app_name, parsed.job_name, ctx); + } + if (parsed.action == TrainingAction.stop) { + if (request.method != .POST) return common.methodNotAllowed(); + return handleTrainingStateChange(alloc, parsed.app_name, 
parsed.job_name, "stopped", ctx); + } + if (parsed.action == TrainingAction.pause) { + if (request.method != .POST) return common.methodNotAllowed(); + return handleTrainingStateChange(alloc, parsed.app_name, parsed.job_name, "paused", ctx); + } + if (parsed.action == TrainingAction.resume_) { + if (request.method != .POST) return common.methodNotAllowed(); + return handleTrainingResume(alloc, parsed.app_name, parsed.job_name, ctx); + } + if (parsed.action == TrainingAction.scale) { + if (request.method != .POST) return common.methodNotAllowed(); + return handleTrainingScale(alloc, parsed.app_name, parsed.job_name, request, ctx); + } + if (request.method != .GET) return common.methodNotAllowed(); + return handleTrainingLogs(alloc, parsed.app_name, parsed.job_name, request, ctx); + } + + return null; +} + +const WorkerRunPath = struct { + app_name: []const u8, + worker_name: []const u8, +}; + +fn matchWorkerRun(rest: []const u8) ?WorkerRunPath { + const workers_idx = std.mem.indexOf(u8, rest, "/workers/") orelse return null; + const app_name = rest[0..workers_idx]; + const tail = rest[workers_idx + "/workers/".len ..]; + const slash = std.mem.indexOfScalar(u8, tail, '/') orelse return null; + const worker_name = tail[0..slash]; + if (!std.mem.eql(u8, tail[slash..], "/run")) return null; + if (app_name.len == 0 or worker_name.len == 0) return null; + return .{ .app_name = app_name, .worker_name = worker_name }; +} + +const TrainingAction = enum { start, status, stop, pause, resume_, scale, logs }; + +const TrainingPath = struct { + app_name: []const u8, + job_name: []const u8, + action: TrainingAction, +}; + +fn matchTrainingAction(rest: []const u8) ?TrainingPath { + const idx = std.mem.indexOf(u8, rest, "/training/") orelse return null; + const app_name = rest[0..idx]; + const tail = rest[idx + "/training/".len ..]; + const slash = std.mem.indexOfScalar(u8, tail, '/') orelse return null; + const job_name = tail[0..slash]; + const action_str = tail[slash + 1 ..]; + 
if (app_name.len == 0 or job_name.len == 0) return null; + + const action = if (std.mem.eql(u8, action_str, "start")) + TrainingAction.start + else if (std.mem.eql(u8, action_str, "status")) + TrainingAction.status + else if (std.mem.eql(u8, action_str, "stop")) + TrainingAction.stop + else if (std.mem.eql(u8, action_str, "pause")) + TrainingAction.pause + else if (std.mem.eql(u8, action_str, "resume")) + TrainingAction.resume_ + else if (std.mem.eql(u8, action_str, "scale")) + TrainingAction.scale + else if (std.mem.eql(u8, action_str, "logs")) + TrainingAction.logs + else + return null; + + return .{ .app_name = app_name, .job_name = job_name, .action = action }; +} + +fn handleWorkerRun(alloc: std.mem.Allocator, app_name: []const u8, worker_name: []const u8, ctx: RouteContext) Response { + const node = ctx.cluster orelse return common.badRequest("not running in cluster mode"); + const latest = store.getLatestDeploymentByAppInDb(node.stateMachineDb(), alloc, app_name) catch |err| return switch (err) { + error.NotFound => common.notFound(), + else => common.internalError(), + }; + defer latest.deinit(alloc); + + const worker = app_snapshot.findWorkerRunSpec(alloc, latest.config_snapshot, worker_name) catch return common.internalError(); + if (worker == null) return common.notFound(); + defer worker.?.deinit(alloc); + + const outcome = runPlacementRequests(alloc, node, &[_]scheduler.PlacementRequest{.{ + .image = worker.?.image, + .command = worker.?.command, + .cpu_limit = 1000, + .memory_limit_mb = 256, + .gpu_limit = worker.?.gpu_limit, + .gpu_model = worker.?.gpu_model, + .gpu_vram_min_mb = worker.?.gpu_vram_min_mb, + .required_labels = worker.?.required_labels, + }}) catch |err| return switch (err) { + error.NotLeader => common.notLeader(alloc, node), + else => common.internalError(), + }; + + const body = std.fmt.allocPrint( + alloc, + "{{\"app_name\":\"{s}\",\"worker\":\"{s}\",\"placed\":{d},\"failed\":{d},\"message\":\"{s}\"}}", + .{ app_name, worker_name, 
outcome.placed, outcome.failed, if (outcome.failed == 0) "worker scheduled" else "worker scheduling failed" }, + ) catch return common.internalError(); + return .{ .status = .ok, .body = body, .allocated = true }; +} + +fn handleTrainingStart(alloc: std.mem.Allocator, app_name: []const u8, job_name: []const u8, ctx: RouteContext) Response { + return scheduleTrainingJob(alloc, app_name, job_name, null, null, ctx); +} + +fn handleTrainingResume(alloc: std.mem.Allocator, app_name: []const u8, job_name: []const u8, ctx: RouteContext) Response { + const node = ctx.cluster orelse return common.badRequest("not running in cluster mode"); + const rec = store.findTrainingJobInDb(node.stateMachineDb(), alloc, app_name, job_name) catch return common.internalError(); + if (rec == null) return common.notFound(); + defer rec.?.deinit(alloc); + return scheduleTrainingJob(alloc, app_name, job_name, rec.?.id, null, ctx); +} + +fn handleTrainingScale( + alloc: std.mem.Allocator, + app_name: []const u8, + job_name: []const u8, + request: http.Request, + ctx: RouteContext, +) Response { + const node = ctx.cluster orelse return common.badRequest("not running in cluster mode"); + const gpus = json_helpers.extractJsonInt(request.body, "gpus") orelse return common.badRequest("missing gpus"); + if (gpus <= 0) return common.badRequest("invalid gpus"); + + const rec = store.findTrainingJobInDb(node.stateMachineDb(), alloc, app_name, job_name) catch return common.internalError(); + if (rec == null) return common.notFound(); + defer rec.?.deinit(alloc); + + store.updateTrainingJobGpusInDb(node.stateMachineDb(), rec.?.id, @intCast(gpus), std.time.timestamp()) catch return common.internalError(); + return scheduleTrainingJob(alloc, app_name, job_name, rec.?.id, @intCast(gpus), ctx); +} + +fn handleTrainingStateChange( + alloc: std.mem.Allocator, + app_name: []const u8, + job_name: []const u8, + new_state: []const u8, + ctx: RouteContext, +) Response { + const node = ctx.cluster orelse return 
common.badRequest("not running in cluster mode");
+    const rec = store.findTrainingJobInDb(node.stateMachineDb(), alloc, app_name, job_name) catch return common.internalError();
+    if (rec == null) return common.notFound();
+    defer rec.?.deinit(alloc);
+    // write the new state, then re-read the record so the response carries the stored values
+    store.updateTrainingJobStateInDb(node.stateMachineDb(), rec.?.id, new_state, std.time.timestamp()) catch return common.internalError();
+    const updated = store.getTrainingJobInDb(node.stateMachineDb(), alloc, rec.?.id) catch return common.internalError();
+    defer updated.deinit(alloc);
+    return formatTrainingRecordResponse(
+        alloc,
+        updated,
+        new_state,
+        if (std.mem.eql(u8, new_state, "paused")) "training job paused" else "training job stopped",
+    );
+}
+/// Returns the stored training job record for app/job as JSON, with no state override and no message. Responds 400 outside cluster mode and 404 when no record matches.
+fn handleTrainingStatus(alloc: std.mem.Allocator, app_name: []const u8, job_name: []const u8, ctx: RouteContext) Response {
+    const node = ctx.cluster orelse return common.badRequest("not running in cluster mode");
+    const rec = store.findTrainingJobInDb(node.stateMachineDb(), alloc, app_name, job_name) catch return common.internalError();
+    if (rec == null) return common.notFound();
+    defer rec.?.deinit(alloc);
+    // null, null: report record.state as stored and omit the "message" field
+    const body = formatTrainingRecordJson(alloc, rec.?, null, null) catch return common.internalError();
+    return .{ .status = .ok, .body = body, .allocated = true };
+}
+/// Returns the logs of one training rank as text/plain. The rank is the `rank` query parameter; a missing or unparsable value falls back to 0. Responds 404 when the container is not found or its logs cannot be read.
+fn handleTrainingLogs(
+    alloc: std.mem.Allocator,
+    app_name: []const u8,
+    job_name: []const u8,
+    request: http.Request,
+    ctx: RouteContext,
+) Response {
+    _ = ctx.cluster orelse return common.badRequest("not running in cluster mode");
+    const rank = if (common.extractQueryValue(request.query, "rank")) |rank_str|
+        std.fmt.parseInt(u32, rank_str, 10) catch 0
+    else
+        0;
+    // the container is looked up by hostname "<job_name>-rank-<rank>"; the name must fit the 128-byte buffer or the request fails with an internal error
+    var hostname_buf: [128]u8 = undefined;
+    const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ job_name, rank }) catch return common.internalError();
+    const record = store.findAppContainer(alloc, app_name, hostname) catch return common.internalError();
+    if (record == null) return common.notFound();
+    defer record.?.deinit(alloc);
+    // any readLogs failure is reported as 404
+    const logs = @import("../../../runtime/logs.zig");
+    const data = logs.readLogs(alloc, record.?.id) catch return common.notFound();
+    return .{ .status = .ok, .body = data, .allocated = true, .content_type = "text/plain" };
+}
+/// Saves a training job record in state "scheduling", submits one placement request for it, then stores "running" or "failed" and returns the record. `existing_job_id` reuses a job id instead of generating one; `gpus_override` replaces the GPU count from the snapshot.
+fn scheduleTrainingJob(
+    alloc: std.mem.Allocator,
+    app_name: []const u8,
+    job_name: []const u8,
+    existing_job_id: ?[]const u8,
+    gpus_override: ?u32,
+    ctx: RouteContext,
+) Response {
+    const node = ctx.cluster orelse return common.badRequest("not running in cluster mode");
+    const latest = store.getLatestDeploymentByAppInDb(node.stateMachineDb(), alloc, app_name) catch |err| return switch (err) {
+        error.NotFound => common.notFound(),
+        else => common.internalError(),
+    };
+    defer latest.deinit(alloc);
+    // the job definition is read back out of the stored config snapshot JSON
+    const job = app_snapshot.findTrainingJobSpec(alloc, latest.config_snapshot, job_name) catch return common.internalError();
+    if (job == null) return common.notFound();
+    defer job.?.deinit(alloc);
+    // job_id is always an owned copy here, so the single free below covers both branches
+    const job_id = if (existing_job_id) |id|
+        alloc.dupe(u8, id) catch return common.internalError()
+    else
+        generateClusterTrainingJobId(alloc, app_name, job_name) catch return common.internalError();
+    defer alloc.free(job_id);
+    // carry restart_count (and created_at, when an id is reused) over from any record already stored for this app/job
+    const now = std.time.timestamp();
+    const existing = store.findTrainingJobInDb(node.stateMachineDb(), alloc, app_name, job_name) catch return common.internalError();
+    defer if (existing) |rec| rec.deinit(alloc);
+    const restarts = if (existing) |rec| rec.restart_count else 0;
+    // NOTE(review): if placement later returns an error, the record stays in "scheduling"
+    store.saveTrainingJobInDb(node.stateMachineDb(), .{
+        .id = job_id,
+        .name = job_name,
+        .app_name = app_name,
+        .state = "scheduling",
+        .image = job.?.image,
+        .gpus = if (gpus_override) |gpus| @intCast(gpus) else @intCast(job.?.gpus),
+        .checkpoint_path = job.?.checkpoint_path,
+        .checkpoint_interval = null,
+        .checkpoint_keep = null,
+        .restart_count = restarts,
+        .created_at = if (existing_job_id == null) now else if (existing) |rec| rec.created_at else now,
+
.updated_at = now,
+    }) catch return common.internalError();
+    // one gang request: world size equals the GPU count, one GPU per rank
+    const outcome = runPlacementRequests(alloc, node, &[_]scheduler.PlacementRequest{.{
+        .image = job.?.image,
+        .command = job.?.command,
+        .cpu_limit = job.?.cpu_limit,
+        .memory_limit_mb = job.?.memory_limit_mb,
+        .gpu_limit = if (gpus_override) |gpus| gpus else job.?.gpus,
+        .gpu_model = job.?.gpu_type,
+        .gang_world_size = if (gpus_override) |gpus| gpus else job.?.gpus,
+        .gpus_per_rank = 1,
+    }}) catch |err| return switch (err) {
+        error.NotLeader => common.notLeader(alloc, node),
+        else => common.internalError(),
+    };
+    // "running" requires at least one placement and no failures; everything else is "failed"
+    const final_state = if (outcome.failed == 0 and outcome.placed > 0) "running" else "failed";
+    store.updateTrainingJobStateInDb(node.stateMachineDb(), job_id, final_state, std.time.timestamp()) catch return common.internalError();
+    // re-read the record so the response carries the stored values
+    const rec = store.getTrainingJobInDb(node.stateMachineDb(), alloc, job_id) catch return common.internalError();
+    defer rec.deinit(alloc);
+
+    return formatTrainingRecordResponse(
+        alloc,
+        rec,
+        final_state,
+        if (std.mem.eql(u8, final_state, "running")) "training job scheduled" else "training job scheduling failed",
+    );
+}
+/// Applies `requests` through deploy_routes.ClusterApplyBackend against the currently listed agents. Returns InternalError when the request copy or agent listing fails, or when no agents are listed.
+fn runPlacementRequests(
+    alloc: std.mem.Allocator,
+    node: *cluster_node.Node,
+    requests: []const scheduler.PlacementRequest,
+) deploy_routes.ClusterApplyError!apply_release.ApplyOutcome {
+    const owned_requests = alloc.dupe(scheduler.PlacementRequest, requests) catch return deploy_routes.ClusterApplyError.InternalError;
+    defer alloc.free(owned_requests);
+    const agents = agent_registry.listAgents(alloc, node.stateMachineDb()) catch return deploy_routes.ClusterApplyError.InternalError;
+    defer {
+        for (agents) |a| a.deinit(alloc);
+        alloc.free(agents);
+    }
+    if (agents.len == 0) return error.InternalError;
+    // the backend borrows the copied requests and the agent list; both are released by the defers above after apply() returns
+    var backend = deploy_routes.ClusterApplyBackend{
+        .alloc = alloc,
+        .node = node,
+        .requests = owned_requests,
+        .agents = agents,
+    };
+    return backend.apply();
+}
+/// Builds the job id "cluster-<app_name>-<job_name>-<std.time.timestamp()>". The caller frees the returned slice.
+fn generateClusterTrainingJobId(alloc: std.mem.Allocator, app_name: []const u8, job_name: []const u8) ![]u8 {
+    return std.fmt.allocPrint(alloc, "cluster-{s}-{s}-{d}", .{ app_name, job_name, std.time.timestamp() });
+}
+/// Wraps formatTrainingRecordJson in a 200 response; a formatting failure becomes an internal-error response.
+fn formatTrainingRecordResponse(
+    alloc: std.mem.Allocator,
+    record: store.TrainingJobRecord,
+    state: []const u8,
+    message: []const u8,
+) Response {
+    const body = formatTrainingRecordJson(alloc, record, state, message) catch return common.internalError();
+    return .{ .status = .ok, .body = body, .allocated = true };
+}
+/// Serializes `record` as a JSON object. `state_override` replaces record.state when set, and a "message" field is appended only when `message` is non-null. The caller owns the returned slice.
+fn formatTrainingRecordJson(
+    alloc: std.mem.Allocator,
+    record: store.TrainingJobRecord,
+    state_override: ?[]const u8,
+    message: ?[]const u8,
+) ![]u8 {
+    var json_buf: std.ArrayList(u8) = .empty;
+    errdefer json_buf.deinit(alloc);
+    const writer = json_buf.writer(alloc);
+    // string fields go through the json_helpers writers; the numeric fields are printed directly
+    try writer.writeByte('{');
+    try json_helpers.writeJsonStringField(writer, "app_name", record.app_name);
+    try writer.writeByte(',');
+    try json_helpers.writeJsonStringField(writer, "training_job", record.name);
+    try writer.writeByte(',');
+    try json_helpers.writeJsonStringField(writer, "job_id", record.id);
+    try writer.writeByte(',');
+    try json_helpers.writeJsonStringField(writer, "state", state_override orelse record.state);
+    try writer.print(",\"gpus\":{d},\"restart_count\":{d}", .{ record.gpus, record.restart_count });
+    try writer.writeByte(',');
+    try json_helpers.writeNullableJsonStringField(writer, "checkpoint_path", record.checkpoint_path);
+    try writer.print(",\"updated_at\":{d}", .{record.updated_at});
+    if (message) |msg| {
+        try writer.writeByte(',');
+        try json_helpers.writeJsonStringField(writer, "message", msg);
+    }
+    try writer.writeByte('}');
+    return json_buf.toOwnedSlice(alloc);
+}
+/// Test helper: builds a request with the given method and path, an empty query, empty headers and an empty body.
+fn testRequest(method: http.Method, path: []const u8) http.Request {
+    return .{
+        .method = method,
+        .path = path,
+        .path_only = path,
+        .query = "",
+        .headers_raw = "",
+        .body = "",
+        .content_length = 0,
+    };
+}
+
+test "route rejects worker run without cluster" {
+    const
ctx: RouteContext = .{ .cluster = null, .join_token = null }; + const req = testRequest(.POST, "/apps/demo-app/workers/migrate/run"); + const resp = route(req, std.testing.allocator, ctx).?; + try std.testing.expectEqual(http.StatusCode.bad_request, resp.status); +} + +test "route rejects training status without cluster" { + const ctx: RouteContext = .{ .cluster = null, .join_token = null }; + const req = testRequest(.GET, "/apps/demo-app/training/finetune/status"); + const resp = route(req, std.testing.allocator, ctx).?; + try std.testing.expectEqual(http.StatusCode.bad_request, resp.status); +} diff --git a/src/lib/command_registry.zig b/src/lib/command_registry.zig index 14564842..154a71e9 100644 --- a/src/lib/command_registry.zig +++ b/src/lib/command_registry.zig @@ -60,10 +60,10 @@ pub const command_specs = [_]CommandSpec{ .{ .name = "validate", .group = .build_manifest, .usage = "validate [-f manifest.toml] [-q]", .description = "validate a manifest file", .handler = manifest_cmds.validate }, .{ .name = "up", .group = .build_manifest, .usage = "up [-f manifest.toml] [--dev] [--server host:port] [service...]", .description = "start services from a manifest", .handler = manifest_cmds.up }, .{ .name = "down", .group = .build_manifest, .usage = "down [-f manifest.toml]", .description = "stop all services from manifest", .handler = manifest_cmds.down }, - .{ .name = "run-worker", .group = .build_manifest, .usage = "run-worker [-f manifest.toml] ", .description = "run a one-shot worker task", .handler = manifest_cmds.runWorker }, + .{ .name = "run-worker", .group = .build_manifest, .usage = "run-worker [-f manifest.toml] [--server host:port] ", .description = "run a one-shot worker task", .handler = manifest_cmds.runWorker }, .{ .name = "rollback", .group = .build_manifest, .usage = "rollback | --app [name] [--server h:p --release id]", .description = "rollback a service or app release", .handler = manifest_cmds.rollback }, .{ .name = "history", .group = 
.build_manifest, .usage = "history | --app [name] [--server h:p] [--json]", .description = "show service or app release history", .handler = manifest_cmds.history }, - .{ .name = "train", .group = .build_manifest, .usage = "train ", .description = "manage training jobs", .handler = manifest_cmds.train }, + .{ .name = "train", .group = .build_manifest, .usage = "train [--server host:port] ", .description = "manage training jobs", .handler = manifest_cmds.train }, .{ .name = "serve", .group = .cluster, .usage = "serve [--port PORT] [--http-proxy-bind ADDR] [--http-proxy-port PORT]", .description = "start the API server (default: 7700)", .handler = cluster_cmds.serve }, .{ .name = "init-server", .group = .cluster, .usage = "init-server [opts]", .description = "start a cluster server node", .handler = cluster_cmds.initServer }, diff --git a/src/lib/completion.zig b/src/lib/completion.zig index 33493300..bfd70f0c 100644 --- a/src/lib/completion.zig +++ b/src/lib/completion.zig @@ -62,16 +62,17 @@ const command_meta = [_]CommandMeta{ .{ .name = "validate", .flags = &.{ "-f", "-q", "--quiet" } }, .{ .name = "up", .flags = &.{ "-f", "--dev", "--server" } }, .{ .name = "down", .flags = &.{"-f"} }, - .{ .name = "run-worker", .flags = &.{"-f"} }, + .{ .name = "run-worker", .flags = &.{ "-f", "--server" } }, .{ .name = "rollback", .flags = &.{ "--app", "--server", "--release" } }, .{ .name = "history", .flags = &.{ "--app", "--server", "--json" } }, .{ .name = "train", .flags = &.{ "-f", "--server", "--rank" }, .subcommands = &.{ .{ .name = "start", .flags = &.{ "-f", "--server" } }, - .{ .name = "status", .flags = &.{"-f"} }, - .{ .name = "stop" }, - .{ .name = "pause" }, - .{ .name = "resume" }, - .{ .name = "logs", .flags = &.{"--rank"} }, + .{ .name = "status", .flags = &.{ "-f", "--server" } }, + .{ .name = "stop", .flags = &.{ "-f", "--server" } }, + .{ .name = "pause", .flags = &.{ "-f", "--server" } }, + .{ .name = "resume", .flags = &.{ "-f", "--server" } }, + .{ 
.name = "scale", .flags = &.{ "--gpus", "--server" } }, + .{ .name = "logs", .flags = &.{ "--rank", "--server" } }, } }, // cluster diff --git a/src/lib/json_helpers.zig b/src/lib/json_helpers.zig index cc457479..b87535e5 100644 --- a/src/lib/json_helpers.zig +++ b/src/lib/json_helpers.zig @@ -164,6 +164,49 @@ pub fn extractJsonArray(json: []const u8, key: []const u8) ?[]const u8 { return null; } +/// extract a top-level object value from a JSON object: {"key":{...}} +pub fn extractJsonObject(json: []const u8, key: []const u8) ?[]const u8 { + var search_buf: [128]u8 = undefined; + const needle = std.fmt.bufPrint(&search_buf, "\"{s}\":{{", .{key}) catch return null; + const start_pos = std.mem.indexOf(u8, json, needle) orelse return null; + + const object_start = start_pos + needle.len - 1; + var pos = object_start; + var depth: usize = 0; + var in_string = false; + var escape = false; + + while (pos < json.len) : (pos += 1) { + const c = json[pos]; + + if (escape) { + escape = false; + continue; + } + + if (c == '\\' and in_string) { + escape = true; + continue; + } + + if (c == '"') { + in_string = !in_string; + continue; + } + + if (in_string) continue; + + if (c == '{') { + depth += 1; + } else if (c == '}') { + depth -= 1; + if (depth == 0) return json[object_start .. pos + 1]; + } + } + + return null; +} + // -- JSON array iteration -- // iterate over top-level objects in a JSON array like [{...},{...}]. // returns slices into the original buffer — no allocation needed. 
diff --git a/src/manifest/app_snapshot.zig b/src/manifest/app_snapshot.zig
new file mode 100644
index 00000000..a8941d4b
--- /dev/null
+++ b/src/manifest/app_snapshot.zig
@@ -0,0 +1,213 @@
+const std = @import("std");
+const json_helpers = @import("../lib/json_helpers.zig");
+/// Number of entries found in each workload array of an app snapshot JSON document.
+pub const Summary = struct {
+    service_count: usize = 0,
+    worker_count: usize = 0,
+    cron_count: usize = 0,
+    training_job_count: usize = 0,
+    /// True when at least one of the four counts is non-zero.
+    pub fn hasAny(self: Summary) bool {
+        return self.service_count + self.worker_count + self.cron_count + self.training_job_count > 0;
+    }
+};
+/// Fields of one worker entry needed to schedule a run. Only `command` is allocated; the other slices are taken from the JSON passed to findWorkerRunSpec (or from its `name` argument).
+pub const WorkerRunSpec = struct {
+    name: []const u8,
+    image: []const u8,
+    command: []const u8,
+    required_labels: []const u8 = "",
+    gpu_limit: i64 = 0,
+    gpu_model: ?[]const u8 = null,
+    gpu_vram_min_mb: ?u64 = null,
+    /// Frees `command`, the only field this struct owns.
+    pub fn deinit(self: WorkerRunSpec, alloc: std.mem.Allocator) void {
+        alloc.free(self.command);
+    }
+};
+/// Fields of one training job entry needed to schedule it. Only `command` is allocated; the other slices are taken from the JSON passed to findTrainingJobSpec (or from its `name` argument).
+pub const TrainingJobSpec = struct {
+    name: []const u8,
+    image: []const u8,
+    command: []const u8,
+    gpus: u32,
+    gpu_type: ?[]const u8,
+    cpu_limit: i64,
+    memory_limit_mb: i64,
+    checkpoint_path: ?[]const u8,
+    /// Frees `command`, the only field this struct owns.
+    pub fn deinit(self: TrainingJobSpec, alloc: std.mem.Allocator) void {
+        alloc.free(self.command);
+    }
+};
+/// Counts the objects in the "services", "workers", "crons" and "training_jobs" arrays of `json`; a missing array counts as 0.
+pub fn summarize(json: []const u8) Summary {
+    return .{
+        .service_count = countArrayObjects(json, "services"),
+        .worker_count = countArrayObjects(json, "workers"),
+        .cron_count = countArrayObjects(json, "crons"),
+        .training_job_count = countArrayObjects(json, "training_jobs"),
+    };
+}
+/// Finds the entry of the "workers" array whose "name" equals `name`. Returns null when there is no such entry or it has no "image". The caller must call deinit on the result.
+pub fn findWorkerRunSpec(alloc: std.mem.Allocator, json: []const u8, name: []const u8) !?WorkerRunSpec {
+    const obj = findNamedObject(json, "workers", name) orelse return null;
+    // image is checked before command is allocated, so the null return cannot leak
+    const image = json_helpers.extractJsonString(obj, "image") orelse return null;
+    const command = try extractCommandString(alloc, obj);
+    errdefer alloc.free(command);
+    // GPU fields keep their defaults unless the entry has a "gpu" object
+    var gpu_limit: i64 = 0;
+    var gpu_model: ?[]const u8 = null;
+    var gpu_vram_min_mb: ?u64 = null;
+    if (json_helpers.extractJsonObject(obj, "gpu")) |gpu| {
+        gpu_limit = json_helpers.extractJsonInt(gpu, "count") orelse 0;
+        gpu_model = json_helpers.extractJsonString(gpu, "model");
+        if (json_helpers.extractJsonInt(gpu, "vram_min_mb")) |v| {
+            gpu_vram_min_mb = @intCast(@max(@as(i64, 0), v)); // negative values are clamped to 0 before the cast
+        }
+    }
+
+    return .{
+        .name = name,
+        .image = image,
+        .command = command,
+        .required_labels = json_helpers.extractJsonString(obj, "required_labels") orelse "",
+        .gpu_limit = gpu_limit,
+        .gpu_model = gpu_model,
+        .gpu_vram_min_mb = gpu_vram_min_mb,
+    };
+}
+/// Finds the entry of the "training_jobs" array whose "name" equals `name`. Returns null when there is no such entry or it has no "image". The caller must call deinit on the result.
+pub fn findTrainingJobSpec(alloc: std.mem.Allocator, json: []const u8, name: []const u8) !?TrainingJobSpec {
+    const obj = findNamedObject(json, "training_jobs", name) orelse return null;
+    // image is checked before command is allocated, so the null return cannot leak
+    const image = json_helpers.extractJsonString(obj, "image") orelse return null;
+    const command = try extractCommandString(alloc, obj);
+    errdefer alloc.free(command);
+    // checkpoint_path is the "path" of the optional "checkpoint" object
+    const checkpoint_path = if (json_helpers.extractJsonObject(obj, "checkpoint")) |checkpoint|
+        json_helpers.extractJsonString(checkpoint, "path")
+    else
+        null;
+    // defaults when a key is absent: gpus 0, cpu_limit 1000, memory_limit_mb 65536
+    return .{
+        .name = name,
+        .image = image,
+        .command = command,
+        .gpus = @intCast(@max(@as(i64, 0), json_helpers.extractJsonInt(obj, "gpus") orelse 0)),
+        .gpu_type = json_helpers.extractJsonString(obj, "gpu_type"),
+        .cpu_limit = json_helpers.extractJsonInt(obj, "cpu_limit") orelse 1000,
+        .memory_limit_mb = json_helpers.extractJsonInt(obj, "memory_limit_mb") orelse 65536,
+        .checkpoint_path = checkpoint_path,
+    };
+}
+/// Counts the objects in the array stored under `key`; returns 0 when the array is not found.
+fn countArrayObjects(json: []const u8, key: []const u8) usize {
+    const array = json_helpers.extractJsonArray(json, key) orelse return 0;
+    var count: usize = 0;
+    var iter = json_helpers.extractJsonObjects(array);
+    while (iter.next() != null) count += 1;
+    return count;
+}
+/// Returns the first object in the array under `key` whose "name" string equals `name`, or null. Objects without a "name" are skipped.
+fn findNamedObject(json: []const u8, key: []const u8, name: []const u8) ?[]const u8 {
+    const array = json_helpers.extractJsonArray(json, key) orelse return null;
+    var iter = json_helpers.extractJsonObjects(array);
+    while (iter.next()) |obj| {
+        const obj_name = json_helpers.extractJsonString(obj, "name") orelse continue;
+        if (std.mem.eql(u8, obj_name, name)) return obj;
+    }
+    return null;
+}
+/// Joins the string elements of the JSON array under `key` with single spaces. Returns null when the array is not found, error.InvalidRequest on a non-string element or an unterminated string. The caller owns the result.
+fn extractJsonStringArray(alloc: std.mem.Allocator, json: []const u8, key: []const u8) !?[]u8 {
+    const array_json = json_helpers.extractJsonArray(json, key) orelse return null;
+    if (array_json.len < 2) return null;
+    // scanning runs from index 1 to len - 2, i.e. between the first and last byte of the extracted array text
+    var out: std.ArrayList(u8) = .empty;
+    errdefer out.deinit(alloc);
+
+    var pos: usize = 1;
+    var first = true;
+    while (pos < array_json.len - 1) {
+        while (pos < array_json.len - 1 and (array_json[pos] == ' ' or array_json[pos] == '\n' or array_json[pos] == '\r' or array_json[pos] == '\t' or array_json[pos] == ',')) : (pos += 1) {}
+        if (pos >= array_json.len - 1) break;
+        if (array_json[pos] != '"') return error.InvalidRequest;
+        pos += 1;
+        const start = pos;
+        // find the closing quote; a backslash skips the following byte, and the escape sequence is copied as-is, not decoded
+        while (pos < array_json.len - 1) : (pos += 1) {
+            if (array_json[pos] == '\\') {
+                pos += 1;
+                if (pos >= array_json.len - 1) return error.InvalidRequest;
+                continue;
+            }
+            if (array_json[pos] == '"') break;
+        }
+        if (pos >= array_json.len - 1) return error.InvalidRequest;
+        // elements are separated by one space, so an element that itself contains spaces is not distinguishable afterwards
+        if (!first) try out.append(alloc, ' ');
+        first = false;
+        try out.appendSlice(alloc, array_json[start..pos]);
+        pos += 1;
+    }
+
+    return try out.toOwnedSlice(alloc);
+}
+/// Returns the entry's "command" as one allocated string: a copy of the string form, else the space-joined array form, else an empty string. The caller frees the result.
+fn extractCommandString(alloc: std.mem.Allocator, obj: []const u8) ![]const u8 {
+    if (json_helpers.extractJsonString(obj, "command")) |command| {
+        return alloc.dupe(u8, command);
+    }
+    if (try extractJsonStringArray(alloc, obj, "command")) |joined| {
+        return joined;
+    }
+    return alloc.dupe(u8, "");
+}
+
+test "summarize counts all workload kinds" {
+    const summary = summarize(
+        \\{"app_name":"demo","services":[{"name":"web"}],"workers":[{"name":"migrate"}],"crons":[{"name":"cleanup"}],"training_jobs":[{"name":"train"}]}
+    );
+
+    try std.testing.expectEqual(@as(usize, 1), summary.service_count);
+    try
std.testing.expectEqual(@as(usize, 1), summary.worker_count); + try std.testing.expectEqual(@as(usize, 1), summary.cron_count); + try std.testing.expectEqual(@as(usize, 1), summary.training_job_count); +} + +test "findWorkerRunSpec extracts worker scheduler fields" { + const alloc = std.testing.allocator; + const json = + \\{"app_name":"demo","workers":[{"name":"migrate","image":"alpine","command":["sh","-c","migrate"],"required_labels":"gpu=true","gpu":{"count":1,"model":"L4","vram_min_mb":20480}}]} + ; + + const worker = (try findWorkerRunSpec(alloc, json, "migrate")).?; + defer worker.deinit(alloc); + + try std.testing.expectEqualStrings("alpine", worker.image); + try std.testing.expectEqualStrings("sh -c migrate", worker.command); + try std.testing.expectEqualStrings("gpu=true", worker.required_labels); + try std.testing.expectEqual(@as(i64, 1), worker.gpu_limit); + try std.testing.expectEqualStrings("L4", worker.gpu_model.?); + try std.testing.expectEqual(@as(u64, 20480), worker.gpu_vram_min_mb.?); +} + +test "findTrainingJobSpec extracts training scheduler fields" { + const alloc = std.testing.allocator; + const json = + \\{"app_name":"demo","training_jobs":[{"name":"finetune","image":"trainer:v1","command":["torchrun","train.py"],"gpus":4,"gpu_type":"H100","cpu_limit":2000,"memory_limit_mb":131072,"checkpoint":{"path":"/ckpt","interval_secs":1800,"keep":3}}]} + ; + + const job = (try findTrainingJobSpec(alloc, json, "finetune")).?; + defer job.deinit(alloc); + + try std.testing.expectEqualStrings("trainer:v1", job.image); + try std.testing.expectEqualStrings("torchrun train.py", job.command); + try std.testing.expectEqual(@as(u32, 4), job.gpus); + try std.testing.expectEqualStrings("H100", job.gpu_type.?); + try std.testing.expectEqual(@as(i64, 2000), job.cpu_limit); + try std.testing.expectEqual(@as(i64, 131072), job.memory_limit_mb); + try std.testing.expectEqualStrings("/ckpt", job.checkpoint_path.?); +} diff --git a/src/manifest/app_spec.zig 
b/src/manifest/app_spec.zig index 512a5d92..6b680b9e 100644 --- a/src/manifest/app_spec.zig +++ b/src/manifest/app_spec.zig @@ -23,14 +23,69 @@ pub const ApplicationServiceSpec = struct { required_labels: []const u8 = "", }; +pub const ApplicationWorkerSpec = struct { + name: []const u8, + image: []const u8, + command: []const []const u8, + env: []const []const u8, + depends_on: []const []const u8, + working_dir: ?[]const u8, + volumes: []const spec.VolumeMount, + gpu: ?spec.GpuSpec, + gpu_mesh: ?spec.GpuMeshSpec, + required_labels: []const u8 = "", +}; + +pub const ApplicationCronSpec = struct { + name: []const u8, + image: []const u8, + command: []const []const u8, + env: []const []const u8, + working_dir: ?[]const u8, + volumes: []const spec.VolumeMount, + every: u64, +}; + +pub const ApplicationTrainingJobSpec = struct { + name: []const u8, + image: []const u8, + command: []const []const u8, + env: []const []const u8, + working_dir: ?[]const u8, + volumes: []const spec.VolumeMount, + gpus: u32, + gpu_type: ?[]const u8, + data: ?spec.DataSpec, + checkpoint: ?spec.CheckpointSpec, + resources: spec.TrainingResourceSpec, + fault_tolerance: spec.FaultToleranceSpec, +}; + +pub const WorkloadCounts = struct { + services: usize = 0, + workers: usize = 0, + crons: usize = 0, + training_jobs: usize = 0, + + pub fn hasAny(self: WorkloadCounts) bool { + return self.services + self.workers + self.crons + self.training_jobs > 0; + } +}; + pub const ApplicationSpec = struct { app_name: []const u8, services: []const ApplicationServiceSpec, + workers: []const ApplicationWorkerSpec, + crons: []const ApplicationCronSpec, + training_jobs: []const ApplicationTrainingJobSpec, alloc: std.mem.Allocator, pub fn deinit(self: *ApplicationSpec) void { self.alloc.free(self.app_name); self.alloc.free(self.services); + self.alloc.free(self.workers); + self.alloc.free(self.crons); + self.alloc.free(self.training_jobs); } pub fn serviceByName(self: *const ApplicationSpec, name: []const u8) 
?*const ApplicationServiceSpec { @@ -40,6 +95,29 @@ pub const ApplicationSpec = struct { return null; } + pub fn workerByName(self: *const ApplicationSpec, name: []const u8) ?*const ApplicationWorkerSpec { + for (self.workers) |*worker| { + if (std.mem.eql(u8, worker.name, name)) return worker; + } + return null; + } + + pub fn trainingJobByName(self: *const ApplicationSpec, name: []const u8) ?*const ApplicationTrainingJobSpec { + for (self.training_jobs) |*job| { + if (std.mem.eql(u8, job.name, name)) return job; + } + return null; + } + + pub fn workloadCounts(self: *const ApplicationSpec) WorkloadCounts { + return .{ + .services = self.services.len, + .workers = self.workers.len, + .crons = self.crons.len, + .training_jobs = self.training_jobs.len, + }; + } + pub fn selectServices(self: *const ApplicationSpec, alloc: std.mem.Allocator, targets: []const []const u8) !ApplicationSpec { if (targets.len == 0) return self.clone(alloc); @@ -79,9 +157,19 @@ pub const ApplicationSpec = struct { out_idx += 1; } + const workers = try alloc.dupe(ApplicationWorkerSpec, self.workers); + errdefer alloc.free(workers); + const crons = try alloc.dupe(ApplicationCronSpec, self.crons); + errdefer alloc.free(crons); + const training_jobs = try alloc.dupe(ApplicationTrainingJobSpec, self.training_jobs); + errdefer alloc.free(training_jobs); + return .{ .app_name = try alloc.dupe(u8, self.app_name), .services = services, + .workers = workers, + .crons = crons, + .training_jobs = training_jobs, .alloc = alloc, }; } @@ -89,10 +177,19 @@ pub const ApplicationSpec = struct { pub fn clone(self: *const ApplicationSpec, alloc: std.mem.Allocator) !ApplicationSpec { const services = try alloc.dupe(ApplicationServiceSpec, self.services); errdefer alloc.free(services); + const workers = try alloc.dupe(ApplicationWorkerSpec, self.workers); + errdefer alloc.free(workers); + const crons = try alloc.dupe(ApplicationCronSpec, self.crons); + errdefer alloc.free(crons); + const training_jobs = try 
alloc.dupe(ApplicationTrainingJobSpec, self.training_jobs); + errdefer alloc.free(training_jobs); return .{ .app_name = try alloc.dupe(u8, self.app_name), .services = services, + .workers = workers, + .crons = crons, + .training_jobs = training_jobs, .alloc = alloc, }; } @@ -162,65 +259,25 @@ pub const ApplicationSpec = struct { for (self.services, 0..) |svc, i| { if (i > 0) try writer.writeByte(','); - try writer.writeAll("{\"name\":\""); - try json_helpers.writeJsonEscaped(writer, svc.name); - try writer.writeAll("\",\"image\":\""); - try json_helpers.writeJsonEscaped(writer, svc.image); - try writer.writeAll("\",\"command\":"); - try writeJsonStringArray(writer, svc.command); - try writer.writeAll(",\"ports\":"); - try writeJsonPorts(writer, svc.ports); - try writer.writeAll(",\"env\":"); - try writeJsonStringArray(writer, svc.env); - try writer.writeAll(",\"depends_on\":"); - try writeJsonStringArray(writer, svc.depends_on); - try writer.print(",\"cpu_limit\":{d},\"memory_limit_mb\":{d}", .{ - svc.cpu_limit, - svc.memory_limit_mb, - }); - - if (svc.working_dir) |working_dir| { - try writer.writeAll(",\"working_dir\":\""); - try json_helpers.writeJsonEscaped(writer, working_dir); - try writer.writeByte('"'); - } - - try writer.writeAll(",\"volumes\":"); - try writeJsonVolumes(writer, svc.volumes); - try writer.writeAll(",\"restart\":\""); - try writer.writeAll(restartPolicyString(svc.restart)); - try writer.writeByte('"'); - - if (svc.health_check) |health_check| { - try writer.writeAll(",\"health_check\":"); - try writeJsonHealthCheck(writer, health_check); - } - - if (svc.tls) |tls| { - try writer.writeAll(",\"tls\":"); - try writeJsonTls(writer, tls); - } - - try writer.writeAll(",\"http_routes\":"); - try writeJsonHttpRoutes(writer, svc.http_routes); - - if (svc.gpu) |gpu| { - try writer.writeAll(",\"gpu\":"); - try writeJsonGpu(writer, gpu); - } + try writeJsonService(writer, svc); + } - if (svc.gpu_mesh) |mesh| { - try writer.writeAll(",\"gpu_mesh\":"); - 
try writeJsonGpuMesh(writer, mesh); - } + try writer.writeAll("],\"workers\":["); + for (self.workers, 0..) |worker, i| { + if (i > 0) try writer.writeByte(','); + try writeJsonWorker(writer, worker); + } - if (svc.required_labels.len > 0) { - try writer.writeAll(",\"required_labels\":\""); - try json_helpers.writeJsonEscaped(writer, svc.required_labels); - try writer.writeByte('"'); - } + try writer.writeAll("],\"crons\":["); + for (self.crons, 0..) |cron, i| { + if (i > 0) try writer.writeByte(','); + try writeJsonCron(writer, cron); + } - try writer.writeByte('}'); + try writer.writeAll("],\"training_jobs\":["); + for (self.training_jobs, 0..) |job, i| { + if (i > 0) try writer.writeByte(','); + try writeJsonTrainingJob(writer, job); } try writer.writeAll("]}"); @@ -231,6 +288,12 @@ pub const ApplicationSpec = struct { pub fn fromManifest(alloc: std.mem.Allocator, app_name: []const u8, manifest: *const spec.Manifest) !ApplicationSpec { const services = try alloc.alloc(ApplicationServiceSpec, manifest.services.len); errdefer alloc.free(services); + const workers = try alloc.alloc(ApplicationWorkerSpec, manifest.workers.len); + errdefer alloc.free(workers); + const crons = try alloc.alloc(ApplicationCronSpec, manifest.crons.len); + errdefer alloc.free(crons); + const training_jobs = try alloc.alloc(ApplicationTrainingJobSpec, manifest.training_jobs.len); + errdefer alloc.free(training_jobs); for (manifest.services, 0..) |svc, i| { services[i] = .{ @@ -251,9 +314,55 @@ pub fn fromManifest(alloc: std.mem.Allocator, app_name: []const u8, manifest: *c }; } + for (manifest.workers, 0..) |worker, i| { + workers[i] = .{ + .name = worker.name, + .image = worker.image, + .command = worker.command, + .env = worker.env, + .depends_on = worker.depends_on, + .working_dir = worker.working_dir, + .volumes = worker.volumes, + .gpu = worker.gpu, + .gpu_mesh = worker.gpu_mesh, + }; + } + + for (manifest.crons, 0..) 
|cron, i| { + crons[i] = .{ + .name = cron.name, + .image = cron.image, + .command = cron.command, + .env = cron.env, + .working_dir = cron.working_dir, + .volumes = cron.volumes, + .every = cron.every, + }; + } + + for (manifest.training_jobs, 0..) |job, i| { + training_jobs[i] = .{ + .name = job.name, + .image = job.image, + .command = job.command, + .env = job.env, + .working_dir = job.working_dir, + .volumes = job.volumes, + .gpus = job.gpus, + .gpu_type = job.gpu_type, + .data = job.data, + .checkpoint = job.checkpoint, + .resources = job.resources, + .fault_tolerance = job.fault_tolerance, + }; + } + return .{ .app_name = try alloc.dupe(u8, app_name), .services = services, + .workers = workers, + .crons = crons, + .training_jobs = training_jobs, .alloc = alloc, }; } @@ -303,6 +412,165 @@ fn writeJsonVolumes(writer: anytype, volumes: []const spec.VolumeMount) !void { try writer.writeByte(']'); } +fn writeJsonService(writer: anytype, svc: ApplicationServiceSpec) !void { + try writer.writeAll("{\"name\":\""); + try json_helpers.writeJsonEscaped(writer, svc.name); + try writer.writeAll("\",\"image\":\""); + try json_helpers.writeJsonEscaped(writer, svc.image); + try writer.writeAll("\",\"command\":"); + try writeJsonStringArray(writer, svc.command); + try writer.writeAll(",\"ports\":"); + try writeJsonPorts(writer, svc.ports); + try writer.writeAll(",\"env\":"); + try writeJsonStringArray(writer, svc.env); + try writer.writeAll(",\"depends_on\":"); + try writeJsonStringArray(writer, svc.depends_on); + try writer.print(",\"cpu_limit\":{d},\"memory_limit_mb\":{d}", .{ + svc.cpu_limit, + svc.memory_limit_mb, + }); + + if (svc.working_dir) |working_dir| { + try writer.writeAll(",\"working_dir\":\""); + try json_helpers.writeJsonEscaped(writer, working_dir); + try writer.writeByte('"'); + } + + try writer.writeAll(",\"volumes\":"); + try writeJsonVolumes(writer, svc.volumes); + try writer.writeAll(",\"restart\":\""); + try 
writer.writeAll(restartPolicyString(svc.restart)); + try writer.writeByte('"'); + + if (svc.health_check) |health_check| { + try writer.writeAll(",\"health_check\":"); + try writeJsonHealthCheck(writer, health_check); + } + + if (svc.tls) |tls| { + try writer.writeAll(",\"tls\":"); + try writeJsonTls(writer, tls); + } + + try writer.writeAll(",\"http_routes\":"); + try writeJsonHttpRoutes(writer, svc.http_routes); + + if (svc.gpu) |gpu| { + try writer.writeAll(",\"gpu\":"); + try writeJsonGpu(writer, gpu); + } + + if (svc.gpu_mesh) |mesh| { + try writer.writeAll(",\"gpu_mesh\":"); + try writeJsonGpuMesh(writer, mesh); + } + + if (svc.required_labels.len > 0) { + try writer.writeAll(",\"required_labels\":\""); + try json_helpers.writeJsonEscaped(writer, svc.required_labels); + try writer.writeByte('"'); + } + + try writer.writeByte('}'); +} + +fn writeJsonWorker(writer: anytype, worker: ApplicationWorkerSpec) !void { + try writer.writeAll("{\"name\":\""); + try json_helpers.writeJsonEscaped(writer, worker.name); + try writer.writeAll("\",\"image\":\""); + try json_helpers.writeJsonEscaped(writer, worker.image); + try writer.writeAll("\",\"command\":"); + try writeJsonStringArray(writer, worker.command); + try writer.writeAll(",\"env\":"); + try writeJsonStringArray(writer, worker.env); + try writer.writeAll(",\"depends_on\":"); + try writeJsonStringArray(writer, worker.depends_on); + if (worker.working_dir) |working_dir| { + try writer.writeAll(",\"working_dir\":\""); + try json_helpers.writeJsonEscaped(writer, working_dir); + try writer.writeByte('"'); + } + try writer.writeAll(",\"volumes\":"); + try writeJsonVolumes(writer, worker.volumes); + if (worker.gpu) |gpu| { + try writer.writeAll(",\"gpu\":"); + try writeJsonGpu(writer, gpu); + } + if (worker.gpu_mesh) |mesh| { + try writer.writeAll(",\"gpu_mesh\":"); + try writeJsonGpuMesh(writer, mesh); + } + if (worker.required_labels.len > 0) { + try writer.writeAll(",\"required_labels\":\""); + try 
json_helpers.writeJsonEscaped(writer, worker.required_labels); + try writer.writeByte('"'); + } + try writer.writeByte('}'); +} + +fn writeJsonCron(writer: anytype, cron: ApplicationCronSpec) !void { + try writer.writeAll("{\"name\":\""); + try json_helpers.writeJsonEscaped(writer, cron.name); + try writer.writeAll("\",\"image\":\""); + try json_helpers.writeJsonEscaped(writer, cron.image); + try writer.writeAll("\",\"command\":"); + try writeJsonStringArray(writer, cron.command); + try writer.writeAll(",\"env\":"); + try writeJsonStringArray(writer, cron.env); + if (cron.working_dir) |working_dir| { + try writer.writeAll(",\"working_dir\":\""); + try json_helpers.writeJsonEscaped(writer, working_dir); + try writer.writeByte('"'); + } + try writer.writeAll(",\"volumes\":"); + try writeJsonVolumes(writer, cron.volumes); + try writer.print(",\"every\":{d}", .{cron.every}); + try writer.writeByte('}'); +} + +fn writeJsonTrainingJob(writer: anytype, job: ApplicationTrainingJobSpec) !void { + try writer.writeAll("{\"name\":\""); + try json_helpers.writeJsonEscaped(writer, job.name); + try writer.writeAll("\",\"image\":\""); + try json_helpers.writeJsonEscaped(writer, job.image); + try writer.writeAll("\",\"command\":"); + try writeJsonStringArray(writer, job.command); + try writer.writeAll(",\"env\":"); + try writeJsonStringArray(writer, job.env); + if (job.working_dir) |working_dir| { + try writer.writeAll(",\"working_dir\":\""); + try json_helpers.writeJsonEscaped(writer, working_dir); + try writer.writeByte('"'); + } + try writer.writeAll(",\"volumes\":"); + try writeJsonVolumes(writer, job.volumes); + try writer.print(",\"gpus\":{d}", .{job.gpus}); + if (job.gpu_type) |gpu_type| { + try writer.writeAll(",\"gpu_type\":\""); + try json_helpers.writeJsonEscaped(writer, gpu_type); + try writer.writeByte('"'); + } + try writer.print(",\"cpu_limit\":{d},\"memory_limit_mb\":{d},\"ib_required\":{}", .{ + job.resources.cpu, + job.resources.memory_mb, + 
job.resources.ib_required, + }); + try writer.print(",\"spare_ranks\":{d},\"auto_restart\":{},\"max_restarts\":{d}", .{ + job.fault_tolerance.spare_ranks, + job.fault_tolerance.auto_restart, + job.fault_tolerance.max_restarts, + }); + if (job.data) |data| { + try writer.writeAll(",\"data\":"); + try writeJsonTrainingData(writer, data); + } + if (job.checkpoint) |checkpoint| { + try writer.writeAll(",\"checkpoint\":"); + try writeJsonCheckpoint(writer, checkpoint); + } + try writer.writeByte('}'); +} + fn writeJsonHealthCheck(writer: anytype, health_check: spec.HealthCheck) !void { try writer.writeByte('{'); switch (health_check.check_type) { @@ -442,6 +710,29 @@ fn writeJsonGpuMesh(writer: anytype, mesh: spec.GpuMeshSpec) !void { ); } +fn writeJsonTrainingData(writer: anytype, data: spec.DataSpec) !void { + try writer.writeAll("{\"dataset\":\""); + try json_helpers.writeJsonEscaped(writer, data.dataset); + try writer.writeAll("\",\"sharding\":\""); + try json_helpers.writeJsonEscaped(writer, data.sharding); + try writer.writeByte('"'); + if (data.preprocessing) |preprocessing| { + try writer.writeAll(",\"preprocessing\":\""); + try json_helpers.writeJsonEscaped(writer, preprocessing); + try writer.writeByte('"'); + } + try writer.writeByte('}'); +} + +fn writeJsonCheckpoint(writer: anytype, checkpoint: spec.CheckpointSpec) !void { + try writer.writeAll("{\"path\":\""); + try json_helpers.writeJsonEscaped(writer, checkpoint.path); + try writer.print("\",\"interval_secs\":{d},\"keep\":{d}}}", .{ + checkpoint.interval_secs, + checkpoint.keep, + }); +} + fn restartPolicyString(restart: spec.RestartPolicy) []const u8 { return switch (restart) { .none => "none", @@ -457,7 +748,7 @@ fn volumeKindString(kind: spec.VolumeMount.Kind) []const u8 { }; } -test "fromManifest builds canonical service app spec" { +test "fromManifest builds canonical workload app spec" { const alloc = std.testing.allocator; var manifest = try loader.loadFromString(alloc, @@ -481,6 +772,25 @@ test 
"fromManifest builds canonical service app spec" { \\ \\[service.db] \\image = "postgres:16" + \\ + \\[worker.migrate] + \\image = "alpine:latest" + \\command = ["sh", "-c", "migrate"] + \\depends_on = ["db"] + \\ + \\[cron.cleanup] + \\image = "busybox" + \\command = ["sh", "-c", "cleanup"] + \\every = "1h" + \\ + \\[training.finetune] + \\image = "trainer:v1" + \\command = ["torchrun", "train.py"] + \\gpus = 4 + \\ + \\[training.finetune.resources] + \\cpu = 2000 + \\memory_mb = 131072 ); defer manifest.deinit(); @@ -489,17 +799,15 @@ test "fromManifest builds canonical service app spec" { try std.testing.expectEqualStrings("demo-app", app.app_name); try std.testing.expectEqual(@as(usize, 2), app.services.len); + try std.testing.expectEqual(@as(usize, 1), app.workers.len); + try std.testing.expectEqual(@as(usize, 1), app.crons.len); + try std.testing.expectEqual(@as(usize, 1), app.training_jobs.len); try std.testing.expectEqualStrings("web", app.services[1].name); - try std.testing.expectEqualStrings("nginx:latest", app.services[1].image); - try std.testing.expectEqual(@as(usize, 3), app.services[1].command.len); - try std.testing.expectEqual(@as(usize, 1), app.services[1].ports.len); - try std.testing.expectEqual(@as(usize, 1), app.services[1].env.len); - try std.testing.expectEqual(@as(usize, 1), app.services[1].depends_on.len); - try std.testing.expectEqualStrings("/app", app.services[1].working_dir.?); - try std.testing.expectEqual(@as(usize, 1), app.services[1].volumes.len); - try std.testing.expectEqual(@as(u32, 1), app.services[1].gpu.?.count); - try std.testing.expectEqual(@as(u32, 4), app.services[1].gpu_mesh.?.world_size); - try std.testing.expectEqual(@as(u32, 2), app.services[1].gpu_mesh.?.gpus_per_rank); + try std.testing.expectEqualStrings("migrate", app.workers[0].name); + try std.testing.expectEqualStrings("cleanup", app.crons[0].name); + try std.testing.expectEqualStrings("finetune", app.training_jobs[0].name); + try 
std.testing.expectEqual(@as(u32, 4), app.training_jobs[0].gpus); + try std.testing.expectEqual(@as(u32, 2000), app.training_jobs[0].resources.cpu); } test "toLegacyDeployJson preserves service semantics needed by deploy shim" { @@ -552,6 +860,10 @@ test "selectServices includes transitive dependencies in manifest order" { \\ \\[service.db] \\image = "postgres:16" + \\ + \\[worker.migrate] + \\image = "alpine:latest" + \\command = ["sh", "-c", "migrate"] ); defer manifest.deinit(); @@ -562,12 +874,13 @@ test "selectServices includes transitive dependencies in manifest order" { defer filtered.deinit(); try std.testing.expectEqual(@as(usize, 3), filtered.services.len); + try std.testing.expectEqual(@as(usize, 1), filtered.workers.len); try std.testing.expectEqualStrings("db", filtered.services[0].name); try std.testing.expectEqualStrings("api", filtered.services[1].name); try std.testing.expectEqualStrings("web", filtered.services[2].name); } -test "toApplyJson preserves structured command and service metadata" { +test "toApplyJson preserves structured workload metadata" { const alloc = std.testing.allocator; var manifest = try loader.loadFromString(alloc, @@ -583,6 +896,20 @@ test "toApplyJson preserves structured command and service metadata" { \\ \\[service.db] \\image = "postgres:16" + \\ + \\[worker.migrate] + \\image = "alpine:latest" + \\command = ["sh", "-c", "migrate"] + \\ + \\[cron.cleanup] + \\image = "busybox" + \\command = ["sh", "-c", "cleanup"] + \\every = "1h" + \\ + \\[training.finetune] + \\image = "trainer:v1" + \\command = ["torchrun", "train.py"] + \\gpus = 4 ); defer manifest.deinit(); @@ -594,9 +921,8 @@ test "toApplyJson preserves structured command and service metadata" { try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"demo-app\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"command\":[\"nginx\",\"-g\",\"daemon off;\"]") != null); - try std.testing.expect(std.mem.indexOf(u8, json, 
"\"ports\":[{\"host_port\":8080,\"container_port\":80}]") != null); - try std.testing.expect(std.mem.indexOf(u8, json, "\"env\":[\"MODE=prod\"]") != null); - try std.testing.expect(std.mem.indexOf(u8, json, "\"depends_on\":[\"db\"]") != null); - try std.testing.expect(std.mem.indexOf(u8, json, "\"working_dir\":\"/app\"") != null); - try std.testing.expect(std.mem.indexOf(u8, json, "\"restart\":\"always\"") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"workers\":[{\"name\":\"migrate\"") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"crons\":[{\"name\":\"cleanup\"") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"training_jobs\":[{\"name\":\"finetune\"") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"gpus\":4") != null); } diff --git a/src/manifest/cli/ops.zig b/src/manifest/cli/ops.zig index b204d22f..4acf3f75 100644 --- a/src/manifest/cli/ops.zig +++ b/src/manifest/cli/ops.zig @@ -3,6 +3,7 @@ const cli = @import("../../lib/cli.zig"); const json_helpers = @import("../../lib/json_helpers.zig"); const json_out = @import("../../lib/json_output.zig"); const apply_release = @import("../apply_release.zig"); +const app_snapshot = @import("../app_snapshot.zig"); const manifest_loader = @import("../loader.zig"); const orchestrator = @import("../orchestrator.zig"); const release_history = @import("../release_history.zig"); @@ -243,6 +244,10 @@ const HistoryEntryView = struct { status: []const u8, manifest_hash: []const u8, created_at: i64, + service_count: usize = 0, + worker_count: usize = 0, + cron_count: usize = 0, + training_job_count: usize = 0, completed_targets: usize, failed_targets: usize, remaining_targets: usize, @@ -252,6 +257,7 @@ const HistoryEntryView = struct { fn historyEntryFromDeployment(dep: store.DeploymentRecord) HistoryEntryView { const report = apply_release.reportFromDeployment(dep); + const summary = app_snapshot.summarize(dep.config_snapshot); return .{ .id = report.release_id 
orelse dep.id, .app = dep.app_name, @@ -260,6 +266,10 @@ fn historyEntryFromDeployment(dep: store.DeploymentRecord) HistoryEntryView { .status = report.status.toString(), .manifest_hash = report.manifest_hash, .created_at = report.created_at, + .service_count = summary.service_count, + .worker_count = summary.worker_count, + .cron_count = summary.cron_count, + .training_job_count = summary.training_job_count, .completed_targets = report.completed_targets, .failed_targets = report.failed_targets, .remaining_targets = report.remainingTargets(), @@ -277,6 +287,10 @@ fn parseHistoryObject(obj: []const u8) HistoryEntryView { .status = json_helpers.extractJsonString(obj, "status") orelse "?", .manifest_hash = json_helpers.extractJsonString(obj, "manifest_hash") orelse "?", .created_at = json_helpers.extractJsonInt(obj, "created_at") orelse 0, + .service_count = @intCast(@max(0, json_helpers.extractJsonInt(obj, "service_count") orelse 0)), + .worker_count = @intCast(@max(0, json_helpers.extractJsonInt(obj, "worker_count") orelse 0)), + .cron_count = @intCast(@max(0, json_helpers.extractJsonInt(obj, "cron_count") orelse 0)), + .training_job_count = @intCast(@max(0, json_helpers.extractJsonInt(obj, "training_job_count") orelse 0)), .completed_targets = @intCast(@max(0, json_helpers.extractJsonInt(obj, "completed_targets") orelse 0)), .failed_targets = @intCast(@max(0, json_helpers.extractJsonInt(obj, "failed_targets") orelse 0)), .remaining_targets = @intCast(@max(0, json_helpers.extractJsonInt(obj, "remaining_targets") orelse 0)), @@ -313,6 +327,10 @@ fn writeHistoryJsonObject(w: *json_out.JsonWriter, entry: HistoryEntryView) void w.stringField("status", entry.status); w.stringField("manifest_hash", entry.manifest_hash); w.intField("created_at", entry.created_at); + w.uintField("service_count", entry.service_count); + w.uintField("worker_count", entry.worker_count); + w.uintField("cron_count", entry.cron_count); + w.uintField("training_job_count", 
entry.training_job_count); w.uintField("completed_targets", entry.completed_targets); w.uintField("failed_targets", entry.failed_targets); w.uintField("remaining_targets", entry.remaining_targets); @@ -380,9 +398,9 @@ test "historyEntryFromDeployment matches remote app history shape" { }; const local = historyEntryFromDeployment(dep); - const remote = parseHistoryObject( - \\{"id":"dep-1","app":"demo-app","service":"demo-app","trigger":"apply","status":"completed","manifest_hash":"sha256:123","created_at":42,"completed_targets":0,"failed_targets":0,"remaining_targets":0,"source_release_id":null,"message":"healthy"} - ); + var w = json_out.JsonWriter{}; + writeHistoryJsonObject(&w, local); + const remote = parseHistoryObject(w.getWritten()); try std.testing.expectEqualStrings(local.id, remote.id); try std.testing.expectEqualStrings(local.app.?, remote.app.?); @@ -391,6 +409,7 @@ test "historyEntryFromDeployment matches remote app history shape" { try std.testing.expectEqualStrings(local.status, remote.status); try std.testing.expectEqualStrings(local.manifest_hash, remote.manifest_hash); try std.testing.expectEqual(local.created_at, remote.created_at); + try std.testing.expectEqual(local.service_count, remote.service_count); try std.testing.expectEqual(local.completed_targets, remote.completed_targets); try std.testing.expectEqual(local.failed_targets, remote.failed_targets); try std.testing.expectEqual(local.remaining_targets, remote.remaining_targets); @@ -407,6 +426,7 @@ test "writeHistoryJsonObject round-trips through remote parser" { .status = "completed", .manifest_hash = "sha256:123", .created_at = 42, + .service_count = 1, .completed_targets = 1, .failed_targets = 0, .remaining_targets = 0, @@ -425,6 +445,7 @@ test "writeHistoryJsonObject round-trips through remote parser" { try std.testing.expectEqualStrings(entry.status, parsed.status); try std.testing.expectEqualStrings(entry.manifest_hash, parsed.manifest_hash); try 
std.testing.expectEqual(entry.created_at, parsed.created_at); + try std.testing.expectEqual(entry.service_count, parsed.service_count); try std.testing.expectEqual(entry.completed_targets, parsed.completed_targets); try std.testing.expectEqual(entry.failed_targets, parsed.failed_targets); try std.testing.expectEqual(entry.remaining_targets, parsed.remaining_targets); @@ -447,9 +468,9 @@ test "historyEntryFromDeployment preserves partially failed local release state" }; const local = historyEntryFromDeployment(dep); - const remote = parseHistoryObject( - \\{"id":"dep-3","app":"demo-app","service":"demo-app","trigger":"apply","status":"partially_failed","manifest_hash":"sha256:333","created_at":300,"completed_targets":1,"failed_targets":1,"remaining_targets":0,"source_release_id":null,"message":"one or more placements failed"} - ); + var w = json_out.JsonWriter{}; + writeHistoryJsonObject(&w, local); + const remote = parseHistoryObject(w.getWritten()); try std.testing.expectEqualStrings(local.id, remote.id); try std.testing.expectEqualStrings(local.app.?, remote.app.?); @@ -458,6 +479,7 @@ test "historyEntryFromDeployment preserves partially failed local release state" try std.testing.expectEqualStrings(local.status, remote.status); try std.testing.expectEqualStrings(local.manifest_hash, remote.manifest_hash); try std.testing.expectEqual(local.created_at, remote.created_at); + try std.testing.expectEqual(local.service_count, remote.service_count); try std.testing.expectEqual(local.completed_targets, remote.completed_targets); try std.testing.expectEqual(local.failed_targets, remote.failed_targets); try std.testing.expectEqual(local.remaining_targets, remote.remaining_targets); @@ -468,6 +490,7 @@ test "historyEntryFromDeployment preserves partially failed local release state" pub fn runWorker(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { var manifest_path: []const u8 = manifest_loader.default_filename; var worker_name: ?[]const u8 = null; + var 
server_addr: ?[]const u8 = null; while (args.next()) |arg| { if (std.mem.eql(u8, arg, "-f")) { @@ -475,16 +498,46 @@ pub fn runWorker(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void writeErr("-f requires a manifest path\n", .{}); return OpsError.InvalidArgument; }; + } else if (std.mem.eql(u8, arg, "--server")) { + server_addr = args.next() orelse { + writeErr("--server requires a host:port address\n", .{}); + return OpsError.InvalidArgument; + }; } else { worker_name = arg; } } const name = worker_name orelse { - writeErr("usage: yoq run-worker [-f manifest.toml] \n", .{}); + writeErr("usage: yoq run-worker [-f manifest.toml] [--server host:port] \n", .{}); return OpsError.InvalidArgument; }; + if (server_addr) |addr_str| { + const app_name = try currentAppNameAlloc(alloc); + defer alloc.free(app_name); + + const server = cli.parseServerAddr(addr_str); + const path = std.fmt.allocPrint(alloc, "/apps/{s}/workers/{s}/run", .{ app_name, name }) catch return OpsError.StoreError; + defer alloc.free(path); + + var token_buf: [64]u8 = undefined; + const token = cli.readApiToken(&token_buf); + var resp = http_client.postWithAuth(alloc, server.ip, server.port, path, "{}", token) catch { + writeErr("failed to connect to cluster server\n", .{}); + return OpsError.ConnectionFailed; + }; + defer resp.deinit(alloc); + + if (resp.status_code != 200) { + writeErr("worker run failed (status {d}): {s}\n", .{ resp.status_code, resp.body }); + return OpsError.DeploymentFailed; + } + + write("{s}\n", .{resp.body}); + return; + } + var manifest = manifest_loader.load(alloc, manifest_path) catch |err| { writeErr("failed to load manifest: {s} ({})", .{ manifest_path, err }); writeErr("hint: create one with 'yoq init'\n", .{}); diff --git a/src/manifest/cli/train.zig b/src/manifest/cli/train.zig index fcfec61e..4b5d92d2 100644 --- a/src/manifest/cli/train.zig +++ b/src/manifest/cli/train.zig @@ -4,6 +4,7 @@ const manifest_loader = @import("../loader.zig"); const 
manifest_spec = @import("../spec.zig"); const store = @import("../../state/store.zig"); const training = @import("../training.zig"); +const http_client = @import("../../cluster/http_client.zig"); const logs = @import("../../runtime/logs.zig"); const write = cli.write; @@ -62,6 +63,13 @@ fn trainStart(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { return TrainError.InvalidArgument; }; + if (server_addr) |addr| { + const body = try remoteTrainingPost(alloc, addr, name, "start", "{}"); + defer alloc.free(body); + write("{s}\n", .{body}); + return; + } + var manifest = manifest_loader.load(alloc, manifest_path) catch |err| { writeErr("failed to load manifest: {s} ({})", .{ manifest_path, err }); return TrainError.ManifestLoadFailed; @@ -104,6 +112,7 @@ fn trainStart(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { fn trainStatus(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { var manifest_path: []const u8 = manifest_loader.default_filename; var job_name: ?[]const u8 = null; + var server_addr: ?[]const u8 = null; while (args.next()) |arg| { if (std.mem.eql(u8, arg, "-f")) { @@ -111,16 +120,26 @@ fn trainStatus(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { writeErr("-f requires a manifest path\n", .{}); return TrainError.InvalidArgument; }; + } else if (std.mem.eql(u8, arg, "--server")) { + server_addr = args.next() orelse { + writeErr("--server requires a host:port address\n", .{}); + return TrainError.InvalidArgument; + }; } else { job_name = arg; } } const name = job_name orelse { - writeErr("usage: yoq train status [-f manifest.toml] \n", .{}); + writeErr("usage: yoq train status [-f manifest.toml] [--server host:port] \n", .{}); return TrainError.InvalidArgument; }; + if (server_addr) |addr| { + try remoteTrainingGetStatus(alloc, addr, name); + return; + } + var manifest = manifest_loader.load(alloc, manifest_path) catch |err| { writeErr("failed to load manifest: {s} ({})", .{ manifest_path, 
err }); return TrainError.ManifestLoadFailed; @@ -220,16 +239,80 @@ fn parseTrainArgs(args: *std.process.ArgIterator) TrainArgs { return result; } +fn currentAppNameAlloc(alloc: std.mem.Allocator) ![]u8 { + var cwd_buf: [4096]u8 = undefined; + const cwd = std.fs.cwd().realpath(".", &cwd_buf) catch return TrainError.StoreError; + return alloc.dupe(u8, std.fs.path.basename(cwd)) catch return TrainError.DeploymentFailed; +} + +fn remoteTrainingPath( + alloc: std.mem.Allocator, + app_name: []const u8, + job_name: []const u8, + action: []const u8, +) ![]u8 { + return std.fmt.allocPrint(alloc, "/apps/{s}/training/{s}/{s}", .{ app_name, job_name, action }); +} + +fn remoteTrainingPost( + alloc: std.mem.Allocator, + server_addr: []const u8, + job_name: []const u8, + action: []const u8, + body: []const u8, +) ![]u8 { + const app_name = try currentAppNameAlloc(alloc); + defer alloc.free(app_name); + const path = try remoteTrainingPath(alloc, app_name, job_name, action); + defer alloc.free(path); + + const server = cli.parseServerAddr(server_addr); + var token_buf: [64]u8 = undefined; + const token = cli.readApiToken(&token_buf); + var resp = http_client.postWithAuth(alloc, server.ip, server.port, path, body, token) catch return TrainError.DeploymentFailed; + defer resp.deinit(alloc); + + if (resp.status_code != 200) { + writeErr("training {s} failed (status {d}): {s}\n", .{ action, resp.status_code, resp.body }); + return TrainError.DeploymentFailed; + } + return alloc.dupe(u8, resp.body) catch return TrainError.DeploymentFailed; +} + +fn remoteTrainingGetStatus(alloc: std.mem.Allocator, server_addr: []const u8, job_name: []const u8) !void { + const app_name = try currentAppNameAlloc(alloc); + defer alloc.free(app_name); + const path = try remoteTrainingPath(alloc, app_name, job_name, "status"); + defer alloc.free(path); + + const server = cli.parseServerAddr(server_addr); + var token_buf: [64]u8 = undefined; + const token = cli.readApiToken(&token_buf); + var resp = 
http_client.getWithAuth(alloc, server.ip, server.port, path, token) catch return TrainError.DeploymentFailed; + defer resp.deinit(alloc); + + if (resp.status_code != 200) { + writeErr("training status failed (status {d}): {s}\n", .{ resp.status_code, resp.body }); + return TrainError.DeploymentFailed; + } + write("{s}\n", .{resp.body}); +} + fn loadTrainJobContext(args: *std.process.ArgIterator, alloc: std.mem.Allocator, comptime usage: []const u8) !TrainJobContext { const parsed = parseTrainArgs(args); + return loadTrainJobContextFromParsed(parsed, alloc, usage); +} + +fn loadTrainJobContextFromParsed(parsed: TrainArgs, alloc: std.mem.Allocator, comptime usage: []const u8) !TrainJobContext { + const manifest_path = parsed.manifest_path; const name = parsed.job_name orelse { writeErr("usage: {s}\n", .{usage}); return TrainError.InvalidArgument; }; - var manifest = manifest_loader.load(alloc, parsed.manifest_path) catch |err| { - writeErr("failed to load manifest: {s} ({})", .{ parsed.manifest_path, err }); + var manifest = manifest_loader.load(alloc, manifest_path) catch |err| { + writeErr("failed to load manifest: {s} ({})", .{ manifest_path, err }); return TrainError.ManifestLoadFailed; }; errdefer manifest.deinit(); @@ -254,7 +337,20 @@ fn loadTrainJobContext(args: *std.process.ArgIterator, alloc: std.mem.Allocator, } fn trainStop(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { - var ctx = try loadTrainJobContext(args, alloc, "yoq train stop [-f manifest.toml] "); + const parsed = parseTrainArgs(args); + const name = parsed.job_name orelse { + writeErr("usage: yoq train stop [-f manifest.toml] [--server host:port] \n", .{}); + return TrainError.InvalidArgument; + }; + + if (parsed.server_addr) |addr| { + const body = try remoteTrainingPost(alloc, addr, name, "stop", "{}"); + defer alloc.free(body); + write("{s}\n", .{body}); + return; + } + + var ctx = try loadTrainJobContextFromParsed(parsed, alloc, "yoq train stop [-f manifest.toml] "); defer 
ctx.deinit(); if (!ctx.ctrl.loadFromStore()) { @@ -272,7 +368,20 @@ fn trainStop(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { } fn trainPause(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { - var ctx = try loadTrainJobContext(args, alloc, "yoq train pause [-f manifest.toml] "); + const parsed = parseTrainArgs(args); + const name = parsed.job_name orelse { + writeErr("usage: yoq train pause [-f manifest.toml] [--server host:port] \n", .{}); + return TrainError.InvalidArgument; + }; + + if (parsed.server_addr) |addr| { + const body = try remoteTrainingPost(alloc, addr, name, "pause", "{}"); + defer alloc.free(body); + write("{s}\n", .{body}); + return; + } + + var ctx = try loadTrainJobContextFromParsed(parsed, alloc, "yoq train pause [-f manifest.toml] "); defer ctx.deinit(); if (!ctx.ctrl.loadFromStore()) { @@ -295,7 +404,20 @@ fn trainPause(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { } fn trainResume(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { - var ctx = try loadTrainJobContext(args, alloc, "yoq train resume [-f manifest.toml] [--server host:port] "); + const parsed = parseTrainArgs(args); + const name = parsed.job_name orelse { + writeErr("usage: yoq train resume [-f manifest.toml] [--server host:port] \n", .{}); + return TrainError.InvalidArgument; + }; + + if (parsed.server_addr) |addr| { + const body = try remoteTrainingPost(alloc, addr, name, "resume", "{}"); + defer alloc.free(body); + write("{s}\n", .{body}); + return; + } + + var ctx = try loadTrainJobContextFromParsed(parsed, alloc, "yoq train resume [-f manifest.toml] [--server host:port] "); defer ctx.deinit(); if (!ctx.ctrl.loadFromStore()) { @@ -378,6 +500,15 @@ fn trainScale(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { return TrainError.InvalidArgument; } + if (server_addr) |addr| { + const body_json = std.fmt.allocPrint(alloc, "{{\"gpus\":{d}}}", .{gpus}) catch return TrainError.DeploymentFailed; 
+ defer alloc.free(body_json); + const body = try remoteTrainingPost(alloc, addr, name, "scale", body_json); + defer alloc.free(body); + write("{s}\n", .{body}); + return; + } + var manifest = manifest_loader.load(alloc, manifest_loader.default_filename) catch |err| { writeErr("failed to load manifest ({})\n", .{err}); return TrainError.ManifestLoadFailed; @@ -450,6 +581,7 @@ fn trainScale(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { fn trainLogs(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { var job_name: ?[]const u8 = null; var rank: u32 = 0; + var server_addr: ?[]const u8 = null; while (args.next()) |arg| { if (std.mem.eql(u8, arg, "--rank")) { @@ -461,16 +593,41 @@ fn trainLogs(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { writeErr("invalid rank number: {s}\n", .{rank_str}); return TrainError.InvalidArgument; }; + } else if (std.mem.eql(u8, arg, "--server")) { + server_addr = args.next() orelse { + writeErr("--server requires a host:port address\n", .{}); + return TrainError.InvalidArgument; + }; } else { job_name = arg; } } const name = job_name orelse { - writeErr("usage: yoq train logs [--rank N] \n", .{}); + writeErr("usage: yoq train logs [--server host:port] [--rank N] \n", .{}); return TrainError.InvalidArgument; }; + if (server_addr) |addr| { + const app_name = try currentAppNameAlloc(alloc); + defer alloc.free(app_name); + const path = std.fmt.allocPrint(alloc, "/apps/{s}/training/{s}/logs?rank={d}", .{ app_name, name, rank }) catch return TrainError.DeploymentFailed; + defer alloc.free(path); + + const server = cli.parseServerAddr(addr); + var token_buf: [64]u8 = undefined; + const token = cli.readApiToken(&token_buf); + var resp = http_client.getWithAuth(alloc, server.ip, server.port, path, token) catch return TrainError.DeploymentFailed; + defer resp.deinit(alloc); + + if (resp.status_code != 200) { + writeErr("training logs failed (status {d}): {s}\n", .{ resp.status_code, resp.body }); + 
return TrainError.DeploymentFailed; + } + write("{s}", .{resp.body}); + return; + } + var hostname_buf: [128]u8 = undefined; const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ name, rank }) catch { writeErr("failed to build hostname\n", .{}); diff --git a/src/manifest/loader.zig b/src/manifest/loader.zig index 0f246816..396a3d09 100644 --- a/src/manifest/loader.zig +++ b/src/manifest/loader.zig @@ -107,11 +107,6 @@ fn buildManifest(alloc: std.mem.Allocator, root: *const toml.Table) LoadError!sp } } - if (services.items.len == 0 and training_jobs.items.len == 0) { - log.err("manifest: no services or training jobs defined", .{}); - return LoadError.NoServices; - } - // parse workers from [worker.*] subtables var workers: std.ArrayListUnmanaged(spec.Worker) = .empty; defer { @@ -150,6 +145,11 @@ fn buildManifest(alloc: std.mem.Allocator, root: *const toml.Table) LoadError!sp } } + if (services.items.len == 0 and workers.items.len == 0 and crons.items.len == 0 and training_jobs.items.len == 0) { + log.err("manifest: no services, workers, crons, or training jobs defined", .{}); + return LoadError.NoServices; + } + var volumes: std.ArrayListUnmanaged(spec.Volume) = .empty; defer { for (volumes.items) |vol| vol.deinit(alloc); @@ -1931,6 +1931,33 @@ test "training job — manifest with only training jobs is valid" { try std.testing.expectEqual(@as(usize, 1), manifest.training_jobs.len); } +test "worker-only manifest is valid" { + const alloc = std.testing.allocator; + var manifest = try loadFromString(alloc, + \\[worker.migrate] + \\image = "alpine:latest" + \\command = ["sh", "-c", "migrate"] + ); + defer manifest.deinit(); + + try std.testing.expectEqual(@as(usize, 0), manifest.services.len); + try std.testing.expectEqual(@as(usize, 1), manifest.workers.len); +} + +test "cron-only manifest is valid" { + const alloc = std.testing.allocator; + var manifest = try loadFromString(alloc, + \\[cron.cleanup] + \\image = "busybox" + \\command = ["sh", "-c", 
"cleanup"] + \\every = "1h" + ); + defer manifest.deinit(); + + try std.testing.expectEqual(@as(usize, 0), manifest.services.len); + try std.testing.expectEqual(@as(usize, 1), manifest.crons.len); +} + test "training job — checkpoint interval default" { const alloc = std.testing.allocator; diff --git a/src/runtime/cli/status_command.zig b/src/runtime/cli/status_command.zig index 7caabcfb..22a200f9 100644 --- a/src/runtime/cli/status_command.zig +++ b/src/runtime/cli/status_command.zig @@ -2,6 +2,7 @@ const std = @import("std"); const cli = @import("../../lib/cli.zig"); const json_out = @import("../../lib/json_output.zig"); const apply_release = @import("../../manifest/apply_release.zig"); +const app_snapshot = @import("../../manifest/app_snapshot.zig"); const store = @import("../../state/store.zig"); const monitor = @import("../monitor.zig"); const cgroups = @import("../cgroups.zig"); @@ -130,7 +131,10 @@ const AppStatusSnapshot = struct { status: []const u8, manifest_hash: []const u8, created_at: i64, - service_count: usize, + service_count: usize = 0, + worker_count: usize = 0, + cron_count: usize = 0, + training_job_count: usize = 0, completed_targets: usize, failed_targets: usize, remaining_targets: usize, @@ -367,8 +371,8 @@ fn printAppStatuses(snapshots: []const AppStatusSnapshot) void { } fn printAppStatusHeader() void { - write("{s:<14} {s:<14} {s:<14} {s:<20} {s:<22} {s:<14} {s}\n", .{ - "APP", "RELEASE", "STATUS", "TIMESTAMP", "TARGETS", "PREV OK", "MESSAGE", + write("{s:<14} {s:<14} {s:<14} {s:<11} {s:<20} {s:<22} {s:<14} {s}\n", .{ + "APP", "RELEASE", "STATUS", "KINDS", "TIMESTAMP", "TARGETS", "PREV OK", "MESSAGE", }); } @@ -379,16 +383,24 @@ fn printAppStatusRow(snapshot: AppStatusSnapshot) void { var progress_buf: [64]u8 = undefined; const progress_str = formatAppProgress(&progress_buf, snapshot); + var kinds_buf: [32]u8 = undefined; + const kinds_str = std.fmt.bufPrint(&kinds_buf, "{d}/{d}/{d}/{d}", .{ + snapshot.service_count, + 
snapshot.worker_count, + snapshot.cron_count, + snapshot.training_job_count, + }) catch "?"; const previous_successful = if (snapshot.previous_successful_release_id) |release_id| cli.truncate(release_id, 12) else "-"; - write("{s:<14} {s:<14} {s:<14} {s:<20} {s:<22} {s:<14} {s}\n", .{ + write("{s:<14} {s:<14} {s:<14} {s:<11} {s:<20} {s:<22} {s:<14} {s}\n", .{ snapshot.app_name, cli.truncate(snapshot.release_id, 12), snapshot.status, + kinds_str, ts_str, progress_str, previous_successful, @@ -422,6 +434,9 @@ fn parseAppStatusResponse(json: []const u8) AppStatusSnapshot { .manifest_hash = extractJsonString(json, "manifest_hash") orelse "?", .created_at = extractJsonInt(json, "created_at") orelse 0, .service_count = @intCast(@max(0, extractJsonInt(json, "service_count") orelse 0)), + .worker_count = @intCast(@max(0, extractJsonInt(json, "worker_count") orelse 0)), + .cron_count = @intCast(@max(0, extractJsonInt(json, "cron_count") orelse 0)), + .training_job_count = @intCast(@max(0, extractJsonInt(json, "training_job_count") orelse 0)), .completed_targets = @intCast(@max(0, extractJsonInt(json, "completed_targets") orelse 0)), .failed_targets = @intCast(@max(0, extractJsonInt(json, "failed_targets") orelse 0)), .remaining_targets = @intCast(@max(0, extractJsonInt(json, "remaining_targets") orelse 0)), @@ -442,6 +457,9 @@ fn writeAppStatusJsonObject(w: *json_out.JsonWriter, snapshot: AppStatusSnapshot w.stringField("manifest_hash", snapshot.manifest_hash); w.intField("created_at", snapshot.created_at); w.uintField("service_count", snapshot.service_count); + w.uintField("worker_count", snapshot.worker_count); + w.uintField("cron_count", snapshot.cron_count); + w.uintField("training_job_count", snapshot.training_job_count); w.uintField("completed_targets", snapshot.completed_targets); w.uintField("failed_targets", snapshot.failed_targets); w.uintField("remaining_targets", snapshot.remaining_targets); @@ -455,6 +473,7 @@ fn writeAppStatusJsonObject(w: 
*json_out.JsonWriter, snapshot: AppStatusSnapshot fn appStatusFromReports( report: apply_release.ApplyReport, previous_successful: ?apply_release.ApplyReport, + summary: app_snapshot.Summary, ) AppStatusSnapshot { return .{ .app_name = report.app_name, @@ -463,7 +482,10 @@ fn appStatusFromReports( .status = report.status.toString(), .manifest_hash = report.manifest_hash, .created_at = report.created_at, - .service_count = report.service_count, + .service_count = summary.service_count, + .worker_count = summary.worker_count, + .cron_count = summary.cron_count, + .training_job_count = summary.training_job_count, .completed_targets = report.completed_targets, .failed_targets = report.failed_targets, .remaining_targets = report.remainingTargets(), @@ -482,6 +504,7 @@ fn snapshotFromDeployments( return appStatusFromReports( apply_release.reportFromDeployment(latest), if (previous_successful) |dep| apply_release.reportFromDeployment(dep) else null, + app_snapshot.summarize(latest.config_snapshot), ); } diff --git a/src/state/store.zig b/src/state/store.zig index 2edaec31..331326f0 100644 --- a/src/state/store.zig +++ b/src/state/store.zig @@ -99,11 +99,17 @@ pub const getPreviousSuccessfulDeploymentByApp = @import("store/deployments.zig" pub const getPreviousSuccessfulDeploymentByAppInDb = @import("store/deployments.zig").getPreviousSuccessfulDeploymentByAppInDb; pub const saveTrainingJob = @import("store/training.zig").saveTrainingJob; +pub const saveTrainingJobInDb = @import("store/training.zig").saveTrainingJobInDb; pub const updateTrainingJobState = @import("store/training.zig").updateTrainingJobState; +pub const updateTrainingJobStateInDb = @import("store/training.zig").updateTrainingJobStateInDb; pub const incrementTrainingJobRestarts = @import("store/training.zig").incrementTrainingJobRestarts; +pub const incrementTrainingJobRestartsInDb = @import("store/training.zig").incrementTrainingJobRestartsInDb; pub const updateTrainingJobGpus = 
@import("store/training.zig").updateTrainingJobGpus; +pub const updateTrainingJobGpusInDb = @import("store/training.zig").updateTrainingJobGpusInDb; pub const findTrainingJob = @import("store/training.zig").findTrainingJob; +pub const findTrainingJobInDb = @import("store/training.zig").findTrainingJobInDb; pub const getTrainingJob = @import("store/training.zig").getTrainingJob; +pub const getTrainingJobInDb = @import("store/training.zig").getTrainingJobInDb; pub const saveCheckpoint = @import("store/training.zig").saveCheckpoint; pub const getLatestCheckpoint = @import("store/training.zig").getLatestCheckpoint; pub const listCheckpoints = @import("store/training.zig").listCheckpoints; diff --git a/src/state/store/training.zig b/src/state/store/training.zig index f2dedc50..0b2f20f4 100644 --- a/src/state/store/training.zig +++ b/src/state/store/training.zig @@ -100,6 +100,10 @@ fn checkpointRowToRecord(row: CheckpointRow) CheckpointRecord { pub fn saveTrainingJob(record: TrainingJobRecord) StoreError!void { const db = try common.getDb(); + return saveTrainingJobInDb(db, record); +} + +pub fn saveTrainingJobInDb(db: *sqlite.Db, record: TrainingJobRecord) StoreError!void { db.exec( "INSERT OR REPLACE INTO training_jobs (" ++ training_job_columns ++ ")" ++ " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", @@ -123,6 +127,10 @@ pub fn saveTrainingJob(record: TrainingJobRecord) StoreError!void { pub fn updateTrainingJobState(id: []const u8, state: []const u8, now: i64) StoreError!void { const db = try common.getDb(); + return updateTrainingJobStateInDb(db, id, state, now); +} + +pub fn updateTrainingJobStateInDb(db: *sqlite.Db, id: []const u8, state: []const u8, now: i64) StoreError!void { db.exec( "UPDATE training_jobs SET state = ?, updated_at = ? 
WHERE id = ?;", .{}, @@ -132,6 +140,10 @@ pub fn updateTrainingJobState(id: []const u8, state: []const u8, now: i64) Store pub fn incrementTrainingJobRestarts(id: []const u8, now: i64) StoreError!void { const db = try common.getDb(); + return incrementTrainingJobRestartsInDb(db, id, now); +} + +pub fn incrementTrainingJobRestartsInDb(db: *sqlite.Db, id: []const u8, now: i64) StoreError!void { db.exec( "UPDATE training_jobs SET restart_count = restart_count + 1, updated_at = ? WHERE id = ?;", .{}, @@ -141,6 +153,10 @@ pub fn incrementTrainingJobRestarts(id: []const u8, now: i64) StoreError!void { pub fn updateTrainingJobGpus(id: []const u8, gpus: u32, now: i64) StoreError!void { const db = try common.getDb(); + return updateTrainingJobGpusInDb(db, id, gpus, now); +} + +pub fn updateTrainingJobGpusInDb(db: *sqlite.Db, id: []const u8, gpus: u32, now: i64) StoreError!void { db.exec( "UPDATE training_jobs SET gpus = ?, updated_at = ? WHERE id = ?;", .{}, @@ -150,6 +166,10 @@ pub fn updateTrainingJobGpus(id: []const u8, gpus: u32, now: i64) StoreError!voi pub fn findTrainingJob(alloc: Allocator, app_name: []const u8, name: []const u8) StoreError!?TrainingJobRecord { const db = try common.getDb(); + return findTrainingJobInDb(db, alloc, app_name, name); +} + +pub fn findTrainingJobInDb(db: *sqlite.Db, alloc: Allocator, app_name: []const u8, name: []const u8) StoreError!?TrainingJobRecord { const row = (db.oneAlloc( TrainingJobRow, alloc, @@ -162,6 +182,10 @@ pub fn findTrainingJob(alloc: Allocator, app_name: []const u8, name: []const u8) pub fn getTrainingJob(alloc: Allocator, id: []const u8) StoreError!TrainingJobRecord { const db = try common.getDb(); + return getTrainingJobInDb(db, alloc, id); +} + +pub fn getTrainingJobInDb(db: *sqlite.Db, alloc: Allocator, id: []const u8) StoreError!TrainingJobRecord { const row = (db.oneAlloc( TrainingJobRow, alloc, diff --git a/src/test_root.zig b/src/test_root.zig index 0689862f..1c2c5646 100644 --- a/src/test_root.zig +++ 
b/src/test_root.zig @@ -76,6 +76,7 @@ comptime { _ = @import("build/commands.zig"); _ = @import("manifest/spec.zig"); _ = @import("manifest/app_spec.zig"); + _ = @import("manifest/app_snapshot.zig"); _ = @import("manifest/apply_release.zig"); _ = @import("manifest/local_apply_backend.zig"); _ = @import("manifest/release_plan.zig"); @@ -100,6 +101,7 @@ comptime { _ = @import("api/routes/cluster_agents/apply_request.zig"); _ = @import("api/routes/cluster_agents/app_routes.zig"); _ = @import("api/routes/cluster_agents/deploy_routes.zig"); + _ = @import("api/routes/cluster_agents/workload_routes.zig"); _ = @import("api/routes/status_metrics.zig"); _ = @import("api/server.zig"); _ = @import("api/server/connection_runtime.zig"); From 00b59f579ea129808ac5d00de6daedde38854b68 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:14:45 +0000 Subject: [PATCH 02/14] Add workload route flow coverage --- .../routes/cluster_agents/workload_routes.zig | 139 +++++++++++++++++- src/manifest/cli/ops.zig | 12 +- src/runtime/cli/status_command.zig | 11 +- 3 files changed, 154 insertions(+), 8 deletions(-) diff --git a/src/api/routes/cluster_agents/workload_routes.zig b/src/api/routes/cluster_agents/workload_routes.zig index d948a2c4..37463858 100644 --- a/src/api/routes/cluster_agents/workload_routes.zig +++ b/src/api/routes/cluster_agents/workload_routes.zig @@ -375,28 +375,155 @@ fn formatTrainingRecordJson( return json_buf.toOwnedSlice(alloc); } -fn testRequest(method: http.Method, path: []const u8) http.Request { +const RouteFlowHarness = struct { + alloc: std.mem.Allocator, + tmp: std.testing.TmpDir, + node: cluster_node.Node, + + fn init(alloc: std.mem.Allocator) !RouteFlowHarness { + var tmp = std.testing.tmpDir(.{}); + errdefer tmp.cleanup(); + + var path_buf: [512]u8 = undefined; + const tmp_path = tmp.dir.realpath(".", &path_buf) catch return error.SkipZigTest; + + var node = cluster_node.Node.init(alloc, .{ + .id = 1, + .port = 0, + .peers = &.{}, + 
.data_dir = tmp_path, + }) catch return error.SkipZigTest; + errdefer node.deinit(); + + node.raft.role = .leader; + node.leader_id = node.config.id; + + var harness = RouteFlowHarness{ + .alloc = alloc, + .tmp = tmp, + .node = node, + }; + try harness.seedActiveAgent(); + return harness; + } + + fn deinit(self: *RouteFlowHarness) void { + self.node.deinit(); + self.tmp.cleanup(); + } + + fn ctx(self: *RouteFlowHarness) RouteContext { + return .{ .cluster = &self.node, .join_token = null }; + } + + fn seedActiveAgent(self: *RouteFlowHarness) !void { + self.node.stateMachineDb().exec( + "INSERT INTO agents (id, address, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, labels, gpu_count, gpu_used, gpu_model, gpu_vram_mb) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", + .{}, + .{ "abc123def456", "10.0.0.2:7701", "active", @as(i64, 8), @as(i64, 16384), @as(i64, 0), @as(i64, 0), @as(i64, 0), @as(i64, 100), @as(i64, 100), "agent", "", @as(i64, 4), @as(i64, 0), "L4", @as(i64, 24576) }, + ) catch return error.SkipZigTest; + } + + fn seedLatestRelease(self: *RouteFlowHarness, app_name: []const u8, snapshot: []const u8) !void { + try store.saveDeploymentInDb(self.node.stateMachineDb(), .{ + .id = "dep-seed", + .app_name = app_name, + .service_name = app_name, + .trigger = "apply", + .manifest_hash = "sha256:seed", + .config_snapshot = snapshot, + .status = "completed", + .message = "apply completed", + .created_at = 100, + }); + } +}; + +fn makeRequest(method: http.Method, path: []const u8, body: []const u8, query: []const u8) http.Request { return .{ .method = method, .path = path, .path_only = path, - .query = "", + .query = query, .headers_raw = "", - .body = "", - .content_length = 0, + .body = body, + .content_length = body.len, }; } +fn freeResponse(alloc: std.mem.Allocator, response: Response) void { + if (response.allocated) alloc.free(response.body); +} + test "route rejects worker run without 
cluster" { const ctx: RouteContext = .{ .cluster = null, .join_token = null }; - const req = testRequest(.POST, "/apps/demo-app/workers/migrate/run"); + const req = makeRequest(.POST, "/apps/demo-app/workers/migrate/run", "", ""); const resp = route(req, std.testing.allocator, ctx).?; try std.testing.expectEqual(http.StatusCode.bad_request, resp.status); } test "route rejects training status without cluster" { const ctx: RouteContext = .{ .cluster = null, .join_token = null }; - const req = testRequest(.GET, "/apps/demo-app/training/finetune/status"); + const req = makeRequest(.GET, "/apps/demo-app/training/finetune/status", "", ""); const resp = route(req, std.testing.allocator, ctx).?; try std.testing.expectEqual(http.StatusCode.bad_request, resp.status); } + +test "worker run route schedules worker from latest app snapshot" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[{\"name\":\"migrate\",\"image\":\"alpine:latest\",\"command\":[\"/bin/sh\",\"-c\",\"echo ok\"],\"gpu_limit\":0,\"required_labels\":[]}],\"crons\":[],\"training_jobs\":[]}", + ); + + const resp = route( + makeRequest(.POST, "/apps/demo-app/workers/migrate/run", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, resp); + + try std.testing.expectEqual(http.StatusCode.ok, resp.status); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "\"app_name\":\"demo-app\"") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "\"worker\":\"migrate\"") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "\"placed\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, resp.body, "\"failed\":0") != null); +} + +test "training start and status routes persist job state from app snapshot" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer 
harness.deinit(); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[],\"crons\":[],\"training_jobs\":[{\"name\":\"finetune\",\"image\":\"pytorch:latest\",\"command\":[\"python\",\"train.py\"],\"gpus\":1,\"cpu_limit\":2000,\"memory_limit_mb\":4096}]}", + ); + + const start_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/start", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, start_resp); + + try std.testing.expectEqual(http.StatusCode.ok, start_resp.status); + try std.testing.expect(std.mem.indexOf(u8, start_resp.body, "\"app_name\":\"demo-app\"") != null); + try std.testing.expect(std.mem.indexOf(u8, start_resp.body, "\"training_job\":\"finetune\"") != null); + try std.testing.expect(std.mem.indexOf(u8, start_resp.body, "\"state\":\"running\"") != null); + try std.testing.expect(std.mem.indexOf(u8, start_resp.body, "\"gpus\":1") != null); + + const status_resp = route( + makeRequest(.GET, "/apps/demo-app/training/finetune/status", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, status_resp); + + try std.testing.expectEqual(http.StatusCode.ok, status_resp.status); + try std.testing.expect(std.mem.indexOf(u8, status_resp.body, "\"state\":\"running\"") != null); + try std.testing.expect(std.mem.indexOf(u8, status_resp.body, "\"training_job\":\"finetune\"") != null); +} diff --git a/src/manifest/cli/ops.zig b/src/manifest/cli/ops.zig index 4acf3f75..9b127849 100644 --- a/src/manifest/cli/ops.zig +++ b/src/manifest/cli/ops.zig @@ -371,7 +371,7 @@ fn rollbackRemoteApp(alloc: std.mem.Allocator, addr_str: []const u8, app_name: [ test "parseHistoryObject extracts app release fields" { const entry = parseHistoryObject( - \\{"id":"dep-1","app":"demo-app","service":"demo-app","trigger":"apply","status":"completed","manifest_hash":"sha256:123","created_at":42,"source_release_id":null,"message":null} + 
\\{"id":"dep-1","app":"demo-app","service":"demo-app","trigger":"apply","status":"completed","manifest_hash":"sha256:123","created_at":42,"service_count":2,"worker_count":1,"cron_count":3,"training_job_count":4,"completed_targets":0,"failed_targets":0,"remaining_targets":2,"source_release_id":null,"message":null} ); try std.testing.expectEqualStrings("dep-1", entry.id); @@ -381,6 +381,10 @@ test "parseHistoryObject extracts app release fields" { try std.testing.expectEqualStrings("completed", entry.status); try std.testing.expectEqualStrings("sha256:123", entry.manifest_hash); try std.testing.expectEqual(@as(i64, 42), entry.created_at); + try std.testing.expectEqual(@as(usize, 2), entry.service_count); + try std.testing.expectEqual(@as(usize, 1), entry.worker_count); + try std.testing.expectEqual(@as(usize, 3), entry.cron_count); + try std.testing.expectEqual(@as(usize, 4), entry.training_job_count); try std.testing.expect(entry.source_release_id == null); try std.testing.expect(entry.message == null); } @@ -427,6 +431,9 @@ test "writeHistoryJsonObject round-trips through remote parser" { .manifest_hash = "sha256:123", .created_at = 42, .service_count = 1, + .worker_count = 2, + .cron_count = 3, + .training_job_count = 4, .completed_targets = 1, .failed_targets = 0, .remaining_targets = 0, @@ -446,6 +453,9 @@ test "writeHistoryJsonObject round-trips through remote parser" { try std.testing.expectEqualStrings(entry.manifest_hash, parsed.manifest_hash); try std.testing.expectEqual(entry.created_at, parsed.created_at); try std.testing.expectEqual(entry.service_count, parsed.service_count); + try std.testing.expectEqual(entry.worker_count, parsed.worker_count); + try std.testing.expectEqual(entry.cron_count, parsed.cron_count); + try std.testing.expectEqual(entry.training_job_count, parsed.training_job_count); try std.testing.expectEqual(entry.completed_targets, parsed.completed_targets); try std.testing.expectEqual(entry.failed_targets, parsed.failed_targets); try 
std.testing.expectEqual(entry.remaining_targets, parsed.remaining_targets); diff --git a/src/runtime/cli/status_command.zig b/src/runtime/cli/status_command.zig index 22a200f9..c8907c7b 100644 --- a/src/runtime/cli/status_command.zig +++ b/src/runtime/cli/status_command.zig @@ -615,7 +615,7 @@ fn parsePsiFromJson(json: []const u8, some_key: []const u8, full_key: []const u8 test "parseAppStatusResponse extracts app fields" { const snapshot = parseAppStatusResponse( - \\{"app_name":"demo-app","trigger":"apply","release_id":"abc123def456","status":"completed","manifest_hash":"sha256:123","created_at":42,"service_count":2,"completed_targets":2,"failed_targets":0,"remaining_targets":0,"source_release_id":null,"message":null} + \\{"app_name":"demo-app","trigger":"apply","release_id":"abc123def456","status":"completed","manifest_hash":"sha256:123","created_at":42,"service_count":2,"worker_count":1,"cron_count":3,"training_job_count":4,"completed_targets":2,"failed_targets":0,"remaining_targets":0,"source_release_id":null,"message":null} ); try std.testing.expectEqualStrings("demo-app", snapshot.app_name); @@ -625,6 +625,9 @@ test "parseAppStatusResponse extracts app fields" { try std.testing.expectEqualStrings("sha256:123", snapshot.manifest_hash); try std.testing.expectEqual(@as(i64, 42), snapshot.created_at); try std.testing.expectEqual(@as(usize, 2), snapshot.service_count); + try std.testing.expectEqual(@as(usize, 1), snapshot.worker_count); + try std.testing.expectEqual(@as(usize, 3), snapshot.cron_count); + try std.testing.expectEqual(@as(usize, 4), snapshot.training_job_count); try std.testing.expectEqual(@as(usize, 2), snapshot.completed_targets); try std.testing.expectEqual(@as(usize, 0), snapshot.failed_targets); try std.testing.expectEqual(@as(usize, 0), snapshot.remaining_targets); @@ -679,6 +682,9 @@ test "writeAppStatusJsonObject round-trips through remote parser" { .manifest_hash = "sha256:222", .created_at = 200, .service_count = 2, + .worker_count = 1, + 
.cron_count = 2, + .training_job_count = 3, .completed_targets = 1, .failed_targets = 1, .remaining_targets = 0, @@ -700,6 +706,9 @@ test "writeAppStatusJsonObject round-trips through remote parser" { try std.testing.expectEqualStrings(snapshot.manifest_hash, parsed.manifest_hash); try std.testing.expectEqual(snapshot.created_at, parsed.created_at); try std.testing.expectEqual(snapshot.service_count, parsed.service_count); + try std.testing.expectEqual(snapshot.worker_count, parsed.worker_count); + try std.testing.expectEqual(snapshot.cron_count, parsed.cron_count); + try std.testing.expectEqual(snapshot.training_job_count, parsed.training_job_count); try std.testing.expectEqual(snapshot.completed_targets, parsed.completed_targets); try std.testing.expectEqual(snapshot.failed_targets, parsed.failed_targets); try std.testing.expectEqual(snapshot.remaining_targets, parsed.remaining_targets); From 7e90f080a554411293598e4ef41e5e6fc1772cd9 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:16:18 +0000 Subject: [PATCH 03/14] Cover mixed-workload rollback parity --- src/api/routes/cluster_agents/app_routes.zig | 43 ++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/api/routes/cluster_agents/app_routes.zig b/src/api/routes/cluster_agents/app_routes.zig index 089d8e43..b5e5c39d 100644 --- a/src/api/routes/cluster_agents/app_routes.zig +++ b/src/api/routes/cluster_agents/app_routes.zig @@ -711,6 +711,49 @@ test "app apply then rollback routes preserve release transition metadata" { try expectJsonContains(history_response.body, source_release_id); } +test "app rollback restores worker and training workload snapshot" { + const alloc = std.testing.allocator; + const first_apply_body = + \\{"app_name":"demo-app","services":[],"workers":[{"name":"migrate","image":"alpine","command":["/bin/sh","-c","echo first"]}],"crons":[],"training_jobs":[{"name":"finetune","image":"trainer:v1","command":["python","train.py"],"gpus":1}]} + ; + const 
second_apply_body = + \\{"app_name":"demo-app","services":[],"workers":[{"name":"compact","image":"alpine","command":["/bin/sh","-c","echo second"]}],"crons":[{"name":"nightly","schedule":"0 2 * * *","command":["/bin/sh","-c","echo cron"]}],"training_jobs":[]} + ; + + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + const first_apply_response = harness.appApply(first_apply_body); + defer freeResponse(alloc, first_apply_response); + try expectResponseOk(first_apply_response); + + const source_release_id = json_helpers.extractJsonString(first_apply_response.body, "release_id").?; + + const second_apply_response = harness.appApply(second_apply_body); + defer freeResponse(alloc, second_apply_response); + try expectResponseOk(second_apply_response); + + const rollback_response = try harness.rollback("demo-app", source_release_id); + defer freeResponse(alloc, rollback_response); + try expectResponseOk(rollback_response); + + const latest = try store.getLatestDeploymentByAppInDb(harness.node.stateMachineDb(), alloc, "demo-app"); + defer latest.deinit(alloc); + + try std.testing.expectEqualStrings("rollback", latest.trigger.?); + try std.testing.expectEqualStrings(source_release_id, latest.source_release_id.?); + try std.testing.expect(std.mem.indexOf(u8, latest.config_snapshot, "\"workers\":[{\"name\":\"migrate\"") != null); + try std.testing.expect(std.mem.indexOf(u8, latest.config_snapshot, "\"training_jobs\":[{\"name\":\"finetune\"") != null); + try std.testing.expect(std.mem.indexOf(u8, latest.config_snapshot, "\"crons\":[]") != null); + + const status_response = harness.status("demo-app"); + defer freeResponse(alloc, status_response); + try expectResponseOk(status_response); + try expectJsonContains(status_response.body, "\"worker_count\":1"); + try expectJsonContains(status_response.body, "\"training_job_count\":1"); + try expectJsonContains(status_response.body, "\"cron_count\":0"); +} + test "app apply route preserves failed release 
metadata across reads" { const alloc = std.testing.allocator; const apply_body = From cc86dd1e3e90618f512df226a76fdcb9945defdf Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:17:23 +0000 Subject: [PATCH 04/14] Strengthen app summary workload tests --- src/api/routes/cluster_agents/app_routes.zig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/api/routes/cluster_agents/app_routes.zig b/src/api/routes/cluster_agents/app_routes.zig index b5e5c39d..34b5f97b 100644 --- a/src/api/routes/cluster_agents/app_routes.zig +++ b/src/api/routes/cluster_agents/app_routes.zig @@ -428,7 +428,7 @@ test "formatAppsResponse emits one latest summary per app" { .service_name = "app-a", .trigger = "apply", .manifest_hash = "sha256:a1", - .config_snapshot = "{\"app_name\":\"app-a\",\"services\":[{\"name\":\"web\"}]}", + .config_snapshot = "{\"app_name\":\"app-a\",\"services\":[{\"name\":\"web\"}],\"workers\":[],\"crons\":[],\"training_jobs\":[]}", .status = "completed", .message = "apply completed", .created_at = 100, @@ -439,7 +439,7 @@ test "formatAppsResponse emits one latest summary per app" { .service_name = "app-b", .trigger = "apply", .manifest_hash = "sha256:b1", - .config_snapshot = "{\"app_name\":\"app-b\",\"services\":[{\"name\":\"api\"}]}", + .config_snapshot = "{\"app_name\":\"app-b\",\"services\":[{\"name\":\"api\"}],\"workers\":[],\"crons\":[],\"training_jobs\":[]}", .status = "completed", .message = "apply completed", .created_at = 150, @@ -450,7 +450,7 @@ test "formatAppsResponse emits one latest summary per app" { .service_name = "app-a", .trigger = "apply", .manifest_hash = "sha256:a2", - .config_snapshot = "{\"app_name\":\"app-a\",\"services\":[{\"name\":\"web\"},{\"name\":\"db\"}]}", + .config_snapshot = "{\"app_name\":\"app-a\",\"services\":[{\"name\":\"web\"},{\"name\":\"db\"}],\"workers\":[{\"name\":\"migrate\"}],\"crons\":[{\"name\":\"nightly\"}],\"training_jobs\":[{\"name\":\"finetune\"}]}", .status = "failed", 
.message = "scheduler error during apply", .created_at = 200, @@ -468,6 +468,9 @@ test "formatAppsResponse emits one latest summary per app" { try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"app-a\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"release_id\":\"dep-3\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"previous_successful_release_id\":\"dep-1\"") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"worker_count\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"cron_count\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"training_job_count\":1") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"app-b\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"release_id\":\"dep-2\"") != null); } From 30589be02d8336906a57220efbd9a4e095b5d0d6 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:24:09 +0000 Subject: [PATCH 05/14] Reuse runtime setup for local replacements --- src/manifest/local_apply_backend.zig | 2 +- src/manifest/orchestrator.zig | 29 +++++++++++++++ .../orchestrator/lifecycle_support.zig | 35 +++++++++++-------- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/manifest/local_apply_backend.zig b/src/manifest/local_apply_backend.zig index 28dd2ccc..602a9c49 100644 --- a/src/manifest/local_apply_backend.zig +++ b/src/manifest/local_apply_backend.zig @@ -392,7 +392,7 @@ const LocalApplyBackend = struct { } fn finish(runner_self: *@This()) void { - runner_self.orch.startTlsProxy(); + runner_self.orch.finishRuntimeSetup(); } fn reportProgress(runner_self: *@This(), completed_targets: usize, failed_targets: usize) void { diff --git a/src/manifest/orchestrator.zig b/src/manifest/orchestrator.zig index 425916ba..1e30c236 100644 --- a/src/manifest/orchestrator.zig +++ b/src/manifest/orchestrator.zig @@ -188,6 +188,10 @@ pub const Orchestrator = struct { self.proxy = 
resources.proxy; } + pub fn finishRuntimeSetup(self: *Orchestrator) void { + lifecycle_support.finishRuntimeSetup(self); + } + /// stop all running services in reverse dependency order. pub fn stopAll(self: *Orchestrator) void { lifecycle_support.stopAll(self); @@ -698,3 +702,28 @@ test "stopServiceByIndex marks running service stopped without pid" { try std.testing.expectEqual(ServiceState.Status.stopped, orch.states[0].status); try std.testing.expect(orch.states[0].thread == null); } + +test "finishRuntimeSetup starts cron scheduler when crons are present" { + const alloc = std.testing.allocator; + const loader = @import("loader.zig"); + + var manifest = try loader.loadFromString(alloc, + \\[service.web] + \\image = "nginx:latest" + \\ + \\[cron.cleanup] + \\image = "alpine:latest" + \\command = ["/bin/sh", "-c", "echo cleanup"] + \\every = "1h" + ); + defer manifest.deinit(); + + var orch = try Orchestrator.init(alloc, &manifest, "demo-app"); + defer orch.deinit(); + + orch.finishRuntimeSetup(); + + try std.testing.expect(orch.cron_sched != null); + orch.stopAll(); + try std.testing.expect(orch.cron_sched != null); +} diff --git a/src/manifest/orchestrator/lifecycle_support.zig b/src/manifest/orchestrator/lifecycle_support.zig index 3007af9f..5bb87ca0 100644 --- a/src/manifest/orchestrator/lifecycle_support.zig +++ b/src/manifest/orchestrator/lifecycle_support.zig @@ -82,23 +82,30 @@ pub fn startAll(self: anytype, comptime OrchestratorError: type, serviceThreadFn }; } + finishRuntimeSetup(self); +} + +pub fn finishRuntimeSetup(self: anytype) void { self.registerHealthChecks(); self.startTlsProxy(); + startCronSchedulerIfNeeded(self); +} - if (self.service_filter == null and self.manifest.crons.len > 0) { - const cs = self.alloc.create(cron_scheduler.CronScheduler) catch { - writeErr("failed to allocate cron scheduler\n", .{}); - return; - }; - cs.* = cron_scheduler.CronScheduler.init(self.alloc, self.manifest.crons, self.manifest.volumes, self.app_name) catch { 
- self.alloc.destroy(cs); - writeErr("failed to init cron scheduler\n", .{}); - return; - }; - self.cron_sched = cs; - cs.start(); - writeErr("{d} cron(s) scheduled\n", .{self.manifest.crons.len}); - } +fn startCronSchedulerIfNeeded(self: anytype) void { + if (self.service_filter != null or self.manifest.crons.len == 0 or self.cron_sched != null) return; + + const cs = self.alloc.create(cron_scheduler.CronScheduler) catch { + writeErr("failed to allocate cron scheduler\n", .{}); + return; + }; + cs.* = cron_scheduler.CronScheduler.init(self.alloc, self.manifest.crons, self.manifest.volumes, self.app_name) catch { + self.alloc.destroy(cs); + writeErr("failed to init cron scheduler\n", .{}); + return; + }; + self.cron_sched = cs; + cs.start(); + writeErr("{d} cron(s) scheduled\n", .{self.manifest.crons.len}); } pub fn startServiceByIndex( From 7845968f4f8c958d821e3b393c01febfd4e2a47b Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:49:29 +0000 Subject: [PATCH 06/14] Register cluster cron schedules on app apply --- src/api/routes/cluster_agents/app_routes.zig | 69 +++++++++ .../routes/cluster_agents/deploy_routes.zig | 23 +++ src/manifest/app_snapshot.zig | 50 +++++++ src/state/schema/migrations.zig | 15 ++ src/state/schema/tables.zig | 11 ++ src/state/store.zig | 6 + src/state/store/crons.zig | 132 ++++++++++++++++++ src/test_root.zig | 1 + 8 files changed, 307 insertions(+) create mode 100644 src/state/store/crons.zig diff --git a/src/api/routes/cluster_agents/app_routes.zig b/src/api/routes/cluster_agents/app_routes.zig index 34b5f97b..38aa48f1 100644 --- a/src/api/routes/cluster_agents/app_routes.zig +++ b/src/api/routes/cluster_agents/app_routes.zig @@ -714,6 +714,32 @@ test "app apply then rollback routes preserve release transition metadata" { try expectJsonContains(history_response.body, source_release_id); } +test "app apply registers cluster cron schedules from snapshot" { + const alloc = std.testing.allocator; + const apply_body = + 
\\{"app_name":"demo-app","services":[],"workers":[],"crons":[{"name":"nightly","image":"alpine","command":["/bin/sh","-c","echo cron"],"every":3600}],"training_jobs":[]} + ; + + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + const apply_response = harness.appApply(apply_body); + defer freeResponse(alloc, apply_response); + + try expectResponseOk(apply_response); + try expectJsonContains(apply_response.body, "\"cron_count\":1"); + + var schedules = try store.listCronSchedulesByAppInDb(harness.node.stateMachineDb(), alloc, "demo-app"); + defer { + for (schedules.items) |schedule| schedule.deinit(alloc); + schedules.deinit(alloc); + } + + try std.testing.expectEqual(@as(usize, 1), schedules.items.len); + try std.testing.expectEqualStrings("nightly", schedules.items[0].name); + try std.testing.expectEqual(@as(i64, 3600), schedules.items[0].every); +} + test "app rollback restores worker and training workload snapshot" { const alloc = std.testing.allocator; const first_apply_body = @@ -749,6 +775,13 @@ test "app rollback restores worker and training workload snapshot" { try std.testing.expect(std.mem.indexOf(u8, latest.config_snapshot, "\"training_jobs\":[{\"name\":\"finetune\"") != null); try std.testing.expect(std.mem.indexOf(u8, latest.config_snapshot, "\"crons\":[]") != null); + var schedules = try store.listCronSchedulesByAppInDb(harness.node.stateMachineDb(), alloc, "demo-app"); + defer { + for (schedules.items) |schedule| schedule.deinit(alloc); + schedules.deinit(alloc); + } + try std.testing.expectEqual(@as(usize, 0), schedules.items.len); + const status_response = harness.status("demo-app"); defer freeResponse(alloc, status_response); try expectResponseOk(status_response); @@ -757,6 +790,42 @@ test "app rollback restores worker and training workload snapshot" { try expectJsonContains(status_response.body, "\"cron_count\":0"); } +test "app rollback restores cluster cron schedules from selected release" { + const alloc = 
std.testing.allocator; + const first_apply_body = + \\{"app_name":"demo-app","services":[],"workers":[],"crons":[{"name":"cleanup","image":"alpine","command":["/bin/sh","-c","echo first"],"every":60}],"training_jobs":[]} + ; + const second_apply_body = + \\{"app_name":"demo-app","services":[],"workers":[],"crons":[{"name":"backup","image":"alpine","command":["/bin/sh","-c","echo second"],"every":3600}],"training_jobs":[]} + ; + + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + const first_apply_response = harness.appApply(first_apply_body); + defer freeResponse(alloc, first_apply_response); + try expectResponseOk(first_apply_response); + const source_release_id = json_helpers.extractJsonString(first_apply_response.body, "release_id").?; + + const second_apply_response = harness.appApply(second_apply_body); + defer freeResponse(alloc, second_apply_response); + try expectResponseOk(second_apply_response); + + const rollback_response = try harness.rollback("demo-app", source_release_id); + defer freeResponse(alloc, rollback_response); + try expectResponseOk(rollback_response); + + var schedules = try store.listCronSchedulesByAppInDb(harness.node.stateMachineDb(), alloc, "demo-app"); + defer { + for (schedules.items) |schedule| schedule.deinit(alloc); + schedules.deinit(alloc); + } + + try std.testing.expectEqual(@as(usize, 1), schedules.items.len); + try std.testing.expectEqualStrings("cleanup", schedules.items[0].name); + try std.testing.expectEqual(@as(i64, 60), schedules.items[0].every); +} + test "app apply route preserves failed release metadata across reads" { const alloc = std.testing.allocator; const apply_body = diff --git a/src/api/routes/cluster_agents/deploy_routes.zig b/src/api/routes/cluster_agents/deploy_routes.zig index 26c77ca4..15d9d14a 100644 --- a/src/api/routes/cluster_agents/deploy_routes.zig +++ b/src/api/routes/cluster_agents/deploy_routes.zig @@ -9,6 +9,7 @@ const apply_request = @import("apply_request.zig"); 
const volumes_mod = @import("../../../state/volumes.zig"); const agent_registry = @import("../../../cluster/registry.zig"); const deployment_store = @import("../../../manifest/update/deployment_store.zig"); +const store = @import("../../../state/store.zig"); const common = @import("../common.zig"); const Response = common.Response; @@ -294,6 +295,12 @@ fn handleApply( const apply_report = apply_result.toReport(parsed.app_name orelse "", parsed.requests.items.len, apply_context); defer apply_report.deinit(alloc); + if (parsed.app_name) |app_name| { + if (apply_result.outcome.status != .failed) { + reconcileCronSchedules(db, alloc, app_name, request.body) catch return common.internalError(); + } + } + const body = switch (response_mode) { .legacy => formatLegacyApplyResponse(alloc, apply_report.placed, apply_report.failed) catch return common.internalError(), .app => formatAppApplyResponse(alloc, apply_report, parsed.summary) catch return common.internalError(), @@ -301,6 +308,22 @@ fn handleApply( return .{ .status = .ok, .body = body, .allocated = true }; } +fn reconcileCronSchedules(db: *sqlite.Db, alloc: std.mem.Allocator, app_name: []const u8, config_snapshot: []const u8) !void { + var schedules = try app_snapshot.listCronSchedules(alloc, config_snapshot); + defer { + for (schedules.items) |schedule| schedule.deinit(alloc); + schedules.deinit(alloc); + } + + try store.replaceCronSchedulesForAppInDb( + db, + alloc, + app_name, + schedules.items, + std.time.timestamp(), + ); +} + pub fn handleAppApply(alloc: std.mem.Allocator, request: @import("../../http.zig").Request, ctx: RouteContext) Response { return handleApply(alloc, request, ctx, .app, .{}); } diff --git a/src/manifest/app_snapshot.zig b/src/manifest/app_snapshot.zig index a8941d4b..7c3c6bd8 100644 --- a/src/manifest/app_snapshot.zig +++ b/src/manifest/app_snapshot.zig @@ -41,6 +41,17 @@ pub const TrainingJobSpec = struct { } }; +pub const CronScheduleSpec = struct { + name: []const u8, + every: u64, + 
spec_json: []const u8, + + pub fn deinit(self: CronScheduleSpec, alloc: std.mem.Allocator) void { + alloc.free(self.name); + alloc.free(self.spec_json); + } +}; + pub fn summarize(json: []const u8) Summary { return .{ .service_count = countArrayObjects(json, "services"), @@ -103,6 +114,28 @@ pub fn findTrainingJobSpec(alloc: std.mem.Allocator, json: []const u8, name: []c }; } +pub fn listCronSchedules(alloc: std.mem.Allocator, json: []const u8) !std.ArrayList(CronScheduleSpec) { + var specs: std.ArrayList(CronScheduleSpec) = .empty; + errdefer { + for (specs.items) |spec| spec.deinit(alloc); + specs.deinit(alloc); + } + + const array = json_helpers.extractJsonArray(json, "crons") orelse return specs; + var iter = json_helpers.extractJsonObjects(array); + while (iter.next()) |obj| { + const name = json_helpers.extractJsonString(obj, "name") orelse continue; + const every = json_helpers.extractJsonInt(obj, "every") orelse continue; + try specs.append(alloc, .{ + .name = try alloc.dupe(u8, name), + .every = @intCast(@max(@as(i64, 0), every)), + .spec_json = try alloc.dupe(u8, obj), + }); + } + + return specs; +} + fn countArrayObjects(json: []const u8, key: []const u8) usize { const array = json_helpers.extractJsonArray(json, key) orelse return 0; var count: usize = 0; @@ -211,3 +244,20 @@ test "findTrainingJobSpec extracts training scheduler fields" { try std.testing.expectEqual(@as(i64, 131072), job.memory_limit_mb); try std.testing.expectEqualStrings("/ckpt", job.checkpoint_path.?); } + +test "listCronSchedules extracts cron registration specs" { + const alloc = std.testing.allocator; + var schedules = try listCronSchedules( + alloc, + \\{"app_name":"demo","services":[],"workers":[],"crons":[{"name":"cleanup","image":"alpine","command":["/bin/sh"],"every":60},{"name":"backup","image":"postgres","command":["/bin/sh"],"every":3600}],"training_jobs":[]} + ); + defer { + for (schedules.items) |schedule| schedule.deinit(alloc); + schedules.deinit(alloc); + } + + try 
std.testing.expectEqual(@as(usize, 2), schedules.items.len); + try std.testing.expectEqualStrings("cleanup", schedules.items[0].name); + try std.testing.expectEqual(@as(u64, 60), schedules.items[0].every); + try std.testing.expect(std.mem.indexOf(u8, schedules.items[1].spec_json, "\"name\":\"backup\"") != null); +} diff --git a/src/state/schema/migrations.zig b/src/state/schema/migrations.zig index a5e04a93..dccad941 100644 --- a/src/state/schema/migrations.zig +++ b/src/state/schema/migrations.zig @@ -8,6 +8,7 @@ pub fn apply(db: *sqlite.Db) SchemaError!void { migrateAgents(db); migrateServices(db); migrateDeployments(db); + migrateCronSchedules(db); } fn migrateContainers(db: *sqlite.Db) void { @@ -135,6 +136,20 @@ fn migrateDeployments(db: *sqlite.Db) void { db.exec("UPDATE deployments SET trigger = 'apply' WHERE trigger IS NULL OR trigger = '';", .{}, .{}) catch {}; } +fn migrateCronSchedules(db: *sqlite.Db) void { + createTableIfMissing(db, + \\CREATE TABLE IF NOT EXISTS cron_schedules ( + \\ app_name TEXT NOT NULL, + \\ name TEXT NOT NULL, + \\ every INTEGER NOT NULL, + \\ spec_json TEXT NOT NULL, + \\ created_at INTEGER NOT NULL, + \\ updated_at INTEGER NOT NULL, + \\ PRIMARY KEY (app_name, name) + \\); + ) catch {}; +} + fn addColumnIfMissing(db: *sqlite.Db, sql: []const u8) SchemaError!void { db.execDynamic(sql, .{}, .{}) catch { const err_msg = std.mem.span(sqlite.c.sqlite3_errmsg(db.db)); diff --git a/src/state/schema/tables.zig b/src/state/schema/tables.zig index b3b77f3a..08f4b26f 100644 --- a/src/state/schema/tables.zig +++ b/src/state/schema/tables.zig @@ -226,6 +226,17 @@ pub fn initClusterTables(db: *sqlite.Db) SchemaError!void { \\ PRIMARY KEY (node_id) \\); ); + try exec(db, + \\CREATE TABLE IF NOT EXISTS cron_schedules ( + \\ app_name TEXT NOT NULL, + \\ name TEXT NOT NULL, + \\ every INTEGER NOT NULL, + \\ spec_json TEXT NOT NULL, + \\ created_at INTEGER NOT NULL, + \\ updated_at INTEGER NOT NULL, + \\ PRIMARY KEY (app_name, name) + \\); + ); } 
pub fn initSecurityTables(db: *sqlite.Db) SchemaError!void { diff --git a/src/state/store.zig b/src/state/store.zig index 331326f0..8b538091 100644 --- a/src/state/store.zig +++ b/src/state/store.zig @@ -23,6 +23,7 @@ pub const ServiceEndpointRecord = @import("store/services.zig").ServiceEndpointR pub const ServiceNameRecord = @import("store/services.zig").ServiceNameRecord; pub const NetworkPolicyRecord = @import("store/services.zig").NetworkPolicyRecord; pub const DeploymentRecord = @import("store/deployments.zig").DeploymentRecord; +pub const CronScheduleRecord = @import("store/crons.zig").CronScheduleRecord; pub const TrainingJobRecord = @import("store/training.zig").TrainingJobRecord; pub const CheckpointRecord = @import("store/training.zig").CheckpointRecord; @@ -98,6 +99,11 @@ pub const getLastSuccessfulDeploymentByApp = @import("store/deployments.zig").ge pub const getPreviousSuccessfulDeploymentByApp = @import("store/deployments.zig").getPreviousSuccessfulDeploymentByApp; pub const getPreviousSuccessfulDeploymentByAppInDb = @import("store/deployments.zig").getPreviousSuccessfulDeploymentByAppInDb; +pub const replaceCronSchedulesForApp = @import("store/crons.zig").replaceCronSchedulesForApp; +pub const replaceCronSchedulesForAppInDb = @import("store/crons.zig").replaceCronSchedulesForAppInDb; +pub const listCronSchedulesByApp = @import("store/crons.zig").listCronSchedulesByApp; +pub const listCronSchedulesByAppInDb = @import("store/crons.zig").listCronSchedulesByAppInDb; + pub const saveTrainingJob = @import("store/training.zig").saveTrainingJob; pub const saveTrainingJobInDb = @import("store/training.zig").saveTrainingJobInDb; pub const updateTrainingJobState = @import("store/training.zig").updateTrainingJobState; diff --git a/src/state/store/crons.zig b/src/state/store/crons.zig new file mode 100644 index 00000000..a23970f7 --- /dev/null +++ b/src/state/store/crons.zig @@ -0,0 +1,132 @@ +const std = @import("std"); +const sqlite = @import("sqlite"); 
+const common = @import("common.zig"); + +const Allocator = std.mem.Allocator; +const StoreError = common.StoreError; + +pub const CronScheduleRecord = struct { + app_name: []const u8, + name: []const u8, + every: i64, + spec_json: []const u8, + created_at: i64, + updated_at: i64, + + pub fn deinit(self: CronScheduleRecord, alloc: Allocator) void { + alloc.free(self.app_name); + alloc.free(self.name); + alloc.free(self.spec_json); + } +}; + +const cron_columns = + "app_name, name, every, spec_json, created_at, updated_at"; + +const CronScheduleRow = struct { + app_name: sqlite.Text, + name: sqlite.Text, + every: i64, + spec_json: sqlite.Text, + created_at: i64, + updated_at: i64, +}; + +fn rowToRecord(row: CronScheduleRow) CronScheduleRecord { + return .{ + .app_name = row.app_name.data, + .name = row.name.data, + .every = row.every, + .spec_json = row.spec_json.data, + .created_at = row.created_at, + .updated_at = row.updated_at, + }; +} + +pub fn replaceCronSchedulesForApp( + alloc: Allocator, + app_name: []const u8, + schedules: []const @import("../../manifest/app_snapshot.zig").CronScheduleSpec, + now: i64, +) StoreError!void { + const db = try common.getDb(); + return replaceCronSchedulesForAppInDb(db, alloc, app_name, schedules, now); +} + +pub fn replaceCronSchedulesForAppInDb( + db: *sqlite.Db, + alloc: Allocator, + app_name: []const u8, + schedules: []const @import("../../manifest/app_snapshot.zig").CronScheduleSpec, + now: i64, +) StoreError!void { + db.exec("DELETE FROM cron_schedules WHERE app_name = ?;", .{}, .{app_name}) catch return StoreError.WriteFailed; + + for (schedules) |schedule| { + db.exec( + "INSERT INTO cron_schedules (" ++ cron_columns ++ ") VALUES (?, ?, ?, ?, ?, ?);", + .{}, + .{ + app_name, + schedule.name, + @as(i64, @intCast(schedule.every)), + schedule.spec_json, + now, + now, + }, + ) catch return StoreError.WriteFailed; + } + + _ = alloc; +} + +pub fn listCronSchedulesByApp(alloc: Allocator, app_name: []const u8) 
StoreError!std.ArrayList(CronScheduleRecord) { + const db = try common.getDb(); + return listCronSchedulesByAppInDb(db, alloc, app_name); +} + +pub fn listCronSchedulesByAppInDb( + db: *sqlite.Db, + alloc: Allocator, + app_name: []const u8, +) StoreError!std.ArrayList(CronScheduleRecord) { + var records: std.ArrayList(CronScheduleRecord) = .empty; + var stmt = db.prepare( + "SELECT " ++ cron_columns ++ " FROM cron_schedules WHERE app_name = ? ORDER BY name ASC;", + ) catch return StoreError.ReadFailed; + defer stmt.deinit(); + var iter = stmt.iterator(CronScheduleRow, .{app_name}) catch return StoreError.ReadFailed; + while (iter.nextAlloc(alloc, .{}) catch return StoreError.ReadFailed) |row| { + records.append(alloc, rowToRecord(row)) catch return StoreError.ReadFailed; + } + return records; +} + +test "replaceCronSchedulesForAppInDb swaps active schedules for app" { + const alloc = std.testing.allocator; + var db = try sqlite.Db.init(.{ .mode = .Memory, .open_flags = .{ .write = true } }); + defer db.deinit(); + try @import("../schema.zig").init(&db); + + const app_snapshot = @import("../../manifest/app_snapshot.zig"); + const first = [_]app_snapshot.CronScheduleSpec{ + .{ .name = "cleanup", .every = 60, .spec_json = "{\"name\":\"cleanup\",\"every\":60}" }, + }; + const second = [_]app_snapshot.CronScheduleSpec{ + .{ .name = "backup", .every = 3600, .spec_json = "{\"name\":\"backup\",\"every\":3600}" }, + }; + + try replaceCronSchedulesForAppInDb(&db, alloc, "demo-app", &first, 100); + try replaceCronSchedulesForAppInDb(&db, alloc, "demo-app", &second, 200); + + var records = try listCronSchedulesByAppInDb(&db, alloc, "demo-app"); + defer { + for (records.items) |record| record.deinit(alloc); + records.deinit(alloc); + } + + try std.testing.expectEqual(@as(usize, 1), records.items.len); + try std.testing.expectEqualStrings("backup", records.items[0].name); + try std.testing.expectEqual(@as(i64, 3600), records.items[0].every); + try 
std.testing.expectEqual(@as(i64, 200), records.items[0].updated_at); +} diff --git a/src/test_root.zig b/src/test_root.zig index 1c2c5646..982fc233 100644 --- a/src/test_root.zig +++ b/src/test_root.zig @@ -19,6 +19,7 @@ comptime { _ = @import("runtime/container_commands.zig"); _ = @import("runtime/run_state.zig"); _ = @import("state/store.zig"); + _ = @import("state/store/crons.zig"); _ = @import("state/store/deployments.zig"); _ = @import("state/schema.zig"); _ = @import("state/commands.zig"); From cf2244d9952a9575bb69f63b7fe255905c8b76a4 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:55:19 +0000 Subject: [PATCH 07/14] Show app training runtime summaries --- src/api/routes/cluster_agents/app_routes.zig | 22 +++- src/runtime/cli/status_command.zig | 92 +++++++++++++- src/state/store.zig | 5 + src/state/store/training.zig | 126 +++++++++++++++++++ src/test_root.zig | 1 + 5 files changed, 234 insertions(+), 12 deletions(-) diff --git a/src/api/routes/cluster_agents/app_routes.zig b/src/api/routes/cluster_agents/app_routes.zig index 38aa48f1..9cbece05 100644 --- a/src/api/routes/cluster_agents/app_routes.zig +++ b/src/api/routes/cluster_agents/app_routes.zig @@ -84,7 +84,7 @@ pub fn handleAppStatus(alloc: std.mem.Allocator, app_name: []const u8, ctx: Rout ) catch return common.internalError(); defer if (previous_successful) |dep| dep.deinit(alloc); - const body = formatAppStatusResponseFromDeployments(alloc, latest, previous_successful) catch return common.internalError(); + const body = formatAppStatusResponseFromDeployments(alloc, node.stateMachineDb(), latest, previous_successful) catch return common.internalError(); return .{ .status = .ok, .body = body, .allocated = true }; } @@ -136,7 +136,7 @@ fn formatAppsResponse( defer if (previous_successful) |dep| dep.deinit(alloc); if (i > 0) try writer.writeByte(','); - const json = try formatAppStatusResponseFromDeployments(alloc, latest, previous_successful); + const json = try 
formatAppStatusResponseFromDeployments(alloc, db, latest, previous_successful); defer alloc.free(json); try writer.writeAll(json); } @@ -158,6 +158,7 @@ fn loadPreviousSuccessfulDeployment( fn formatAppStatusResponseFromDeployments( alloc: std.mem.Allocator, + db: *sqlite.Db, latest: store.DeploymentRecord, previous_successful: ?store.DeploymentRecord, ) ![]u8 { @@ -166,6 +167,7 @@ fn formatAppStatusResponseFromDeployments( apply_release.reportFromDeployment(latest), if (previous_successful) |dep| apply_release.reportFromDeployment(dep) else null, app_snapshot.summarize(latest.config_snapshot), + store.summarizeTrainingJobsByAppInDb(db, alloc, latest.app_name.?) catch .{}, ); } @@ -216,6 +218,7 @@ fn formatAppStatusResponse( report: apply_release.ApplyReport, previous_successful: ?apply_release.ApplyReport, summary: app_snapshot.Summary, + training_summary: store.TrainingJobSummary, ) ![]u8 { var json_buf: std.ArrayList(u8) = .empty; errdefer json_buf.deinit(alloc); @@ -231,12 +234,15 @@ fn formatAppStatusResponse( try json_helpers.writeJsonStringField(writer, "status", report.status.toString()); try writer.writeByte(','); try json_helpers.writeJsonStringField(writer, "manifest_hash", report.manifest_hash); - try writer.print(",\"created_at\":{d},\"service_count\":{d},\"worker_count\":{d},\"cron_count\":{d},\"training_job_count\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ + try writer.print(",\"created_at\":{d},\"service_count\":{d},\"worker_count\":{d},\"cron_count\":{d},\"training_job_count\":{d},\"active_training_jobs\":{d},\"paused_training_jobs\":{d},\"failed_training_jobs\":{d},\"completed_targets\":{d},\"failed_targets\":{d},\"remaining_targets\":{d}", .{ report.created_at, summary.service_count, summary.worker_count, summary.cron_count, summary.training_job_count, + training_summary.active, + training_summary.paused, + training_summary.failed, report.completed_targets, report.failed_targets, 
report.remainingTargets(), @@ -401,7 +407,7 @@ test "formatAppStatusResponse summarizes latest release" { .created_at = 200, }; - const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot)); + const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot), .{}); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"demo-app\"") != null); @@ -505,7 +511,7 @@ test "formatAppStatusResponse includes structured rollback metadata" { .created_at = 300, }; - const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot)); + const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot), .{}); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"trigger\":\"rollback\"") != null); @@ -527,7 +533,7 @@ test "formatAppStatusResponse falls back to rollback metadata inferred from lega .created_at = 400, }; - const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot)); + const json = try formatAppStatusResponse(alloc, apply_release.reportFromDeployment(latest), null, app_snapshot.summarize(latest.config_snapshot), .{}); defer alloc.free(json); try std.testing.expect(std.mem.indexOf(u8, json, "\"trigger\":\"rollback\"") != null); @@ -590,6 +596,8 @@ test "app status and history surface rollback release metadata from persisted ro alloc, apply_release.reportFromDeployment(latest), apply_release.reportFromDeployment(previous_successful), + app_snapshot.summarize(latest.config_snapshot), + .{}, ); defer alloc.free(status_json); @@ -654,6 +662,8 @@ test "app status and history surface failed apply metadata from 
persisted rows" alloc, apply_release.reportFromDeployment(latest), apply_release.reportFromDeployment(previous_successful), + app_snapshot.summarize(latest.config_snapshot), + .{}, ); defer alloc.free(status_json); diff --git a/src/runtime/cli/status_command.zig b/src/runtime/cli/status_command.zig index c8907c7b..cef4997b 100644 --- a/src/runtime/cli/status_command.zig +++ b/src/runtime/cli/status_command.zig @@ -135,6 +135,9 @@ const AppStatusSnapshot = struct { worker_count: usize = 0, cron_count: usize = 0, training_job_count: usize = 0, + active_training_jobs: usize = 0, + paused_training_jobs: usize = 0, + failed_training_jobs: usize = 0, completed_targets: usize, failed_targets: usize, remaining_targets: usize, @@ -371,8 +374,8 @@ fn printAppStatuses(snapshots: []const AppStatusSnapshot) void { } fn printAppStatusHeader() void { - write("{s:<14} {s:<14} {s:<14} {s:<11} {s:<20} {s:<22} {s:<14} {s}\n", .{ - "APP", "RELEASE", "STATUS", "KINDS", "TIMESTAMP", "TARGETS", "PREV OK", "MESSAGE", + write("{s:<14} {s:<14} {s:<14} {s:<11} {s:<20} {s:<22} {s:<18} {s:<14} {s}\n", .{ + "APP", "RELEASE", "STATUS", "KINDS", "TIMESTAMP", "TARGETS", "TRAINING", "PREV OK", "MESSAGE", }); } @@ -390,19 +393,22 @@ fn printAppStatusRow(snapshot: AppStatusSnapshot) void { snapshot.cron_count, snapshot.training_job_count, }) catch "?"; + var training_buf: [48]u8 = undefined; + const training_str = formatTrainingRuntime(&training_buf, snapshot); const previous_successful = if (snapshot.previous_successful_release_id) |release_id| cli.truncate(release_id, 12) else "-"; - write("{s:<14} {s:<14} {s:<14} {s:<11} {s:<20} {s:<22} {s:<14} {s}\n", .{ + write("{s:<14} {s:<14} {s:<14} {s:<11} {s:<20} {s:<22} {s:<18} {s:<14} {s}\n", .{ snapshot.app_name, cli.truncate(snapshot.release_id, 12), snapshot.status, kinds_str, ts_str, progress_str, + training_str, previous_successful, cli.truncate(msg, 40), }); @@ -425,6 +431,17 @@ fn formatAppProgress(buf: []u8, snapshot: AppStatusSnapshot) []const u8 
{ }) catch "?"; } +fn formatTrainingRuntime(buf: []u8, snapshot: AppStatusSnapshot) []const u8 { + if (snapshot.active_training_jobs == 0 and snapshot.paused_training_jobs == 0 and snapshot.failed_training_jobs == 0) { + return "-"; + } + return std.fmt.bufPrint(buf, "{d} act, {d} pause, {d} fail", .{ + snapshot.active_training_jobs, + snapshot.paused_training_jobs, + snapshot.failed_training_jobs, + }) catch "?"; +} + fn parseAppStatusResponse(json: []const u8) AppStatusSnapshot { return .{ .app_name = extractJsonString(json, "app_name") orelse "?", @@ -437,6 +454,9 @@ fn parseAppStatusResponse(json: []const u8) AppStatusSnapshot { .worker_count = @intCast(@max(0, extractJsonInt(json, "worker_count") orelse 0)), .cron_count = @intCast(@max(0, extractJsonInt(json, "cron_count") orelse 0)), .training_job_count = @intCast(@max(0, extractJsonInt(json, "training_job_count") orelse 0)), + .active_training_jobs = @intCast(@max(0, extractJsonInt(json, "active_training_jobs") orelse 0)), + .paused_training_jobs = @intCast(@max(0, extractJsonInt(json, "paused_training_jobs") orelse 0)), + .failed_training_jobs = @intCast(@max(0, extractJsonInt(json, "failed_training_jobs") orelse 0)), .completed_targets = @intCast(@max(0, extractJsonInt(json, "completed_targets") orelse 0)), .failed_targets = @intCast(@max(0, extractJsonInt(json, "failed_targets") orelse 0)), .remaining_targets = @intCast(@max(0, extractJsonInt(json, "remaining_targets") orelse 0)), @@ -460,6 +480,9 @@ fn writeAppStatusJsonObject(w: *json_out.JsonWriter, snapshot: AppStatusSnapshot w.uintField("worker_count", snapshot.worker_count); w.uintField("cron_count", snapshot.cron_count); w.uintField("training_job_count", snapshot.training_job_count); + w.uintField("active_training_jobs", snapshot.active_training_jobs); + w.uintField("paused_training_jobs", snapshot.paused_training_jobs); + w.uintField("failed_training_jobs", snapshot.failed_training_jobs); w.uintField("completed_targets", 
snapshot.completed_targets); w.uintField("failed_targets", snapshot.failed_targets); w.uintField("remaining_targets", snapshot.remaining_targets); @@ -474,6 +497,7 @@ fn appStatusFromReports( report: apply_release.ApplyReport, previous_successful: ?apply_release.ApplyReport, summary: app_snapshot.Summary, + training_summary: store.TrainingJobSummary, ) AppStatusSnapshot { return .{ .app_name = report.app_name, @@ -486,6 +510,9 @@ fn appStatusFromReports( .worker_count = summary.worker_count, .cron_count = summary.cron_count, .training_job_count = summary.training_job_count, + .active_training_jobs = training_summary.active, + .paused_training_jobs = training_summary.paused, + .failed_training_jobs = training_summary.failed, .completed_targets = report.completed_targets, .failed_targets = report.failed_targets, .remaining_targets = report.remainingTargets(), @@ -505,6 +532,7 @@ fn snapshotFromDeployments( apply_release.reportFromDeployment(latest), if (previous_successful) |dep| apply_release.reportFromDeployment(dep) else null, app_snapshot.summarize(latest.config_snapshot), + store.summarizeTrainingJobsByApp(latest.app_name.?) 
catch .{}, ); } @@ -615,7 +643,7 @@ fn parsePsiFromJson(json: []const u8, some_key: []const u8, full_key: []const u8 test "parseAppStatusResponse extracts app fields" { const snapshot = parseAppStatusResponse( - \\{"app_name":"demo-app","trigger":"apply","release_id":"abc123def456","status":"completed","manifest_hash":"sha256:123","created_at":42,"service_count":2,"worker_count":1,"cron_count":3,"training_job_count":4,"completed_targets":2,"failed_targets":0,"remaining_targets":0,"source_release_id":null,"message":null} + \\{"app_name":"demo-app","trigger":"apply","release_id":"abc123def456","status":"completed","manifest_hash":"sha256:123","created_at":42,"service_count":2,"worker_count":1,"cron_count":3,"training_job_count":4,"active_training_jobs":2,"paused_training_jobs":1,"failed_training_jobs":1,"completed_targets":2,"failed_targets":0,"remaining_targets":0,"source_release_id":null,"message":null} ); try std.testing.expectEqualStrings("demo-app", snapshot.app_name); @@ -628,6 +656,9 @@ test "parseAppStatusResponse extracts app fields" { try std.testing.expectEqual(@as(usize, 1), snapshot.worker_count); try std.testing.expectEqual(@as(usize, 3), snapshot.cron_count); try std.testing.expectEqual(@as(usize, 4), snapshot.training_job_count); + try std.testing.expectEqual(@as(usize, 2), snapshot.active_training_jobs); + try std.testing.expectEqual(@as(usize, 1), snapshot.paused_training_jobs); + try std.testing.expectEqual(@as(usize, 1), snapshot.failed_training_jobs); try std.testing.expectEqual(@as(usize, 2), snapshot.completed_targets); try std.testing.expectEqual(@as(usize, 0), snapshot.failed_targets); try std.testing.expectEqual(@as(usize, 0), snapshot.remaining_targets); @@ -653,7 +684,7 @@ test "appStatusFromReport matches remote app status shape" { .created_at = 200, }; - const local = appStatusFromReports(report, null); + const local = appStatusFromReports(report, null, .{ .service_count = 2 }, .{}); const remote = parseAppStatusResponse( 
\\{"app_name":"demo-app","trigger":"apply","release_id":"dep-2","status":"completed","manifest_hash":"sha256:222","created_at":200,"service_count":2,"completed_targets":2,"failed_targets":0,"remaining_targets":0,"source_release_id":null,"message":"all placements healthy"} ); @@ -685,6 +716,9 @@ test "writeAppStatusJsonObject round-trips through remote parser" { .worker_count = 1, .cron_count = 2, .training_job_count = 3, + .active_training_jobs = 1, + .paused_training_jobs = 1, + .failed_training_jobs = 1, .completed_targets = 1, .failed_targets = 1, .remaining_targets = 0, @@ -709,6 +743,9 @@ test "writeAppStatusJsonObject round-trips through remote parser" { try std.testing.expectEqual(snapshot.worker_count, parsed.worker_count); try std.testing.expectEqual(snapshot.cron_count, parsed.cron_count); try std.testing.expectEqual(snapshot.training_job_count, parsed.training_job_count); + try std.testing.expectEqual(snapshot.active_training_jobs, parsed.active_training_jobs); + try std.testing.expectEqual(snapshot.paused_training_jobs, parsed.paused_training_jobs); + try std.testing.expectEqual(snapshot.failed_training_jobs, parsed.failed_training_jobs); try std.testing.expectEqual(snapshot.completed_targets, parsed.completed_targets); try std.testing.expectEqual(snapshot.failed_targets, parsed.failed_targets); try std.testing.expectEqual(snapshot.remaining_targets, parsed.remaining_targets); @@ -746,7 +783,7 @@ test "appStatusFromReport preserves partially failed local release state" { .created_at = 200, }; - const local = appStatusFromReports(apply_release.reportFromDeployment(dep), previous_successful); + const local = appStatusFromReports(apply_release.reportFromDeployment(dep), previous_successful, .{ .service_count = 2 }, .{}); const remote = parseAppStatusResponse( 
\\{"app_name":"demo-app","trigger":"apply","release_id":"dep-3","status":"partially_failed","manifest_hash":"sha256:333","created_at":300,"service_count":2,"completed_targets":1,"failed_targets":1,"remaining_targets":0,"source_release_id":null,"previous_successful_release_id":"dep-2","previous_successful_manifest_hash":"sha256:222","previous_successful_created_at":200,"message":"one or more placements failed"} ); @@ -828,3 +865,46 @@ test "formatAppProgress summarizes in-flight and partial outcomes" { }; try std.testing.expectEqualStrings("2 ok", formatAppProgress(&buf, completed)); } + +test "formatTrainingRuntime summarizes active paused and failed jobs" { + var buf: [48]u8 = undefined; + + const empty = AppStatusSnapshot{ + .app_name = "demo-app", + .trigger = "apply", + .release_id = "dep-1", + .status = "completed", + .manifest_hash = "sha256:111", + .created_at = 100, + .completed_targets = 0, + .failed_targets = 0, + .remaining_targets = 0, + .source_release_id = null, + .previous_successful_release_id = null, + .previous_successful_manifest_hash = null, + .previous_successful_created_at = null, + .message = null, + }; + try std.testing.expectEqualStrings("-", formatTrainingRuntime(&buf, empty)); + + const active = AppStatusSnapshot{ + .app_name = "demo-app", + .trigger = "apply", + .release_id = "dep-2", + .status = "completed", + .manifest_hash = "sha256:222", + .created_at = 200, + .active_training_jobs = 2, + .paused_training_jobs = 1, + .failed_training_jobs = 1, + .completed_targets = 0, + .failed_targets = 0, + .remaining_targets = 0, + .source_release_id = null, + .previous_successful_release_id = null, + .previous_successful_manifest_hash = null, + .previous_successful_created_at = null, + .message = null, + }; + try std.testing.expectEqualStrings("2 act, 1 pause, 1 fail", formatTrainingRuntime(&buf, active)); +} diff --git a/src/state/store.zig b/src/state/store.zig index 8b538091..181f83f8 100644 --- a/src/state/store.zig +++ b/src/state/store.zig 
@@ -25,6 +25,7 @@ pub const NetworkPolicyRecord = @import("store/services.zig").NetworkPolicyRecor pub const DeploymentRecord = @import("store/deployments.zig").DeploymentRecord; pub const CronScheduleRecord = @import("store/crons.zig").CronScheduleRecord; pub const TrainingJobRecord = @import("store/training.zig").TrainingJobRecord; +pub const TrainingJobSummary = @import("store/training.zig").TrainingJobSummary; pub const CheckpointRecord = @import("store/training.zig").CheckpointRecord; pub const initTestDb = common.initTestDb; @@ -116,6 +117,10 @@ pub const findTrainingJob = @import("store/training.zig").findTrainingJob; pub const findTrainingJobInDb = @import("store/training.zig").findTrainingJobInDb; pub const getTrainingJob = @import("store/training.zig").getTrainingJob; pub const getTrainingJobInDb = @import("store/training.zig").getTrainingJobInDb; +pub const listTrainingJobsByApp = @import("store/training.zig").listTrainingJobsByApp; +pub const listTrainingJobsByAppInDb = @import("store/training.zig").listTrainingJobsByAppInDb; +pub const summarizeTrainingJobsByApp = @import("store/training.zig").summarizeTrainingJobsByApp; +pub const summarizeTrainingJobsByAppInDb = @import("store/training.zig").summarizeTrainingJobsByAppInDb; pub const saveCheckpoint = @import("store/training.zig").saveCheckpoint; pub const getLatestCheckpoint = @import("store/training.zig").getLatestCheckpoint; pub const listCheckpoints = @import("store/training.zig").listCheckpoints; diff --git a/src/state/store/training.zig b/src/state/store/training.zig index 0b2f20f4..247e30c2 100644 --- a/src/state/store/training.zig +++ b/src/state/store/training.zig @@ -43,6 +43,12 @@ pub const CheckpointRecord = struct { } }; +pub const TrainingJobSummary = struct { + active: usize = 0, + paused: usize = 0, + failed: usize = 0, +}; + const training_job_columns = "id, name, app_name, state, image, gpus, checkpoint_path, checkpoint_interval, checkpoint_keep, restart_count, created_at, 
updated_at"; @@ -196,6 +202,57 @@ pub fn getTrainingJobInDb(db: *sqlite.Db, alloc: Allocator, id: []const u8) Stor return trainingJobRowToRecord(row); } +pub fn listTrainingJobsByApp(alloc: Allocator, app_name: []const u8) StoreError!std.ArrayList(TrainingJobRecord) { + const db = try common.getDb(); + return listTrainingJobsByAppInDb(db, alloc, app_name); +} + +pub fn listTrainingJobsByAppInDb( + db: *sqlite.Db, + alloc: Allocator, + app_name: []const u8, +) StoreError!std.ArrayList(TrainingJobRecord) { + var records: std.ArrayList(TrainingJobRecord) = .empty; + var stmt = db.prepare( + "SELECT " ++ training_job_columns ++ " FROM training_jobs WHERE app_name = ? ORDER BY updated_at DESC, created_at DESC;", + ) catch return StoreError.ReadFailed; + defer stmt.deinit(); + var iter = stmt.iterator(TrainingJobRow, .{app_name}) catch return StoreError.ReadFailed; + while (iter.nextAlloc(alloc, .{}) catch return StoreError.ReadFailed) |row| { + records.append(alloc, trainingJobRowToRecord(row)) catch return StoreError.ReadFailed; + } + return records; +} + +pub fn summarizeTrainingJobsByApp(alloc: Allocator, app_name: []const u8) StoreError!TrainingJobSummary { + const db = try common.getDb(); + return summarizeTrainingJobsByAppInDb(db, alloc, app_name); +} + +pub fn summarizeTrainingJobsByAppInDb( + db: *sqlite.Db, + alloc: Allocator, + app_name: []const u8, +) StoreError!TrainingJobSummary { + var records = try listTrainingJobsByAppInDb(db, alloc, app_name); + defer { + for (records.items) |record| record.deinit(alloc); + records.deinit(alloc); + } + + var summary: TrainingJobSummary = .{}; + for (records.items) |record| { + if (std.mem.eql(u8, record.state, "running") or std.mem.eql(u8, record.state, "scheduling")) { + summary.active += 1; + } else if (std.mem.eql(u8, record.state, "paused")) { + summary.paused += 1; + } else if (std.mem.eql(u8, record.state, "failed")) { + summary.failed += 1; + } + } + return summary; +} + pub fn saveCheckpoint(job_id: []const u8, 
step: i64, path: []const u8, size_bytes: i64, now: i64) StoreError!void { const db = try common.getDb(); db.exec( @@ -239,3 +296,72 @@ pub fn deleteCheckpoint(id: i64) StoreError!void { .{id}, ) catch return StoreError.WriteFailed; } + +test "summarizeTrainingJobsByAppInDb groups active paused and failed states" { + const alloc = std.testing.allocator; + var db = try sqlite.Db.init(.{ .mode = .Memory, .open_flags = .{ .write = true } }); + defer db.deinit(); + try @import("../schema.zig").init(&db); + + try saveTrainingJobInDb(&db, .{ + .id = "job-1", + .name = "a", + .app_name = "demo-app", + .state = "running", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 100, + .updated_at = 100, + }); + try saveTrainingJobInDb(&db, .{ + .id = "job-2", + .name = "b", + .app_name = "demo-app", + .state = "paused", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 110, + .updated_at = 110, + }); + try saveTrainingJobInDb(&db, .{ + .id = "job-3", + .name = "c", + .app_name = "demo-app", + .state = "failed", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 120, + .updated_at = 120, + }); + try saveTrainingJobInDb(&db, .{ + .id = "job-4", + .name = "d", + .app_name = "demo-app", + .state = "scheduling", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 130, + .updated_at = 130, + }); + + const summary = try summarizeTrainingJobsByAppInDb(&db, alloc, "demo-app"); + try std.testing.expectEqual(@as(usize, 2), summary.active); + try std.testing.expectEqual(@as(usize, 1), summary.paused); + try std.testing.expectEqual(@as(usize, 1), 
summary.failed); +} diff --git a/src/test_root.zig b/src/test_root.zig index 982fc233..3388a0d5 100644 --- a/src/test_root.zig +++ b/src/test_root.zig @@ -21,6 +21,7 @@ comptime { _ = @import("state/store.zig"); _ = @import("state/store/crons.zig"); _ = @import("state/store/deployments.zig"); + _ = @import("state/store/training.zig"); _ = @import("state/schema.zig"); _ = @import("state/commands.zig"); _ = @import("state/volumes.zig"); From 2ad1ca46ca4770c0f826254dc71f6de56d6f2273 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 03:57:17 +0000 Subject: [PATCH 08/14] Document app workload runtime summaries --- README.md | 1 + docs/cluster-guide.md | 2 +- docs/users-guide.md | 4 ++ src/api/routes/cluster_agents/app_routes.zig | 45 ++++++++++++++++++++ 4 files changed, 51 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ae597f22..7077163d 100644 --- a/README.md +++ b/README.md @@ -292,6 +292,7 @@ Notes: - crons defined in the manifest start automatically with `yoq up`. - deployment, metrics, and certificate commands also support `--server host:port`. - clustered manifest deploys now go through the app-first `/apps/apply` API and carry services, workers, crons, and training definitions in one app snapshot. the older `/deploy` route remains as a compatibility shim for legacy callers. +- remote app applies now register active cron schedules in cluster state, and `yoq apps` / `yoq status --app` include live training runtime summaries for the current app. ## current status diff --git a/docs/cluster-guide.md b/docs/cluster-guide.md index 3249cf4e..b23ff0c9 100644 --- a/docs/cluster-guide.md +++ b/docs/cluster-guide.md @@ -147,7 +147,7 @@ yoq history --app [name] --server 10.0.0.1:7700 yoq rollback --app [name] --server 10.0.0.1:7700 --release ``` -`yoq apps` shows the latest release summary for every app, `status --app` shows the latest release metadata for one app, `history --app` lists prior releases, and remote `rollback --app ... 
--release` re-applies a stored app snapshot. `yoq run-worker --server ...` and `yoq train ... --server ...` now resolve workers and training jobs from the current app release on the server. +`yoq apps` shows the latest release summary for every app, `status --app` shows the latest release metadata for one app, `history --app` lists prior releases, and remote `rollback --app ... --release` re-applies a stored app snapshot. `yoq run-worker --server ...` and `yoq train ... --server ...` now resolve workers and training jobs from the current app release on the server. Clustered app applies also register cron schedules from the current app snapshot, and the app summary/status views include live training runtime counts for the app. --- diff --git a/docs/users-guide.md b/docs/users-guide.md index 9b154e27..798801e2 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -173,6 +173,8 @@ this gives the operator one app-first day-2 model: - `yoq run-worker [--server host:port] ` — run a worker from the current app release - `yoq train start|status|stop|pause|resume|scale|logs [--server host:port] ` — manage training jobs from the current app release +`yoq apps` and `yoq status --app` now show both the desired workload mix from the latest app release and the current training runtime summary for that app. On clustered applies, cron definitions from the app snapshot are also registered in cluster state, so rollback restores the active cron schedule set along with the rest of the app snapshot. + ### dev mode `yoq up --dev` bind-mounts source directories and watches for file changes via inotify. changed files trigger a container restart with 500ms debounce. logs are multiplexed with colored service name prefixes. 
@@ -231,6 +233,8 @@ the cluster API also exposes app-scoped day-2 reads and rollback: - `POST /apps//training//start|stop|pause|resume|scale` — manage training jobs for the current app release - `GET /apps//training//status|logs` — inspect training jobs for the current app release +The app status surfaces (`GET /apps`, `GET /apps//status`, `yoq apps`, and `yoq status --app`) also report live training runtime counts for the app: active, paused, and failed jobs. + ### rolling upgrades to upgrade a cluster without downtime: diff --git a/src/api/routes/cluster_agents/app_routes.zig b/src/api/routes/cluster_agents/app_routes.zig index 9cbece05..9bcac14d 100644 --- a/src/api/routes/cluster_agents/app_routes.zig +++ b/src/api/routes/cluster_agents/app_routes.zig @@ -461,6 +461,48 @@ test "formatAppsResponse emits one latest summary per app" { .message = "scheduler error during apply", .created_at = 200, }); + try store.saveTrainingJobInDb(&db, .{ + .id = "job-1", + .name = "finetune-a", + .app_name = "app-a", + .state = "running", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 210, + .updated_at = 210, + }); + try store.saveTrainingJobInDb(&db, .{ + .id = "job-2", + .name = "finetune-b", + .app_name = "app-a", + .state = "paused", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 220, + .updated_at = 220, + }); + try store.saveTrainingJobInDb(&db, .{ + .id = "job-3", + .name = "finetune-c", + .app_name = "app-a", + .state = "failed", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 0, + .created_at = 230, + .updated_at = 230, + }); var latest = try store.listLatestDeploymentsByAppInDb(&db, alloc); defer { @@ -477,6 +519,9 @@ test "formatAppsResponse emits one 
latest summary per app" { try std.testing.expect(std.mem.indexOf(u8, json, "\"worker_count\":1") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"cron_count\":1") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"training_job_count\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"active_training_jobs\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"paused_training_jobs\":1") != null); + try std.testing.expect(std.mem.indexOf(u8, json, "\"failed_training_jobs\":1") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"app_name\":\"app-b\"") != null); try std.testing.expect(std.mem.indexOf(u8, json, "\"release_id\":\"dep-2\"") != null); } From 927492a314689d6edcac66ed563e7d4c57eecdf9 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 04:13:41 +0000 Subject: [PATCH 09/14] Track cluster training assignments for control --- .../routes/cluster_agents/workload_routes.zig | 118 ++++++++++++++++++ src/cluster/registry.zig | 11 ++ src/cluster/registry/sql_mutations.zig | 19 +++ src/cluster/scheduler/common.zig | 3 + src/cluster/scheduler/sql_support.zig | 33 ++++- src/state/schema/migrations.zig | 7 ++ src/state/schema/tables.zig | 3 + 7 files changed, 188 insertions(+), 6 deletions(-) diff --git a/src/api/routes/cluster_agents/workload_routes.zig b/src/api/routes/cluster_agents/workload_routes.zig index 37463858..4294e436 100644 --- a/src/api/routes/cluster_agents/workload_routes.zig +++ b/src/api/routes/cluster_agents/workload_routes.zig @@ -1,4 +1,5 @@ const std = @import("std"); +const sqlite = @import("sqlite"); const scheduler = @import("../../../cluster/scheduler.zig"); const cluster_node = @import("../../../cluster/node.zig"); const agent_registry = @import("../../../cluster/registry.zig"); @@ -190,6 +191,10 @@ fn handleTrainingStateChange( if (rec == null) return common.notFound(); defer rec.?.deinit(alloc); + clearTrainingAssignments(node, app_name, job_name) catch |err| 
return switch (err) { + error.NotLeader => common.notLeader(alloc, node), + else => common.internalError(), + }; store.updateTrainingJobStateInDb(node.stateMachineDb(), rec.?.id, new_state, std.time.timestamp()) catch return common.internalError(); const updated = store.getTrainingJobInDb(node.stateMachineDb(), alloc, rec.?.id) catch return common.internalError(); defer updated.deinit(alloc); @@ -265,6 +270,11 @@ fn scheduleTrainingJob( defer if (existing) |rec| rec.deinit(alloc); const restarts = if (existing) |rec| rec.restart_count else 0; + clearTrainingAssignments(node, app_name, job_name) catch |err| return switch (err) { + error.NotLeader => common.notLeader(alloc, node), + else => common.internalError(), + }; + store.saveTrainingJobInDb(node.stateMachineDb(), .{ .id = job_id, .name = job_name, @@ -285,6 +295,9 @@ fn scheduleTrainingJob( .command = job.?.command, .cpu_limit = job.?.cpu_limit, .memory_limit_mb = job.?.memory_limit_mb, + .app_name = app_name, + .workload_kind = "training", + .workload_name = job_name, .gpu_limit = if (gpus_override) |gpus| gpus else job.?.gpus, .gpu_model = job.?.gpu_type, .gang_world_size = if (gpus_override) |gpus| gpus else job.?.gpus, @@ -308,6 +321,12 @@ fn scheduleTrainingJob( ); } +fn clearTrainingAssignments(node: *cluster_node.Node, app_name: []const u8, job_name: []const u8) !void { + var sql_buf: [512]u8 = undefined; + const sql = try agent_registry.deleteAssignmentsForWorkloadSql(&sql_buf, app_name, "training", job_name); + _ = try node.propose(sql); +} + fn runPlacementRequests( alloc: std.mem.Allocator, node: *cluster_node.Node, @@ -416,6 +435,10 @@ const RouteFlowHarness = struct { return .{ .cluster = &self.node, .join_token = null }; } + fn applyCommitted(self: *RouteFlowHarness) void { + self.node.state_machine.applyUpTo(&self.node.log, self.alloc, self.node.log.lastIndex()); + } + fn seedActiveAgent(self: *RouteFlowHarness) !void { self.node.stateMachineDb().exec( "INSERT INTO agents (id, address, status, 
cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, labels, gpu_count, gpu_used, gpu_model, gpu_vram_mb) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", @@ -455,6 +478,17 @@ fn freeResponse(alloc: std.mem.Allocator, response: Response) void { if (response.allocated) alloc.free(response.body); } +fn countTrainingAssignments(db: *sqlite.Db, app_name: []const u8, job_name: []const u8) usize { + const Row = struct { count: i64 }; + const row = (db.one( + Row, + "SELECT COUNT(*) AS count FROM assignments WHERE app_name = ? AND workload_kind = 'training' AND workload_name = ?;", + .{}, + .{ app_name, job_name }, + ) catch unreachable) orelse unreachable; + return @intCast(row.count); +} + test "route rejects worker run without cluster" { const ctx: RouteContext = .{ .cluster = null, .join_token = null }; const req = makeRequest(.POST, "/apps/demo-app/workers/migrate/run", "", ""); @@ -527,3 +561,87 @@ test "training start and status routes persist job state from app snapshot" { try std.testing.expect(std.mem.indexOf(u8, status_resp.body, "\"state\":\"running\"") != null); try std.testing.expect(std.mem.indexOf(u8, status_resp.body, "\"training_job\":\"finetune\"") != null); } + +test "training start tags assignments with workload metadata" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[],\"crons\":[],\"training_jobs\":[{\"name\":\"finetune\",\"image\":\"pytorch:latest\",\"command\":[\"python\",\"train.py\"],\"gpus\":2,\"cpu_limit\":2000,\"memory_limit_mb\":4096}]}", + ); + + const start_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/start", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, start_resp); + try std.testing.expectEqual(http.StatusCode.ok, start_resp.status); + harness.applyCommitted(); + 
try std.testing.expectEqual(@as(usize, 2), countTrainingAssignments(harness.node.stateMachineDb(), "demo-app", "finetune")); +} + +test "training pause route clears scheduled assignments" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[],\"crons\":[],\"training_jobs\":[{\"name\":\"finetune\",\"image\":\"pytorch:latest\",\"command\":[\"python\",\"train.py\"],\"gpus\":2,\"cpu_limit\":2000,\"memory_limit_mb\":4096}]}", + ); + + const start_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/start", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, start_resp); + try std.testing.expectEqual(http.StatusCode.ok, start_resp.status); + + const pause_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/pause", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, pause_resp); + harness.applyCommitted(); + + try std.testing.expectEqual(http.StatusCode.ok, pause_resp.status); + try std.testing.expect(std.mem.indexOf(u8, pause_resp.body, "\"state\":\"paused\"") != null); + try std.testing.expectEqual(@as(usize, 0), countTrainingAssignments(harness.node.stateMachineDb(), "demo-app", "finetune")); +} + +test "training scale route replaces prior scheduled assignments" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[],\"crons\":[],\"training_jobs\":[{\"name\":\"finetune\",\"image\":\"pytorch:latest\",\"command\":[\"python\",\"train.py\"],\"gpus\":1,\"cpu_limit\":2000,\"memory_limit_mb\":4096}]}", + ); + + const start_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/start", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, 
start_resp); + try std.testing.expectEqual(http.StatusCode.ok, start_resp.status); + + const scale_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/scale", "{\"gpus\":2}", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, scale_resp); + harness.applyCommitted(); + + try std.testing.expectEqual(http.StatusCode.ok, scale_resp.status); + try std.testing.expect(std.mem.indexOf(u8, scale_resp.body, "\"state\":\"running\"") != null); + try std.testing.expect(std.mem.indexOf(u8, scale_resp.body, "\"gpus\":2") != null); + try std.testing.expectEqual(@as(usize, 2), countTrainingAssignments(harness.node.stateMachineDb(), "demo-app", "finetune")); +} diff --git a/src/cluster/registry.zig b/src/cluster/registry.zig index a8a7545c..078f6a1d 100644 --- a/src/cluster/registry.zig +++ b/src/cluster/registry.zig @@ -30,6 +30,7 @@ pub const removeSql = sql_mutations.removeSql; pub const orphanAssignmentsSql = sql_mutations.orphanAssignmentsSql; pub const reassignSql = sql_mutations.reassignSql; pub const deleteAgentAssignmentsSql = sql_mutations.deleteAgentAssignmentsSql; +pub const deleteAssignmentsForWorkloadSql = sql_mutations.deleteAssignmentsForWorkloadSql; pub const wireguardPeerSql = sql_mutations.wireguardPeerSql; pub const removeWireguardPeerSql = sql_mutations.removeWireguardPeerSql; @@ -433,6 +434,16 @@ test "deleteAgentAssignmentsSql generates valid SQL" { try std.testing.expect(std.mem.indexOf(u8, sql, "agent1234567") != null); } +test "deleteAssignmentsForWorkloadSql generates valid SQL" { + var buf: [512]u8 = undefined; + const sql = try deleteAssignmentsForWorkloadSql(&buf, "demo-app", "training", "finetune"); + + try std.testing.expect(std.mem.indexOf(u8, sql, "DELETE FROM assignments") != null); + try std.testing.expect(std.mem.indexOf(u8, sql, "demo-app") != null); + try std.testing.expect(std.mem.indexOf(u8, sql, "training") != null); + try std.testing.expect(std.mem.indexOf(u8, sql, "finetune") != null); +} + test 
"assignNodeId returns 1 for empty table" { var db = sqlite.Db.init(.{ .mode = .Memory, diff --git a/src/cluster/registry/sql_mutations.zig b/src/cluster/registry/sql_mutations.zig index f69ea52b..2f0e66ca 100644 --- a/src/cluster/registry/sql_mutations.zig +++ b/src/cluster/registry/sql_mutations.zig @@ -200,6 +200,25 @@ pub fn deleteAgentAssignmentsSql(buf: []u8, agent_id: []const u8) ![]const u8 { return std.fmt.bufPrint(buf, "DELETE FROM assignments WHERE agent_id = '{s}';", .{id_esc}); } +pub fn deleteAssignmentsForWorkloadSql( + buf: []u8, + app_name: []const u8, + workload_kind: []const u8, + workload_name: []const u8, +) ![]const u8 { + var app_esc_buf: [256]u8 = undefined; + const app_esc = try sql_escape.escapeSqlString(&app_esc_buf, app_name); + var kind_esc_buf: [64]u8 = undefined; + const kind_esc = try sql_escape.escapeSqlString(&kind_esc_buf, workload_kind); + var name_esc_buf: [256]u8 = undefined; + const name_esc = try sql_escape.escapeSqlString(&name_esc_buf, workload_name); + return std.fmt.bufPrint( + buf, + "DELETE FROM assignments WHERE app_name = '{s}' AND workload_kind = '{s}' AND workload_name = '{s}';", + .{ app_esc, kind_esc, name_esc }, + ); +} + pub fn wireguardPeerSql( buf: []u8, node_id: u16, diff --git a/src/cluster/scheduler/common.zig b/src/cluster/scheduler/common.zig index e9c2c84c..ee7a6e0c 100644 --- a/src/cluster/scheduler/common.zig +++ b/src/cluster/scheduler/common.zig @@ -8,6 +8,9 @@ pub const PlacementRequest = struct { command: []const u8, cpu_limit: i64, memory_limit_mb: i64, + app_name: ?[]const u8 = null, + workload_kind: ?[]const u8 = null, + workload_name: ?[]const u8 = null, gpu_limit: i64 = 0, gpu_model: ?[]const u8 = null, gpu_vram_min_mb: ?u64 = null, diff --git a/src/cluster/scheduler/sql_support.zig b/src/cluster/scheduler/sql_support.zig index 4042ae43..c6996b3f 100644 --- a/src/cluster/scheduler/sql_support.zig +++ b/src/cluster/scheduler/sql_support.zig @@ -27,20 +27,41 @@ pub fn assignmentSqlGang( const 
img_esc = try sql_escape.escapeSqlString(&img_esc_buf, request.image); var cmd_esc_buf: [512]u8 = undefined; const cmd_esc = try sql_escape.escapeSqlString(&cmd_esc_buf, request.command); + var app_esc_buf: [256]u8 = undefined; + var kind_esc_buf: [64]u8 = undefined; + var name_esc_buf: [256]u8 = undefined; + var metadata_vals_buf: [768]u8 = undefined; + + const metadata_cols = if (request.app_name != null and request.workload_kind != null and request.workload_name != null) + ", app_name, workload_kind, workload_name" + else + ""; + const metadata_vals = if (request.app_name != null and request.workload_kind != null and request.workload_name != null) + try std.fmt.bufPrint( + &metadata_vals_buf, + ", '{s}', '{s}', '{s}'", + .{ + try sql_escape.escapeSqlString(&app_esc_buf, request.app_name.?), + try sql_escape.escapeSqlString(&kind_esc_buf, request.workload_kind.?), + try sql_escape.escapeSqlString(&name_esc_buf, request.workload_name.?), + }, + ) + else + ""; if (gang) |placement| { var master_esc_buf: [256]u8 = undefined; const master_esc = try sql_escape.escapeSqlString(&master_esc_buf, placement.master_addr); return std.fmt.bufPrint(buf, - \\INSERT INTO assignments (id, agent_id, image, command, status, cpu_limit, memory_limit_mb, gang_rank, gang_world_size, gang_master_addr, gang_master_port, created_at) - \\ VALUES ('{s}', '{s}', '{s}', '{s}', 'pending', {d}, {d}, {d}, {d}, '{s}', {d}, {d}); - , .{ id, agent_id, img_esc, cmd_esc, request.cpu_limit, request.memory_limit_mb, placement.rank, placement.world_size, master_esc, placement.master_port, now }); + \\INSERT INTO assignments (id, agent_id, image, command, status, cpu_limit, memory_limit_mb, gang_rank, gang_world_size, gang_master_addr, gang_master_port, created_at{s}) + \\ VALUES ('{s}', '{s}', '{s}', '{s}', 'pending', {d}, {d}, {d}, {d}, '{s}', {d}, {d}{s}); + , .{ metadata_cols, id, agent_id, img_esc, cmd_esc, request.cpu_limit, request.memory_limit_mb, placement.rank, placement.world_size, master_esc, 
placement.master_port, now, metadata_vals }); } return std.fmt.bufPrint(buf, - \\INSERT INTO assignments (id, agent_id, image, command, status, cpu_limit, memory_limit_mb, created_at) - \\ VALUES ('{s}', '{s}', '{s}', '{s}', 'pending', {d}, {d}, {d}); - , .{ id, agent_id, img_esc, cmd_esc, request.cpu_limit, request.memory_limit_mb, now }); + \\INSERT INTO assignments (id, agent_id, image, command, status, cpu_limit, memory_limit_mb, created_at{s}) + \\ VALUES ('{s}', '{s}', '{s}', '{s}', 'pending', {d}, {d}, {d}{s}); + , .{ metadata_cols, id, agent_id, img_esc, cmd_esc, request.cpu_limit, request.memory_limit_mb, now, metadata_vals }); } pub fn generateAssignmentId(buf: *[12]u8) void { diff --git a/src/state/schema/migrations.zig b/src/state/schema/migrations.zig index dccad941..76a10edd 100644 --- a/src/state/schema/migrations.zig +++ b/src/state/schema/migrations.zig @@ -6,6 +6,7 @@ pub const SchemaError = error{InitFailed}; pub fn apply(db: *sqlite.Db) SchemaError!void { migrateContainers(db); migrateAgents(db); + migrateAssignments(db); migrateServices(db); migrateDeployments(db); migrateCronSchedules(db); @@ -150,6 +151,12 @@ fn migrateCronSchedules(db: *sqlite.Db) void { ) catch {}; } +fn migrateAssignments(db: *sqlite.Db) void { + addColumnIfMissing(db, "ALTER TABLE assignments ADD COLUMN app_name TEXT;") catch {}; + addColumnIfMissing(db, "ALTER TABLE assignments ADD COLUMN workload_kind TEXT;") catch {}; + addColumnIfMissing(db, "ALTER TABLE assignments ADD COLUMN workload_name TEXT;") catch {}; +} + fn addColumnIfMissing(db: *sqlite.Db, sql: []const u8) SchemaError!void { db.execDynamic(sql, .{}, .{}) catch { const err_msg = std.mem.span(sqlite.c.sqlite3_errmsg(db.db)); diff --git a/src/state/schema/tables.zig b/src/state/schema/tables.zig index 08f4b26f..50938c74 100644 --- a/src/state/schema/tables.zig +++ b/src/state/schema/tables.zig @@ -199,6 +199,9 @@ pub fn initClusterTables(db: *sqlite.Db) SchemaError!void { \\ status TEXT NOT NULL DEFAULT 
'pending', \\ cpu_limit INTEGER NOT NULL DEFAULT 1000, \\ memory_limit_mb INTEGER NOT NULL DEFAULT 256, + \\ app_name TEXT, + \\ workload_kind TEXT, + \\ workload_name TEXT, \\ gang_rank INTEGER, \\ gang_world_size INTEGER, \\ gang_master_addr TEXT, From 4df8cf8dfba6667843e72f427d812c3d688d7600 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 04:15:03 +0000 Subject: [PATCH 10/14] Deduplicate app training summaries by job --- src/state/store/training.zig | 48 ++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/src/state/store/training.zig b/src/state/store/training.zig index 247e30c2..bde50f8c 100644 --- a/src/state/store/training.zig +++ b/src/state/store/training.zig @@ -240,8 +240,15 @@ pub fn summarizeTrainingJobsByAppInDb( records.deinit(alloc); } + var seen: std.StringHashMapUnmanaged(void) = .empty; + defer seen.deinit(alloc); + var summary: TrainingJobSummary = .{}; for (records.items) |record| { + const gop = seen.getOrPut(alloc, record.name) catch return StoreError.ReadFailed; + if (gop.found_existing) continue; + gop.value_ptr.* = {}; + if (std.mem.eql(u8, record.state, "running") or std.mem.eql(u8, record.state, "scheduling")) { summary.active += 1; } else if (std.mem.eql(u8, record.state, "paused")) { @@ -365,3 +372,44 @@ test "summarizeTrainingJobsByAppInDb groups active paused and failed states" { try std.testing.expectEqual(@as(usize, 1), summary.paused); try std.testing.expectEqual(@as(usize, 1), summary.failed); } + +test "summarizeTrainingJobsByAppInDb keeps only the latest row per job name" { + const alloc = std.testing.allocator; + var db = try sqlite.Db.init(.{ .mode = .Memory, .open_flags = .{ .write = true } }); + defer db.deinit(); + try @import("../schema.zig").init(&db); + + try saveTrainingJobInDb(&db, .{ + .id = "job-old", + .name = "finetune", + .app_name = "demo-app", + .state = "failed", + .image = "trainer:v1", + .gpus = 1, + .checkpoint_path = null, + .checkpoint_interval = null, + 
.checkpoint_keep = null, + .restart_count = 0, + .created_at = 100, + .updated_at = 100, + }); + try saveTrainingJobInDb(&db, .{ + .id = "job-new", + .name = "finetune", + .app_name = "demo-app", + .state = "running", + .image = "trainer:v2", + .gpus = 2, + .checkpoint_path = null, + .checkpoint_interval = null, + .checkpoint_keep = null, + .restart_count = 1, + .created_at = 200, + .updated_at = 200, + }); + + const summary = try summarizeTrainingJobsByAppInDb(&db, alloc, "demo-app"); + try std.testing.expectEqual(@as(usize, 1), summary.active); + try std.testing.expectEqual(@as(usize, 0), summary.paused); + try std.testing.expectEqual(@as(usize, 0), summary.failed); +} From aae377aa8671f462ba04f14afbcc3b962331e9a1 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 10:46:38 +0000 Subject: [PATCH 11/14] Clarify remote training log behavior --- .../routes/cluster_agents/workload_routes.zig | 42 ++++++++- src/api/routes/cluster_agents/writers.zig | 15 ++++ src/cluster/agent/assignment_runtime.zig | 86 +++++++++++++++++-- src/cluster/agent_types.zig | 6 ++ src/cluster/registry.zig | 1 + src/cluster/registry/queries.zig | 21 ++++- 6 files changed, 162 insertions(+), 9 deletions(-) diff --git a/src/api/routes/cluster_agents/workload_routes.zig b/src/api/routes/cluster_agents/workload_routes.zig index 4294e436..7414bc87 100644 --- a/src/api/routes/cluster_agents/workload_routes.zig +++ b/src/api/routes/cluster_agents/workload_routes.zig @@ -232,7 +232,17 @@ fn handleTrainingLogs( var hostname_buf: [128]u8 = undefined; const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ job_name, rank }) catch return common.internalError(); const record = store.findAppContainer(alloc, app_name, hostname) catch return common.internalError(); - if (record == null) return common.notFound(); + if (record == null) { + const scheduled = agent_registry.countAssignmentsForWorkload(ctx.cluster.?.stateMachineDb(), app_name, "training", job_name) catch return 
common.internalError(); + if (scheduled > 0) { + return .{ + .status = .bad_request, + .body = "{\"error\":\"training logs are only available on the hosting agent\"}", + .allocated = false, + }; + } + return common.notFound(); + } defer record.?.deinit(alloc); const logs = @import("../../../runtime/logs.zig"); @@ -645,3 +655,33 @@ test "training scale route replaces prior scheduled assignments" { try std.testing.expect(std.mem.indexOf(u8, scale_resp.body, "\"gpus\":2") != null); try std.testing.expectEqual(@as(usize, 2), countTrainingAssignments(harness.node.stateMachineDb(), "demo-app", "finetune")); } + +test "training logs route reports remote-hosted ranks explicitly" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[],\"crons\":[],\"training_jobs\":[{\"name\":\"finetune\",\"image\":\"pytorch:latest\",\"command\":[\"python\",\"train.py\"],\"gpus\":1,\"cpu_limit\":2000,\"memory_limit_mb\":4096}]}", + ); + + const start_resp = route( + makeRequest(.POST, "/apps/demo-app/training/finetune/start", "", ""), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, start_resp); + try std.testing.expectEqual(http.StatusCode.ok, start_resp.status); + harness.applyCommitted(); + + const logs_resp = route( + makeRequest(.GET, "/apps/demo-app/training/finetune/logs", "", "rank=0"), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, logs_resp); + + try std.testing.expectEqual(http.StatusCode.bad_request, logs_resp.status); + try std.testing.expect(std.mem.indexOf(u8, logs_resp.body, "hosting agent") != null); +} diff --git a/src/api/routes/cluster_agents/writers.zig b/src/api/routes/cluster_agents/writers.zig index 373bb2d6..4fe83fa5 100644 --- a/src/api/routes/cluster_agents/writers.zig +++ b/src/api/routes/cluster_agents/writers.zig @@ -90,6 +90,21 @@ pub fn writeAssignmentJson(writer: 
anytype, assignment: agent_registry.Assignmen try std.fmt.format(writer, "{d}", .{assignment.cpu_limit}); try writer.writeAll(",\"memory_limit_mb\":"); try std.fmt.format(writer, "{d}", .{assignment.memory_limit_mb}); + if (assignment.app_name) |app_name| { + try writer.writeAll(",\"app_name\":\""); + try json_helpers.writeJsonEscaped(writer, app_name); + try writer.writeByte('"'); + } + if (assignment.workload_kind) |workload_kind| { + try writer.writeAll(",\"workload_kind\":\""); + try json_helpers.writeJsonEscaped(writer, workload_kind); + try writer.writeByte('"'); + } + if (assignment.workload_name) |workload_name| { + try writer.writeAll(",\"workload_name\":\""); + try json_helpers.writeJsonEscaped(writer, workload_name); + try writer.writeByte('"'); + } if (assignment.gang_rank) |rank| { try writer.writeAll(",\"gang_rank\":"); try std.fmt.format(writer, "{d}", .{rank}); diff --git a/src/cluster/agent/assignment_runtime.zig b/src/cluster/agent/assignment_runtime.zig index a5fd1712..a4bbee24 100644 --- a/src/cluster/agent/assignment_runtime.zig +++ b/src/cluster/agent/assignment_runtime.zig @@ -20,6 +20,12 @@ pub const GangInfo = struct { master_port: u16, }; +const AssignmentMeta = struct { + app_name: ?[]const u8 = null, + workload_kind: ?[]const u8 = null, + workload_name: ?[]const u8 = null, +}; + pub fn reconcile(self: anytype) void { var resp = fetchAssignments(self) orelse { reconcileFromCache(self); @@ -36,6 +42,9 @@ pub fn reconcile(self: anytype) void { const command = extractJsonString(obj, "command") orelse ""; const cpu_limit = extractJsonInt(obj, "cpu_limit") orelse 1000; const memory_limit_mb = extractJsonInt(obj, "memory_limit_mb") orelse 256; + const app_name = extractJsonString(obj, "app_name"); + const workload_kind = extractJsonString(obj, "workload_kind"); + const workload_name = extractJsonString(obj, "workload_name"); const gang_rank = extractJsonInt(obj, "gang_rank"); const gang_world_size = extractJsonInt(obj, "gang_world_size"); const 
gang_master_addr = extractJsonString(obj, "gang_master_addr"); @@ -63,7 +72,11 @@ pub fn reconcile(self: anytype) void { .master_addr = gang_master_addr.?, .master_port = if (gang_master_port) |port| @intCast(@max(0, port)) else 29500, } else null; - startPendingAssignment(self, assignment_id, image, command, gang_info); + startPendingAssignment(self, assignment_id, image, command, gang_info, .{ + .app_name = app_name, + .workload_kind = workload_kind, + .workload_name = workload_name, + }); } } } @@ -78,11 +91,11 @@ fn reconcileFromCache(self: anytype) void { if (cached.len == 0) return; log.warn("server unreachable, reconciling from cache ({d} assignments)", .{cached.len}); for (cached) |assignment| { - startPendingAssignment(self, assignment.id, assignment.image, assignment.command, null); + startPendingAssignment(self, assignment.id, assignment.image, assignment.command, null, .{}); } } -fn startPendingAssignment(self: anytype, id: []const u8, image: []const u8, command: []const u8, gang_info: ?GangInfo) void { +fn startPendingAssignment(self: anytype, id: []const u8, image: []const u8, command: []const u8, gang_info: ?GangInfo, meta: AssignmentMeta) void { self.container_lock.lock(); const already_tracked = self.local_containers.contains(id); self.container_lock.unlock(); @@ -98,11 +111,44 @@ fn startPendingAssignment(self: anytype, id: []const u8, image: []const u8, comm self.alloc.free(image_copy); return; }; + const app_name_copy = if (meta.app_name) |app_name| + self.alloc.dupe(u8, app_name) catch { + self.alloc.free(id_copy); + self.alloc.free(image_copy); + self.alloc.free(command_copy); + return; + } + else + null; + const workload_kind_copy = if (meta.workload_kind) |workload_kind| + self.alloc.dupe(u8, workload_kind) catch { + self.alloc.free(id_copy); + self.alloc.free(image_copy); + self.alloc.free(command_copy); + if (app_name_copy) |app_name| self.alloc.free(app_name); + return; + } + else + null; + const workload_name_copy = if 
(meta.workload_name) |workload_name| + self.alloc.dupe(u8, workload_name) catch { + self.alloc.free(id_copy); + self.alloc.free(image_copy); + self.alloc.free(command_copy); + if (app_name_copy) |app_name| self.alloc.free(app_name); + if (workload_kind_copy) |workload_kind| self.alloc.free(workload_kind); + return; + } + else + null; const gang_copy: ?GangInfo = if (gang_info) |gang| blk: { const addr_copy = self.alloc.dupe(u8, gang.master_addr) catch { self.alloc.free(id_copy); self.alloc.free(image_copy); self.alloc.free(command_copy); + if (app_name_copy) |app_name| self.alloc.free(app_name); + if (workload_kind_copy) |workload_kind| self.alloc.free(workload_kind); + if (workload_name_copy) |workload_name| self.alloc.free(workload_name); return; }; break :blk .{ @@ -119,6 +165,9 @@ fn startPendingAssignment(self: anytype, id: []const u8, image: []const u8, comm self.alloc.free(id_copy); self.alloc.free(image_copy); self.alloc.free(command_copy); + if (app_name_copy) |app_name| self.alloc.free(app_name); + if (workload_kind_copy) |workload_kind| self.alloc.free(workload_kind); + if (workload_name_copy) |workload_name| self.alloc.free(workload_name); if (gang_copy) |gang| self.alloc.free(gang.master_addr); return; }; @@ -130,7 +179,11 @@ fn startPendingAssignment(self: anytype, id: []const u8, image: []const u8, comm log.info("starting assignment {s} (image: {s})", .{ id_copy, image_copy }); } - _ = std.Thread.spawn(.{}, runAssignment, .{ self, id_copy, image_copy, command_copy, gang_copy }) catch { + _ = std.Thread.spawn(.{}, runAssignment, .{ self, id_copy, image_copy, command_copy, gang_copy, AssignmentMeta{ + .app_name = app_name_copy, + .workload_kind = workload_kind_copy, + .workload_name = workload_name_copy, + } }) catch { log.warn("failed to spawn thread for assignment {s}", .{id_copy}); self.container_lock.lock(); _ = self.local_containers.remove(id_copy); @@ -138,6 +191,9 @@ fn startPendingAssignment(self: anytype, id: []const u8, image: []const u8, 
comm self.alloc.free(id_copy); self.alloc.free(image_copy); self.alloc.free(command_copy); + if (app_name_copy) |app_name| self.alloc.free(app_name); + if (workload_kind_copy) |workload_kind| self.alloc.free(workload_kind); + if (workload_name_copy) |workload_name| self.alloc.free(workload_name); if (gang_copy) |gang| self.alloc.free(gang.master_addr); }; } @@ -148,10 +204,13 @@ fn fetchAssignments(self: anytype) ?http_client.Response { return http_client.getWithAuth(self.alloc, self.server_addr, self.server_port, path, self.token) catch return null; } -fn runAssignment(self: anytype, assignment_id: []const u8, image: []const u8, command: []const u8, gang_info: ?GangInfo) void { +fn runAssignment(self: anytype, assignment_id: []const u8, image: []const u8, command: []const u8, gang_info: ?GangInfo, meta: AssignmentMeta) void { defer { self.alloc.free(image); self.alloc.free(command); + if (meta.app_name) |app_name| self.alloc.free(app_name); + if (meta.workload_kind) |workload_kind| self.alloc.free(workload_kind); + if (meta.workload_name) |workload_name| self.alloc.free(workload_name); if (gang_info) |gang| self.alloc.free(gang.master_addr); } @@ -186,14 +245,18 @@ fn runAssignment(self: anytype, assignment_id: []const u8, image: []const u8, co }; const container_id = id_buf[0..]; + var hostname_buf: [128]u8 = undefined; + const hostname = buildAssignmentHostname(&hostname_buf, meta, gang_info); + store.save(.{ .id = container_id, .rootfs = rootfs, .command = if (command.len > 0) command else "/bin/sh", - .hostname = "agent", + .hostname = hostname, .status = "created", .pid = null, .exit_code = null, + .app_name = meta.app_name, .created_at = std.time.timestamp(), }) catch { log.warn("failed to save container record for assignment {s}", .{assignment_id}); @@ -273,6 +336,17 @@ fn runAssignment(self: anytype, assignment_id: []const u8, image: []const u8, co cleanup(container_id); } +fn buildAssignmentHostname(buf: []u8, meta: AssignmentMeta, gang_info: ?GangInfo) 
[]const u8 { + if (meta.workload_kind != null and meta.workload_name != null and std.mem.eql(u8, meta.workload_kind.?, "training")) { + if (gang_info) |gang| { + return std.fmt.bufPrint(buf, "{s}-rank-{d}", .{ meta.workload_name.?, gang.rank }) catch meta.workload_name.?; + } + return meta.workload_name.?; + } + if (meta.workload_name) |workload_name| return workload_name; + return "agent"; +} + fn reportStatus(self: anytype, assignment_id: []const u8, status: []const u8) void { var path_buf: [128]u8 = undefined; const path = std.fmt.bufPrint(&path_buf, "/agents/{s}/assignments/{s}/status", .{ self.id, assignment_id }) catch return; diff --git a/src/cluster/agent_types.zig b/src/cluster/agent_types.zig index deb09ba3..423a05bc 100644 --- a/src/cluster/agent_types.zig +++ b/src/cluster/agent_types.zig @@ -115,6 +115,9 @@ pub const Assignment = struct { status: []const u8, cpu_limit: i64, memory_limit_mb: i64, + app_name: ?[]const u8 = null, + workload_kind: ?[]const u8 = null, + workload_name: ?[]const u8 = null, gang_rank: ?i64 = null, gang_world_size: ?i64 = null, gang_master_addr: ?[]const u8 = null, @@ -126,6 +129,9 @@ pub const Assignment = struct { alloc.free(self.image); alloc.free(self.command); alloc.free(self.status); + if (self.app_name) |app_name| alloc.free(app_name); + if (self.workload_kind) |workload_kind| alloc.free(workload_kind); + if (self.workload_name) |workload_name| alloc.free(workload_name); if (self.gang_master_addr) |addr| alloc.free(addr); } }; diff --git a/src/cluster/registry.zig b/src/cluster/registry.zig index 078f6a1d..cfa77353 100644 --- a/src/cluster/registry.zig +++ b/src/cluster/registry.zig @@ -47,6 +47,7 @@ pub const listAgents = queries.listAgents; pub const getAgent = queries.getAgent; pub const getAssignments = queries.getAssignments; pub const getOrphanedAssignments = queries.getOrphanedAssignments; +pub const countAssignmentsForWorkload = queries.countAssignmentsForWorkload; // -- tests -- diff --git 
a/src/cluster/registry/queries.zig b/src/cluster/registry/queries.zig index ed571a7e..43083317 100644 --- a/src/cluster/registry/queries.zig +++ b/src/cluster/registry/queries.zig @@ -161,7 +161,7 @@ pub fn getAssignments(alloc: Allocator, db: *sqlite.Db, agent_id: []const u8) ![ return queryAssignmentRows( alloc, db, - "SELECT id, agent_id, image, command, status, cpu_limit, memory_limit_mb, gang_rank, gang_world_size, gang_master_addr, gang_master_port FROM assignments WHERE agent_id = ?;", + "SELECT id, agent_id, image, command, status, cpu_limit, memory_limit_mb, app_name, workload_kind, workload_name, gang_rank, gang_world_size, gang_master_addr, gang_master_port FROM assignments WHERE agent_id = ?;", .{agent_id}, ); } @@ -170,11 +170,22 @@ pub fn getOrphanedAssignments(alloc: Allocator, db: *sqlite.Db) ![]Assignment { return queryAssignmentRows( alloc, db, - "SELECT id, agent_id, image, command, status, cpu_limit, memory_limit_mb, gang_rank, gang_world_size, gang_master_addr, gang_master_port FROM assignments WHERE agent_id = '' AND status = 'pending';", + "SELECT id, agent_id, image, command, status, cpu_limit, memory_limit_mb, app_name, workload_kind, workload_name, gang_rank, gang_world_size, gang_master_addr, gang_master_port FROM assignments WHERE agent_id = '' AND status = 'pending';", .{}, ); } +pub fn countAssignmentsForWorkload(db: *sqlite.Db, app_name: []const u8, workload_kind: []const u8, workload_name: []const u8) !usize { + const Row = struct { count: i64 }; + const row = (db.one( + Row, + "SELECT COUNT(*) AS count FROM assignments WHERE app_name = ? AND workload_kind = ? 
AND workload_name = ?;", + .{}, + .{ app_name, workload_kind, workload_name }, + ) catch return error.QueryFailed) orelse return 0; + return @intCast(row.count); +} + const AssignmentRow = struct { id: sqlite.Text, agent_id: sqlite.Text, @@ -183,6 +194,9 @@ const AssignmentRow = struct { status: sqlite.Text, cpu_limit: i64, memory_limit_mb: i64, + app_name: ?sqlite.Text, + workload_kind: ?sqlite.Text, + workload_name: ?sqlite.Text, gang_rank: ?i64, gang_world_size: ?i64, gang_master_addr: ?sqlite.Text, @@ -209,6 +223,9 @@ fn queryAssignmentRows(alloc: Allocator, db: *sqlite.Db, comptime query: []const .status = row.status.data, .cpu_limit = row.cpu_limit, .memory_limit_mb = row.memory_limit_mb, + .app_name = if (row.app_name) |app_name| app_name.data else null, + .workload_kind = if (row.workload_kind) |workload_kind| workload_kind.data else null, + .workload_name = if (row.workload_name) |workload_name| workload_name.data else null, .gang_rank = row.gang_rank, .gang_world_size = row.gang_world_size, .gang_master_addr = if (row.gang_master_addr) |addr| addr.data else null, From 068f9f467e546b966ea2f6b03896e23e3fb55725 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 10:47:50 +0000 Subject: [PATCH 12/14] Document clustered training log limits --- README.md | 2 ++ docs/cluster-guide.md | 2 ++ docs/users-guide.md | 3 +++ 3 files changed, 7 insertions(+) diff --git a/README.md b/README.md index 7077163d..5ef22fee 100644 --- a/README.md +++ b/README.md @@ -270,6 +270,8 @@ yoq train scale [--server host:port] --gpus scale training ranks yoq train logs [--server host:port] [--rank N] show logs for a training rank ``` +For clustered training logs, the control plane returns an explicit hosting-agent error when the selected rank's logs are not directly readable on the control-plane host. 
+ ### diagnostics ```text diff --git a/docs/cluster-guide.md b/docs/cluster-guide.md index b23ff0c9..55a16825 100644 --- a/docs/cluster-guide.md +++ b/docs/cluster-guide.md @@ -357,6 +357,8 @@ the important read paths are: - `GET /apps//training//status` - `GET /apps//training//logs` +`GET /apps//training//logs` is currently direct-read only. If the hosting rank's logs are not locally readable from the control-plane host, the route returns an explicit hosting-agent error. + ### draining a node before taking a node offline for maintenance: diff --git a/docs/users-guide.md b/docs/users-guide.md index 798801e2..6904d3d6 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -175,6 +175,8 @@ this gives the operator one app-first day-2 model: `yoq apps` and `yoq status --app` now show both the desired workload mix from the latest app release and the current training runtime summary for that app. On clustered applies, cron definitions from the app snapshot are also registered in cluster state, so rollback restores the active cron schedule set along with the rest of the app snapshot. +Remote `yoq train logs --server ...` is still a direct-read path. If the selected rank's logs are not locally readable from the control-plane host, the API returns an explicit hosting-agent error instead of a misleading empty or missing result. + ### dev mode `yoq up --dev` bind-mounts source directories and watches for file changes via inotify. changed files trigger a container restart with 500ms debounce. logs are multiplexed with colored service name prefixes. @@ -234,6 +236,7 @@ the cluster API also exposes app-scoped day-2 reads and rollback: - `GET /apps//training//status|logs` — inspect training jobs for the current app release The app status surfaces (`GET /apps`, `GET /apps//status`, `yoq apps`, and `yoq status --app`) also report live training runtime counts for the app: active, paused, and failed jobs. 
+For `GET /apps//training//logs`, a scheduled remote job now returns an explicit hosting-agent error when the control plane cannot directly read the selected rank's logs. ### rolling upgrades From 5885c056aebbdc7567bede00419b2a188adc1f9e Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 12:57:52 +0000 Subject: [PATCH 13/14] Proxy clustered training logs through agents --- README.md | 2 +- docs/cluster-guide.md | 2 +- docs/users-guide.md | 4 +- .../routes/cluster_agents/agent_routes.zig | 16 +- .../routes/cluster_agents/workload_routes.zig | 134 +++++++++++- src/api/routes/cluster_agents/writers.zig | 7 +- src/cluster/agent.zig | 6 +- src/cluster/agent/lifecycle_support.zig | 41 ++++ src/cluster/agent/log_server.zig | 197 ++++++++++++++++++ src/cluster/agent/request_support.zig | 5 +- src/cluster/agent_types.zig | 1 + src/cluster/cli/membership_command.zig | 11 + src/cluster/registry.zig | 2 + src/cluster/registry/queries.zig | 51 ++++- src/cluster/registry/sql_mutations.zig | 26 +-- src/cluster/registry/test_support.zig | 1 + src/state/schema/migrations.zig | 1 + src/state/schema/tables.zig | 1 + src/test_root.zig | 1 + 19 files changed, 481 insertions(+), 28 deletions(-) create mode 100644 src/cluster/agent/log_server.zig diff --git a/README.md b/README.md index 5ef22fee..66c63f6f 100644 --- a/README.md +++ b/README.md @@ -270,7 +270,7 @@ yoq train scale [--server host:port] --gpus scale training ranks yoq train logs [--server host:port] [--rank N] show logs for a training rank ``` -For clustered training logs, the control plane returns an explicit hosting-agent error when the selected rank's logs are not directly readable on the control-plane host. +For clustered training logs, the control plane now proxies log reads to the agent that hosts the selected rank. If that agent is unreachable or does not expose the log endpoint, the API returns an explicit hosting-agent error instead of a misleading empty result. 
### diagnostics diff --git a/docs/cluster-guide.md b/docs/cluster-guide.md index 55a16825..cf16595d 100644 --- a/docs/cluster-guide.md +++ b/docs/cluster-guide.md @@ -357,7 +357,7 @@ the important read paths are: - `GET /apps//training//status` - `GET /apps//training//logs` -`GET /apps//training//logs` is currently direct-read only. If the hosting rank's logs are not locally readable from the control-plane host, the route returns an explicit hosting-agent error. +`GET /apps//training//logs` now proxies the request to the agent that hosts the selected rank. If that agent is unreachable or does not expose the log endpoint, the route returns an explicit hosting-agent error. ### draining a node diff --git a/docs/users-guide.md b/docs/users-guide.md index 6904d3d6..eee2b6af 100644 --- a/docs/users-guide.md +++ b/docs/users-guide.md @@ -175,7 +175,7 @@ this gives the operator one app-first day-2 model: `yoq apps` and `yoq status --app` now show both the desired workload mix from the latest app release and the current training runtime summary for that app. On clustered applies, cron definitions from the app snapshot are also registered in cluster state, so rollback restores the active cron schedule set along with the rest of the app snapshot. -Remote `yoq train logs --server ...` is still a direct-read path. If the selected rank's logs are not locally readable from the control-plane host, the API returns an explicit hosting-agent error instead of a misleading empty or missing result. +Remote `yoq train logs --server ...` now proxies the request to the agent that hosts the selected rank. If that agent is unreachable or does not expose the log endpoint, the API returns an explicit hosting-agent error instead of a misleading empty or missing result. 
### dev mode @@ -236,7 +236,7 @@ the cluster API also exposes app-scoped day-2 reads and rollback: - `GET /apps//training//status|logs` — inspect training jobs for the current app release The app status surfaces (`GET /apps`, `GET /apps//status`, `yoq apps`, and `yoq status --app`) also report live training runtime counts for the app: active, paused, and failed jobs. -For `GET /apps//training//logs`, a scheduled remote job now returns an explicit hosting-agent error when the control plane cannot directly read the selected rank's logs. +For `GET /apps//training//logs`, the control plane now proxies the request to the hosting agent for the selected rank. If that agent is unreachable or does not expose the log endpoint, the route returns an explicit hosting-agent error. ### rolling upgrades diff --git a/src/api/routes/cluster_agents/agent_routes.zig b/src/api/routes/cluster_agents/agent_routes.zig index 79b7fe5d..252e402c 100644 --- a/src/api/routes/cluster_agents/agent_routes.zig +++ b/src/api/routes/cluster_agents/agent_routes.zig @@ -2,6 +2,7 @@ const std = @import("std"); const http = @import("../../http.zig"); const agent_registry = @import("../../../cluster/registry.zig"); const cluster_config = @import("../../../cluster/config.zig"); +const request_support = @import("../../../cluster/agent/request_support.zig"); const json_helpers = @import("../../../lib/json_helpers.zig"); const common = @import("../common.zig"); const writers = @import("writers.zig"); @@ -18,10 +19,14 @@ pub fn handleAgentRegister(alloc: std.mem.Allocator, request: http.Request, ctx: const token = extractJsonString(request.body, "token") orelse return common.badRequest("missing token field"); const address = extractJsonString(request.body, "address") orelse return common.badRequest("missing address field"); + const agent_api_port = extractJsonInt(request.body, "agent_api_port"); const cpu_cores = extractJsonInt(request.body, "cpu_cores") orelse return common.badRequest("missing cpu_cores 
field"); const memory_mb = extractJsonInt(request.body, "memory_mb") orelse return common.badRequest("missing memory_mb field"); if (cpu_cores <= 0 or cpu_cores > 10000) return common.badRequest("invalid cpu_cores"); if (memory_mb <= 0 or memory_mb > 10_000_000) return common.badRequest("invalid memory_mb"); + if (agent_api_port) |port| { + if (port <= 0 or port > 65535) return common.badRequest("invalid agent_api_port"); + } if (cpu_cores > std.math.maxInt(u32)) return common.badRequest("cpu_cores too large"); if (memory_mb > std.math.maxInt(u64)) return common.badRequest("memory_mb too large"); @@ -42,7 +47,6 @@ pub fn handleAgentRegister(alloc: std.mem.Allocator, request: http.Request, ctx: var container_subnet_buf: [20]u8 = undefined; var container_subnet: ?[]const u8 = null; var endpoint_buf: [64]u8 = undefined; - var endpoint: ?[]const u8 = null; var peer_sql: ?[]const u8 = null; var peer_sql_buf: [1024]u8 = undefined; @@ -76,15 +80,18 @@ pub fn handleAgentRegister(alloc: std.mem.Allocator, request: http.Request, ctx: if (p <= 0 or p > 65535) return common.badRequest("invalid wg_listen_port"); break :blk @intCast(p); } else 51820; - endpoint = std.fmt.bufPrint(&endpoint_buf, "{s}:{d}", .{ address, port }) catch null; + const endpoint_host = if (request_support.parseHostPort(address)) |hp| + std.fmt.bufPrint(&endpoint_buf, "{d}.{d}.{d}.{d}:{d}", .{ hp.addr[0], hp.addr[1], hp.addr[2], hp.addr[3], port }) catch null + else + std.fmt.bufPrint(&endpoint_buf, "{s}:{d}", .{ address, port }) catch null; - if (endpoint != null and overlay_ip_str != null and container_subnet != null) { + if (endpoint_host != null and overlay_ip_str != null and container_subnet != null) { peer_sql = agent_registry.wireguardPeerSql( &peer_sql_buf, nid, &id_buf, pub_key, - endpoint.?, + endpoint_host.?, overlay_ip_str.?, container_subnet.?, ) catch return common.internalError(); @@ -117,6 +124,7 @@ pub fn handleAgentRegister(alloc: std.mem.Allocator, request: http.Request, ctx: 
std.time.timestamp(), .{ .node_id = assigned_node_id, + .agent_api_port = if (agent_api_port) |port| @intCast(port) else null, .wg_public_key = wg_public_key, .overlay_ip = overlay_ip_str, .role = role_str, diff --git a/src/api/routes/cluster_agents/workload_routes.zig b/src/api/routes/cluster_agents/workload_routes.zig index 7414bc87..a2cce922 100644 --- a/src/api/routes/cluster_agents/workload_routes.zig +++ b/src/api/routes/cluster_agents/workload_routes.zig @@ -223,17 +223,21 @@ fn handleTrainingLogs( request: http.Request, ctx: RouteContext, ) Response { - _ = ctx.cluster orelse return common.badRequest("not running in cluster mode"); + const node = ctx.cluster orelse return common.badRequest("not running in cluster mode"); const rank = if (common.extractQueryValue(request.query, "rank")) |rank_str| std.fmt.parseInt(u32, rank_str, 10) catch 0 else 0; + if (proxyTrainingLogsFromHostingAgent(alloc, node, ctx.join_token, app_name, job_name, rank)) |result| { + return result; + } + var hostname_buf: [128]u8 = undefined; const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ job_name, rank }) catch return common.internalError(); const record = store.findAppContainer(alloc, app_name, hostname) catch return common.internalError(); if (record == null) { - const scheduled = agent_registry.countAssignmentsForWorkload(ctx.cluster.?.stateMachineDb(), app_name, "training", job_name) catch return common.internalError(); + const scheduled = agent_registry.countAssignmentsForWorkload(node.stateMachineDb(), app_name, "training", job_name) catch return common.internalError(); if (scheduled > 0) { return .{ .status = .bad_request, @@ -250,6 +254,64 @@ fn handleTrainingLogs( return .{ .status = .ok, .body = data, .allocated = true, .content_type = "text/plain" }; } +fn proxyTrainingLogsFromHostingAgent( + alloc: std.mem.Allocator, + node: *cluster_node.Node, + join_token: ?[]const u8, + app_name: []const u8, + job_name: []const u8, + rank: u32, +) ?Response { + const 
token = join_token orelse return null; + const host = agent_registry.findWorkloadHostByRank(alloc, node.stateMachineDb(), app_name, "training", job_name, rank) catch + return common.internalError(); + if (host == null) return null; + defer host.?.deinit(alloc); + const port = host.?.agent_api_port orelse { + return .{ + .status = .service_unavailable, + .body = "{\"error\":\"hosting agent does not expose training logs\"}", + .allocated = false, + }; + }; + if (port <= 0 or port > 65535) { + return common.internalError(); + } + + const ip = @import("../../../network/ip.zig").parseIp(host.?.address) orelse { + return .{ + .status = .service_unavailable, + .body = "{\"error\":\"hosting agent address is invalid\"}", + .allocated = false, + }; + }; + + var path_buf: [256]u8 = undefined; + const path = std.fmt.bufPrint(&path_buf, "/training/{s}/{s}/logs?rank={d}", .{ app_name, job_name, rank }) catch + return common.internalError(); + + var resp = @import("../../../cluster/http_client.zig").getWithAuth(alloc, ip, @intCast(port), path, token) catch { + return .{ + .status = .bad_gateway, + .body = "{\"error\":\"failed to fetch training logs from hosting agent\"}", + .allocated = false, + }; + }; + defer resp.deinit(alloc); + + if (resp.status_code == 200) { + const body = alloc.dupe(u8, resp.body) catch return common.internalError(); + return .{ .status = .ok, .body = body, .allocated = true, .content_type = "text/plain" }; + } + if (resp.status_code == 404) return null; + if (resp.status_code == 401) return common.unauthorized(); + return .{ + .status = .bad_gateway, + .body = "{\"error\":\"failed to fetch training logs from hosting agent\"}", + .allocated = false, + }; +} + fn scheduleTrainingJob( alloc: std.mem.Allocator, app_name: []const u8, @@ -451,9 +513,9 @@ const RouteFlowHarness = struct { fn seedActiveAgent(self: *RouteFlowHarness) !void { self.node.stateMachineDb().exec( - "INSERT INTO agents (id, address, status, cpu_cores, memory_mb, cpu_used, 
memory_used_mb, containers, last_heartbeat, registered_at, role, labels, gpu_count, gpu_used, gpu_model, gpu_vram_mb) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", + "INSERT INTO agents (id, address, agent_api_port, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, labels, gpu_count, gpu_used, gpu_model, gpu_vram_mb) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", .{}, - .{ "abc123def456", "10.0.0.2:7701", "active", @as(i64, 8), @as(i64, 16384), @as(i64, 0), @as(i64, 0), @as(i64, 0), @as(i64, 100), @as(i64, 100), "agent", "", @as(i64, 4), @as(i64, 0), "L4", @as(i64, 24576) }, + .{ "abc123def456", "10.0.0.2", @as(i64, 7701), "active", @as(i64, 8), @as(i64, 16384), @as(i64, 0), @as(i64, 0), @as(i64, 0), @as(i64, 100), @as(i64, 100), "agent", "", @as(i64, 4), @as(i64, 0), "L4", @as(i64, 24576) }, ) catch return error.SkipZigTest; } @@ -499,6 +561,14 @@ fn countTrainingAssignments(db: *sqlite.Db, app_name: []const u8, job_name: []co return @intCast(row.count); } +fn updateHarnessAgentEndpoint(harness: *RouteFlowHarness, address: []const u8, port: u16) !void { + harness.node.stateMachineDb().exec( + "UPDATE agents SET address = ?, agent_api_port = ? 
WHERE id = ?;", + .{}, + .{ address, @as(i64, port), "abc123def456" }, + ) catch return error.SkipZigTest; +} + test "route rejects worker run without cluster" { const ctx: RouteContext = .{ .cluster = null, .join_token = null }; const req = makeRequest(.POST, "/apps/demo-app/workers/migrate/run", "", ""); @@ -685,3 +755,59 @@ test "training logs route reports remote-hosted ranks explicitly" { try std.testing.expectEqual(http.StatusCode.bad_request, logs_resp.status); try std.testing.expect(std.mem.indexOf(u8, logs_resp.body, "hosting agent") != null); } + +test "training logs route proxies logs from hosting agent" { + const alloc = std.testing.allocator; + store.initTestDb() catch return error.SkipZigTest; + defer store.deinitTestDb(); + + try store.save(.{ + .id = "abc123def456", + .rootfs = "/tmp/rootfs", + .command = "python train.py", + .hostname = "finetune-rank-0", + .status = "running", + .pid = null, + .exit_code = null, + .app_name = "demo-app", + .created_at = 100, + }); + var file = try @import("../../../runtime/logs.zig").createLogFile("abc123def456"); + try file.writeAll("proxied rank logs\n"); + file.close(); + + var agent_log_server = try @import("../../../cluster/agent/log_server.zig").LogServer.init(alloc, 0, "join-token"); + const log_thread = try std.Thread.spawn(.{}, @import("../../../cluster/agent/log_server.zig").LogServer.run, .{&agent_log_server}); + defer { + agent_log_server.deinit(); + log_thread.join(); + } + + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + try updateHarnessAgentEndpoint(&harness, "127.0.0.1", agent_log_server.port); + + try harness.seedLatestRelease( + "demo-app", + "{\"app_name\":\"demo-app\",\"services\":[],\"workers\":[],\"crons\":[],\"training_jobs\":[{\"name\":\"finetune\",\"image\":\"pytorch:latest\",\"command\":[\"python\",\"train.py\"],\"gpus\":1,\"cpu_limit\":2000,\"memory_limit_mb\":4096}]}", + ); + + const start_resp = route( + makeRequest(.POST, 
"/apps/demo-app/training/finetune/start", "", ""), + alloc, + .{ .cluster = &harness.node, .join_token = "join-token" }, + ).?; + defer freeResponse(alloc, start_resp); + try std.testing.expectEqual(http.StatusCode.ok, start_resp.status); + harness.applyCommitted(); + + const logs_resp = route( + makeRequest(.GET, "/apps/demo-app/training/finetune/logs", "", "rank=0"), + alloc, + .{ .cluster = &harness.node, .join_token = "join-token" }, + ).?; + defer freeResponse(alloc, logs_resp); + + try std.testing.expectEqual(http.StatusCode.ok, logs_resp.status); + try std.testing.expectEqualStrings("proxied rank logs\n", logs_resp.body); +} diff --git a/src/api/routes/cluster_agents/writers.zig b/src/api/routes/cluster_agents/writers.zig index 4fe83fa5..078b0d72 100644 --- a/src/api/routes/cluster_agents/writers.zig +++ b/src/api/routes/cluster_agents/writers.zig @@ -9,7 +9,12 @@ pub fn writeAgentJson(writer: anytype, agent: agent_registry.AgentRecord) !void try json_helpers.writeJsonEscaped(writer, agent.address); try writer.writeAll("\",\"status\":\""); try writer.writeAll(agent.status); - try writer.writeAll("\",\"cpu_cores\":"); + try writer.writeByte('"'); + if (agent.agent_api_port) |port| { + try writer.writeAll(",\"agent_api_port\":"); + try std.fmt.format(writer, "{d}", .{port}); + } + try writer.writeAll(",\"cpu_cores\":"); try std.fmt.format(writer, "{d}", .{agent.cpu_cores}); try writer.writeAll(",\"memory_mb\":"); try std.fmt.format(writer, "{d}", .{agent.memory_mb}); diff --git a/src/cluster/agent.zig b/src/cluster/agent.zig index 1c27a299..8fff1102 100644 --- a/src/cluster/agent.zig +++ b/src/cluster/agent.zig @@ -33,6 +33,7 @@ const resource_support = @import("agent/resource_support.zig"); const gossip_support = @import("agent/gossip_support.zig"); const assignment_runtime = @import("agent/assignment_runtime.zig"); const loop_runtime = @import("agent/loop_runtime.zig"); +const log_server_mod = @import("agent/log_server.zig"); const Allocator = 
std.mem.Allocator; const AgentResources = agent_types.AgentResources; @@ -64,8 +65,11 @@ pub const Agent = struct { server_port: u16, token: []const u8, owned_token: ?[]u8 = null, + agent_api_port: u16 = 7701, running: std.atomic.Value(bool), loop_thread: ?std.Thread, + log_server: ?log_server_mod.LogServer = null, + log_server_thread: ?std.Thread = null, /// tracks assignment_id → local container state. /// protected by mutex since container threads update it. @@ -129,7 +133,7 @@ pub const Agent = struct { var local_ip_buf: [16]u8 = undefined; const local_ip = resource_support.detectLocalIp(self.server_addr, &local_ip_buf); - const body = request_support.buildRegisterBody(self.alloc, self.token, local_ip, resources, pub_key, self.wg_listen_port, self.role, self.region) catch + const body = request_support.buildRegisterBody(self.alloc, self.token, local_ip, self.agent_api_port, resources, pub_key, self.wg_listen_port, self.role, self.region) catch return AgentError.RegisterFailed; defer self.alloc.free(body); diff --git a/src/cluster/agent/lifecycle_support.zig b/src/cluster/agent/lifecycle_support.zig index 884fc788..1d7ecb40 100644 --- a/src/cluster/agent/lifecycle_support.zig +++ b/src/cluster/agent/lifecycle_support.zig @@ -3,6 +3,7 @@ const setup = @import("../../network/setup.zig"); const agent_store = @import("../agent_store.zig"); const cluster_config = @import("../config.zig"); const agent_mod = @import("../agent.zig"); +const log_server = @import("log_server.zig"); const loop_runtime = @import("loop_runtime.zig"); pub fn init(alloc: std.mem.Allocator, server_addr: [4]u8, server_port: u16, token: []const u8, owned_token: ?[]u8) agent_mod.Agent { @@ -13,8 +14,11 @@ pub fn init(alloc: std.mem.Allocator, server_addr: [4]u8, server_port: u16, toke .server_port = server_port, .token = if (owned_token) |owned| owned else token, .owned_token = owned_token, + .agent_api_port = 7701, .running = std.atomic.Value(bool).init(false), .loop_thread = null, + .log_server 
= null, + .log_server_thread = null, .local_containers = std.StringHashMap(agent_mod.ContainerState).init(alloc), .container_lock = .{}, .node_id = null, @@ -38,18 +42,47 @@ pub fn initOwned(alloc: std.mem.Allocator, server_addr: [4]u8, server_port: u16, pub fn start(self: anytype) !void { self.running.store(true, .release); + self.log_server = try log_server.LogServer.init(self.alloc, self.agent_api_port, self.token); + errdefer { + if (self.log_server) |*server| server.deinit(); + self.log_server = null; + } + self.log_server_thread = std.Thread.spawn(.{}, runLogServer, .{self}) catch { + if (self.log_server) |*server| server.deinit(); + self.log_server = null; + self.running.store(false, .release); + return error.ThreadSpawnFailed; + }; self.loop_thread = std.Thread.spawn(.{}, loop_runtime.agentLoop, .{self}) catch { + if (self.log_server) |*server| server.deinit(); + if (self.log_server_thread) |t| { + t.join(); + self.log_server_thread = null; + } + self.log_server = null; self.running.store(false, .release); return error.ThreadSpawnFailed; }; } +fn runLogServer(self: anytype) void { + if (self.log_server) |*server| server.run(); +} + pub fn stop(self: anytype) void { self.running.store(false, .release); if (self.loop_thread) |t| { t.join(); self.loop_thread = null; } + if (self.log_server) |*server| { + server.deinit(); + } + if (self.log_server_thread) |t| { + t.join(); + self.log_server_thread = null; + } + self.log_server = null; if (self.node_id != null) { setup.teardownClusterNetworking(); @@ -65,6 +98,14 @@ pub fn wait(self: anytype) void { t.join(); self.loop_thread = null; } + if (self.log_server) |*server| { + server.deinit(); + } + if (self.log_server_thread) |t| { + t.join(); + self.log_server_thread = null; + } + self.log_server = null; } pub fn deinit(self: anytype) void { diff --git a/src/cluster/agent/log_server.zig b/src/cluster/agent/log_server.zig new file mode 100644 index 00000000..2742d8fd --- /dev/null +++ 
b/src/cluster/agent/log_server.zig @@ -0,0 +1,197 @@ +const std = @import("std"); +const posix = std.posix; +const http = @import("../../api/http.zig"); +const connection_runtime = @import("../../api/server/connection_runtime.zig"); +const common = @import("../../api/routes/common.zig"); +const store = @import("../../state/store.zig"); +const logs = @import("../../runtime/logs.zig"); + +pub const LogServer = struct { + alloc: std.mem.Allocator, + listen_fd: posix.fd_t, + token: []const u8, + port: u16, + running: std.atomic.Value(bool), + + pub fn init(alloc: std.mem.Allocator, port: u16, token: []const u8) !LogServer { + const fd = try posix.socket(posix.AF.INET, posix.SOCK.STREAM | posix.SOCK.CLOEXEC | posix.SOCK.NONBLOCK, 0); + errdefer posix.close(fd); + + const one: c_int = 1; + _ = posix.setsockopt(fd, posix.SOL.SOCKET, posix.SO.REUSEADDR, std.mem.asBytes(&one)) catch {}; + + const addr = std.net.Address.initIp4(.{ 0, 0, 0, 0 }, port); + try posix.bind(fd, &addr.any, addr.getOsSockLen()); + try posix.listen(fd, 32); + + var actual_addr: posix.sockaddr.in = undefined; + var actual_len: posix.socklen_t = @sizeOf(posix.sockaddr.in); + try posix.getsockname(fd, @ptrCast(&actual_addr), &actual_len); + + return .{ + .alloc = alloc, + .listen_fd = fd, + .token = token, + .port = std.mem.bigToNative(u16, actual_addr.port), + .running = std.atomic.Value(bool).init(true), + }; + } + + pub fn deinit(self: *LogServer) void { + self.running.store(false, .release); + posix.close(self.listen_fd); + } + + pub fn run(self: *LogServer) void { + while (self.running.load(.acquire)) { + const client_fd = posix.accept(self.listen_fd, null, null, posix.SOCK.CLOEXEC) catch |err| switch (err) { + error.WouldBlock => { + std.Thread.sleep(50 * std.time.ns_per_ms); + continue; + }, + else => return, + }; + handleConnection(self, client_fd); + } + } +}; + +fn handleConnection(self: *LogServer, client_fd: posix.fd_t) void { + defer posix.close(client_fd); + + const owned_request = 
connection_runtime.readRequestAlloc(self.alloc, client_fd) catch { + sendError(client_fd, .bad_request, "malformed request"); + return; + }; + defer owned_request.deinit(self.alloc); + + const request = owned_request.request; + if (!common.hasValidBearerToken(&request, self.token)) { + sendError(client_fd, .unauthorized, "unauthorized"); + return; + } + if (request.method != .GET) { + sendError(client_fd, .method_not_allowed, "method not allowed"); + return; + } + + if (matchTrainingLogs(request.path_only)) |path| { + if (!common.validateClusterInput(path.app_name) or !common.validateClusterInput(path.job_name)) { + sendError(client_fd, .bad_request, "invalid app or training job name"); + return; + } + const rank = if (common.extractQueryValue(request.query, "rank")) |rank_str| + std.fmt.parseInt(u32, rank_str, 10) catch 0 + else + 0; + serveTrainingLogs(self.alloc, client_fd, path.app_name, path.job_name, rank); + return; + } + + sendError(client_fd, .not_found, "not found"); +} + +const TrainingLogsPath = struct { + app_name: []const u8, + job_name: []const u8, +}; + +fn matchTrainingLogs(path: []const u8) ?TrainingLogsPath { + if (!std.mem.startsWith(u8, path, "/training/")) return null; + const tail = path["/training/".len..]; + const slash = std.mem.indexOfScalar(u8, tail, '/') orelse return null; + const app_name = tail[0..slash]; + const after_app = tail[slash + 1 ..]; + const slash2 = std.mem.indexOfScalar(u8, after_app, '/') orelse return null; + const job_name = after_app[0..slash2]; + if (!std.mem.eql(u8, after_app[slash2..], "/logs")) return null; + if (app_name.len == 0 or job_name.len == 0) return null; + return .{ .app_name = app_name, .job_name = job_name }; +} + +fn serveTrainingLogs(alloc: std.mem.Allocator, client_fd: posix.fd_t, app_name: []const u8, job_name: []const u8, rank: u32) void { + var hostname_buf: [128]u8 = undefined; + const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ job_name, rank }) catch { + 
sendError(client_fd, .internal_server_error, "response formatting failed"); + return; + }; + const record = store.findAppContainer(alloc, app_name, hostname) catch { + sendError(client_fd, .internal_server_error, "container lookup failed"); + return; + }; + if (record == null) { + sendError(client_fd, .not_found, "not found"); + return; + } + defer record.?.deinit(alloc); + + const data = logs.readLogs(alloc, record.?.id) catch { + sendError(client_fd, .not_found, "not found"); + return; + }; + defer alloc.free(data); + + writeResponse(client_fd, .ok, "text/plain", data); +} + +fn sendError(fd: posix.fd_t, status: http.StatusCode, message: []const u8) void { + var resp_buf: [1024]u8 = undefined; + const resp = http.formatError(&resp_buf, status, message); + writeAll(fd, resp); +} + +fn writeResponse(fd: posix.fd_t, status: http.StatusCode, content_type: []const u8, body: []const u8) void { + var header_buf: [512]u8 = undefined; + const headers = http.formatResponseHeaders(&header_buf, status, content_type, body.len); + writeAll(fd, headers); + if (body.len > 0) writeAll(fd, body); +} + +fn writeAll(fd: posix.fd_t, data: []const u8) void { + var written: usize = 0; + while (written < data.len) { + const bytes_written = posix.write(fd, data[written..]) catch return; + if (bytes_written == 0) return; + written += bytes_written; + } +} + +test "log server serves remote training logs with auth" { + store.initTestDb() catch return error.SkipZigTest; + defer store.deinitTestDb(); + + try store.save(.{ + .id = "abc123def456", + .rootfs = "/tmp/rootfs", + .command = "python train.py", + .hostname = "finetune-rank-0", + .status = "running", + .pid = null, + .exit_code = null, + .app_name = "demo-app", + .created_at = 100, + }); + + var file = try logs.createLogFile("abc123def456"); + defer file.close(); + try file.writeAll("rank zero logs\n"); + + var server = try LogServer.init(std.testing.allocator, 0, "join-token"); + const thread = try std.Thread.spawn(.{}, 
LogServer.run, .{&server}); + defer { + server.deinit(); + thread.join(); + } + + var resp = try @import("../http_client.zig").getWithAuth( + std.testing.allocator, + .{ 127, 0, 0, 1 }, + server.port, + "/training/demo-app/finetune/logs?rank=0", + "join-token", + ); + defer resp.deinit(std.testing.allocator); + + try std.testing.expectEqual(@as(u16, 200), resp.status_code); + try std.testing.expectEqualStrings("rank zero logs\n", resp.body); +} diff --git a/src/cluster/agent/request_support.zig b/src/cluster/agent/request_support.zig index d2a93d0c..d89a5b5d 100644 --- a/src/cluster/agent/request_support.zig +++ b/src/cluster/agent/request_support.zig @@ -11,6 +11,7 @@ pub fn buildRegisterBody( alloc: Allocator, token: []const u8, address: []const u8, + agent_api_port: u16, resources: AgentResources, pub_key: []const u8, wg_listen_port: u16, @@ -25,7 +26,9 @@ pub fn buildRegisterBody( try json_helpers.writeJsonEscaped(writer, token); try writer.writeAll("\",\"address\":\""); try json_helpers.writeJsonEscaped(writer, address); - try writer.writeAll("\",\"cpu_cores\":"); + try writer.writeAll("\",\"agent_api_port\":"); + try writer.print("{d}", .{agent_api_port}); + try writer.writeAll(",\"cpu_cores\":"); try writer.print("{d}", .{resources.cpu_cores}); try writer.writeAll(",\"memory_mb\":"); try writer.print("{d}", .{resources.memory_mb}); diff --git a/src/cluster/agent_types.zig b/src/cluster/agent_types.zig index 423a05bc..6c6943eb 100644 --- a/src/cluster/agent_types.zig +++ b/src/cluster/agent_types.zig @@ -65,6 +65,7 @@ pub const AgentResources = struct { pub const AgentRecord = struct { id: []const u8, address: []const u8, + agent_api_port: ?i64 = null, status: []const u8, cpu_cores: i64, memory_mb: i64, diff --git a/src/cluster/cli/membership_command.zig b/src/cluster/cli/membership_command.zig index 0f24dd3a..69c63486 100644 --- a/src/cluster/cli/membership_command.zig +++ b/src/cluster/cli/membership_command.zig @@ -21,6 +21,7 @@ pub fn join(args: 
*std.process.ArgIterator, alloc: std.mem.Allocator) !void { var server_host: ?[]const u8 = null; var token: ?[]const u8 = null; var api_port: u16 = 7700; + var agent_api_port: u16 = 7701; var role: cluster_config.NodeRole = .both; var region: ?[]const u8 = null; @@ -39,6 +40,15 @@ pub fn join(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { writeErr("invalid port: {s}\n", .{port_str}); return MembershipError.InvalidArgument; }; + } else if (std.mem.eql(u8, arg, "--agent-port")) { + const port_str = args.next() orelse { + writeErr("--agent-port requires a port number\n", .{}); + return MembershipError.InvalidArgument; + }; + agent_api_port = std.fmt.parseInt(u16, port_str, 10) catch { + writeErr("invalid agent port: {s}\n", .{port_str}); + return MembershipError.InvalidArgument; + }; } else if (std.mem.eql(u8, arg, "--role")) { const role_str = args.next() orelse { writeErr("--role requires a value (server, agent, or both)\n", .{}); @@ -78,6 +88,7 @@ pub fn join(args: *std.process.ArgIterator, alloc: std.mem.Allocator) !void { var agent = try cluster_agent.Agent.initOwned(alloc, server_addr, api_port, join_token); defer agent.deinit(); + agent.agent_api_port = agent_api_port; agent.role = role; agent.region = region; diff --git a/src/cluster/registry.zig b/src/cluster/registry.zig index cfa77353..e2e95fdd 100644 --- a/src/cluster/registry.zig +++ b/src/cluster/registry.zig @@ -48,6 +48,8 @@ pub const getAgent = queries.getAgent; pub const getAssignments = queries.getAssignments; pub const getOrphanedAssignments = queries.getOrphanedAssignments; pub const countAssignmentsForWorkload = queries.countAssignmentsForWorkload; +pub const WorkloadHost = queries.WorkloadHost; +pub const findWorkloadHostByRank = queries.findWorkloadHostByRank; // -- tests -- diff --git a/src/cluster/registry/queries.zig b/src/cluster/registry/queries.zig index 43083317..c5a223de 100644 --- a/src/cluster/registry/queries.zig +++ b/src/cluster/registry/queries.zig @@ -79,6 
+79,7 @@ fn queryWireguardPeers(alloc: Allocator, db: *sqlite.Db, sql: []const u8) ![]Wir const AgentRow = struct { id: sqlite.Text, address: sqlite.Text, + agent_api_port: ?i64, status: sqlite.Text, cpu_cores: i64, memory_mb: i64, @@ -100,12 +101,13 @@ const AgentRow = struct { rdma_capable: ?i64, }; -const agent_select_cols = "id, address, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, node_id, wg_public_key, overlay_ip, role, region, labels, gpu_count, gpu_used, gpu_model, gpu_vram_mb, rdma_capable"; +const agent_select_cols = "id, address, agent_api_port, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, node_id, wg_public_key, overlay_ip, role, region, labels, gpu_count, gpu_used, gpu_model, gpu_vram_mb, rdma_capable"; fn agentRowToRecord(row: AgentRow) AgentRecord { return .{ .id = row.id.data, .address = row.address.data, + .agent_api_port = row.agent_api_port, .status = row.status.data, .cpu_cores = row.cpu_cores, .memory_mb = row.memory_mb, @@ -186,6 +188,53 @@ pub fn countAssignmentsForWorkload(db: *sqlite.Db, app_name: []const u8, workloa return @intCast(row.count); } +pub const WorkloadHost = struct { + agent_id: []const u8, + address: []const u8, + agent_api_port: ?i64, + + pub fn deinit(self: WorkloadHost, alloc: Allocator) void { + alloc.free(self.agent_id); + alloc.free(self.address); + } +}; + +pub fn findWorkloadHostByRank( + alloc: Allocator, + db: *sqlite.Db, + app_name: []const u8, + workload_kind: []const u8, + workload_name: []const u8, + rank: u32, +) !?WorkloadHost { + const Row = struct { + agent_id: sqlite.Text, + address: sqlite.Text, + agent_api_port: ?i64, + }; + const row = (db.oneAlloc( + Row, + alloc, + \\SELECT agents.id AS agent_id, agents.address, agents.agent_api_port + \\FROM assignments + \\JOIN agents ON assignments.agent_id = agents.id + \\WHERE assignments.app_name = ? + \\ AND assignments.workload_kind = ? 
+ \\ AND assignments.workload_name = ? + \\ AND COALESCE(assignments.gang_rank, 0) = ? + \\ORDER BY assignments.created_at DESC, assignments.id DESC + \\LIMIT 1; + , + .{}, + .{ app_name, workload_kind, workload_name, @as(i64, rank) }, + ) catch return error.QueryFailed) orelse return null; + return .{ + .agent_id = row.agent_id.data, + .address = row.address.data, + .agent_api_port = row.agent_api_port, + }; +} + const AssignmentRow = struct { id: sqlite.Text, agent_id: sqlite.Text, diff --git a/src/cluster/registry/sql_mutations.zig b/src/cluster/registry/sql_mutations.zig index 2f0e66ca..ec433c20 100644 --- a/src/cluster/registry/sql_mutations.zig +++ b/src/cluster/registry/sql_mutations.zig @@ -5,6 +5,7 @@ const sql_escape = @import("../../lib/sql.zig"); pub const AgentResources = agent_types.AgentResources; pub const RegisterOpts = struct { + agent_api_port: ?u16 = null, node_id: ?u16 = null, wg_public_key: ?[]const u8 = null, overlay_ip: ?[]const u8 = null, @@ -32,6 +33,7 @@ pub fn registerSqlFull( opts: RegisterOpts, ) ![]const u8 { const node_id = opts.node_id; + const agent_api_port = opts.agent_api_port; const wg_public_key = opts.wg_public_key; const overlay_ip = opts.overlay_ip; const role = opts.role; @@ -78,16 +80,16 @@ pub fn registerSqlFull( const reg_esc = try sql_escape.escapeSqlString(®ion_esc_buf, region_val); return std.fmt.bufPrint( buf, - "INSERT INTO agents (id, address, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, node_id, wg_public_key, overlay_ip, role, region, labels{s})" ++ - " VALUES ('{s}', '{s}', 'active', {d}, {d}, 0, 0, 0, {d}, {d}, {d}, '{s}', '{s}', '{s}', '{s}', '{s}'{s});", - .{ gpu_cols, id_esc, addr_esc, resources.cpu_cores, resources.memory_mb, now, now, nid, key_esc, ip_esc, role_esc, reg_esc, labels_esc, gpu_vals }, + "INSERT INTO agents (id, address, agent_api_port, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, 
node_id, wg_public_key, overlay_ip, role, region, labels{s})" ++ + " VALUES ('{s}', '{s}', {any}, 'active', {d}, {d}, 0, 0, 0, {d}, {d}, {d}, '{s}', '{s}', '{s}', '{s}', '{s}'{s});", + .{ gpu_cols, id_esc, addr_esc, agent_api_port, resources.cpu_cores, resources.memory_mb, now, now, nid, key_esc, ip_esc, role_esc, reg_esc, labels_esc, gpu_vals }, ); } return std.fmt.bufPrint( buf, - "INSERT INTO agents (id, address, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, node_id, wg_public_key, overlay_ip, role, labels{s})" ++ - " VALUES ('{s}', '{s}', 'active', {d}, {d}, 0, 0, 0, {d}, {d}, {d}, '{s}', '{s}', '{s}', '{s}'{s});", - .{ gpu_cols, id_esc, addr_esc, resources.cpu_cores, resources.memory_mb, now, now, nid, key_esc, ip_esc, role_esc, labels_esc, gpu_vals }, + "INSERT INTO agents (id, address, agent_api_port, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, node_id, wg_public_key, overlay_ip, role, labels{s})" ++ + " VALUES ('{s}', '{s}', {any}, 'active', {d}, {d}, 0, 0, 0, {d}, {d}, {d}, '{s}', '{s}', '{s}', '{s}'{s});", + .{ gpu_cols, id_esc, addr_esc, agent_api_port, resources.cpu_cores, resources.memory_mb, now, now, nid, key_esc, ip_esc, role_esc, labels_esc, gpu_vals }, ); } @@ -95,17 +97,17 @@ pub fn registerSqlFull( const reg_esc = try sql_escape.escapeSqlString(®ion_esc_buf, region_val); return std.fmt.bufPrint( buf, - "INSERT INTO agents (id, address, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, region, labels{s})" ++ - " VALUES ('{s}', '{s}', 'active', {d}, {d}, 0, 0, 0, {d}, {d}, '{s}', '{s}', '{s}'{s});", - .{ gpu_cols, id_esc, addr_esc, resources.cpu_cores, resources.memory_mb, now, now, role_esc, reg_esc, labels_esc, gpu_vals }, + "INSERT INTO agents (id, address, agent_api_port, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, region, 
labels{s})" ++ + " VALUES ('{s}', '{s}', {any}, 'active', {d}, {d}, 0, 0, 0, {d}, {d}, '{s}', '{s}', '{s}'{s});", + .{ gpu_cols, id_esc, addr_esc, agent_api_port, resources.cpu_cores, resources.memory_mb, now, now, role_esc, reg_esc, labels_esc, gpu_vals }, ); } return std.fmt.bufPrint( buf, - "INSERT INTO agents (id, address, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, labels{s})" ++ - " VALUES ('{s}', '{s}', 'active', {d}, {d}, 0, 0, 0, {d}, {d}, '{s}', '{s}'{s});", - .{ gpu_cols, id_esc, addr_esc, resources.cpu_cores, resources.memory_mb, now, now, role_esc, labels_esc, gpu_vals }, + "INSERT INTO agents (id, address, agent_api_port, status, cpu_cores, memory_mb, cpu_used, memory_used_mb, containers, last_heartbeat, registered_at, role, labels{s})" ++ + " VALUES ('{s}', '{s}', {any}, 'active', {d}, {d}, 0, 0, 0, {d}, {d}, '{s}', '{s}'{s});", + .{ gpu_cols, id_esc, addr_esc, agent_api_port, resources.cpu_cores, resources.memory_mb, now, now, role_esc, labels_esc, gpu_vals }, ); } diff --git a/src/cluster/registry/test_support.zig b/src/cluster/registry/test_support.zig index 84d5a470..dde1c0f8 100644 --- a/src/cluster/registry/test_support.zig +++ b/src/cluster/registry/test_support.zig @@ -2,6 +2,7 @@ pub const agents_schema = \\CREATE TABLE agents ( \\ id TEXT PRIMARY KEY, \\ address TEXT NOT NULL, + \\ agent_api_port INTEGER, \\ status TEXT NOT NULL DEFAULT 'active', \\ cpu_cores INTEGER NOT NULL DEFAULT 0, \\ memory_mb INTEGER NOT NULL DEFAULT 0, diff --git a/src/state/schema/migrations.zig b/src/state/schema/migrations.zig index 76a10edd..836509b4 100644 --- a/src/state/schema/migrations.zig +++ b/src/state/schema/migrations.zig @@ -17,6 +17,7 @@ fn migrateContainers(db: *sqlite.Db) void { } fn migrateAgents(db: *sqlite.Db) void { + addColumnIfMissing(db, "ALTER TABLE agents ADD COLUMN agent_api_port INTEGER;") catch {}; addColumnIfMissing(db, "ALTER TABLE agents ADD COLUMN node_id INTEGER;") catch 
{}; addColumnIfMissing(db, "ALTER TABLE agents ADD COLUMN wg_public_key TEXT;") catch {}; addColumnIfMissing(db, "ALTER TABLE agents ADD COLUMN overlay_ip TEXT;") catch {}; diff --git a/src/state/schema/tables.zig b/src/state/schema/tables.zig index 50938c74..67ba47da 100644 --- a/src/state/schema/tables.zig +++ b/src/state/schema/tables.zig @@ -180,6 +180,7 @@ pub fn initClusterTables(db: *sqlite.Db) SchemaError!void { \\CREATE TABLE IF NOT EXISTS agents ( \\ id TEXT PRIMARY KEY, \\ address TEXT NOT NULL, + \\ agent_api_port INTEGER, \\ status TEXT NOT NULL DEFAULT 'active', \\ cpu_cores INTEGER NOT NULL DEFAULT 0, \\ memory_mb INTEGER NOT NULL DEFAULT 0, diff --git a/src/test_root.zig b/src/test_root.zig index 3388a0d5..6071f587 100644 --- a/src/test_root.zig +++ b/src/test_root.zig @@ -123,6 +123,7 @@ comptime { _ = @import("cluster/registry.zig"); _ = @import("cluster/http_client.zig"); _ = @import("cluster/agent.zig"); + _ = @import("cluster/agent/log_server.zig"); _ = @import("cluster/scheduler.zig"); _ = @import("cluster/commands.zig"); _ = @import("tls/commands.zig"); From 4ad5b0f700582c6421aa234ced64bba0f91e59d4 Mon Sep 17 00:00:00 2001 From: Kacy Fortner Date: Fri, 10 Apr 2026 15:02:44 +0000 Subject: [PATCH 14/14] Harden clustered training log reads --- .../routes/cluster_agents/workload_routes.zig | 115 +++++++++++++++--- src/cluster/agent/log_server.zig | 13 +- 2 files changed, 104 insertions(+), 24 deletions(-) diff --git a/src/api/routes/cluster_agents/workload_routes.zig b/src/api/routes/cluster_agents/workload_routes.zig index a2cce922..72fba9a7 100644 --- a/src/api/routes/cluster_agents/workload_routes.zig +++ b/src/api/routes/cluster_agents/workload_routes.zig @@ -224,33 +224,42 @@ fn handleTrainingLogs( ctx: RouteContext, ) Response { const node = ctx.cluster orelse return common.badRequest("not running in cluster mode"); - const rank = if (common.extractQueryValue(request.query, "rank")) |rank_str| - std.fmt.parseInt(u32, rank_str, 10) catch 0 - 
else - 0; + const rank = parseTrainingLogRank(request.query) catch return common.badRequest("invalid rank"); + var hostname_buf: [128]u8 = undefined; + const hostname = trainingRankHostname(&hostname_buf, job_name, rank) catch return common.internalError(); + const record = store.findAppContainer(alloc, app_name, hostname) catch return common.internalError(); + if (record) |local_record| { + defer local_record.deinit(alloc); + return readTrainingLogsResponse(alloc, local_record.id); + } if (proxyTrainingLogsFromHostingAgent(alloc, node, ctx.join_token, app_name, job_name, rank)) |result| { return result; } - var hostname_buf: [128]u8 = undefined; - const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ job_name, rank }) catch return common.internalError(); - const record = store.findAppContainer(alloc, app_name, hostname) catch return common.internalError(); - if (record == null) { - const scheduled = agent_registry.countAssignmentsForWorkload(node.stateMachineDb(), app_name, "training", job_name) catch return common.internalError(); - if (scheduled > 0) { - return .{ - .status = .bad_request, - .body = "{\"error\":\"training logs are only available on the hosting agent\"}", - .allocated = false, - }; - } - return common.notFound(); + const scheduled = agent_registry.countAssignmentsForWorkload(node.stateMachineDb(), app_name, "training", job_name) catch return common.internalError(); + if (scheduled > 0) { + return .{ + .status = .bad_request, + .body = "{\"error\":\"training logs are only available on the hosting agent\"}", + .allocated = false, + }; } - defer record.?.deinit(alloc); + return common.notFound(); +} + +fn parseTrainingLogRank(query: []const u8) !u32 { + const rank_str = common.extractQueryValue(query, "rank") orelse return 0; + return std.fmt.parseInt(u32, rank_str, 10) catch error.InvalidRank; +} - const logs = @import("../../../runtime/logs.zig"); - const data = logs.readLogs(alloc, record.?.id) catch return common.notFound(); +fn 
trainingRankHostname(buf: []u8, job_name: []const u8, rank: u32) ![]const u8 { + return std.fmt.bufPrint(buf, "{s}-rank-{d}", .{ job_name, rank }); +} + +fn readTrainingLogsResponse(alloc: std.mem.Allocator, container_id: []const u8) Response { + const runtime_logs = @import("../../../runtime/logs.zig"); + const data = runtime_logs.readLogs(alloc, container_id) catch return common.notFound(); return .{ .status = .ok, .body = data, .allocated = true, .content_type = "text/plain" }; } @@ -569,6 +578,22 @@ fn updateHarnessAgentEndpoint(harness: *RouteFlowHarness, address: []const u8, p ) catch return error.SkipZigTest; } +fn clearHarnessAgentEndpoint(harness: *RouteFlowHarness) !void { + harness.node.stateMachineDb().exec( + "UPDATE agents SET agent_api_port = NULL WHERE id = ?;", + .{}, + .{"abc123def456"}, + ) catch return error.SkipZigTest; +} + +fn seedTrainingAssignment(harness: *RouteFlowHarness, app_name: []const u8, job_name: []const u8, rank: u32) !void { + harness.node.stateMachineDb().exec( + "INSERT INTO assignments (id, agent_id, image, command, status, app_name, workload_kind, workload_name, gang_rank, created_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", + .{}, + .{ "assign12345678", "abc123def456", "pytorch:latest", "python train.py", "running", app_name, "training", job_name, @as(i64, rank), @as(i64, 100) }, + ) catch return error.SkipZigTest; +} + test "route rejects worker run without cluster" { const ctx: RouteContext = .{ .cluster = null, .join_token = null }; const req = makeRequest(.POST, "/apps/demo-app/workers/migrate/run", "", ""); @@ -756,6 +781,56 @@ test "training logs route reports remote-hosted ranks explicitly" { try std.testing.expect(std.mem.indexOf(u8, logs_resp.body, "hosting agent") != null); } +test "training logs route rejects invalid rank query" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + const logs_resp = route( + makeRequest(.GET, 
"/apps/demo-app/training/finetune/logs", "", "rank=abc"), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, logs_resp); + + try std.testing.expectEqual(http.StatusCode.bad_request, logs_resp.status); + try std.testing.expect(std.mem.indexOf(u8, logs_resp.body, "invalid rank") != null); +} + +test "training logs route prefers local logs when available" { + const alloc = std.testing.allocator; + var harness = try RouteFlowHarness.init(alloc); + defer harness.deinit(); + + try store.save(.{ + .id = "abc123def456", + .rootfs = "/tmp/rootfs", + .command = "python train.py", + .hostname = "finetune-rank-0", + .status = "running", + .pid = null, + .exit_code = null, + .app_name = "demo-app", + .created_at = 100, + }); + var file = try @import("../../../runtime/logs.zig").createLogFile("abc123def456"); + try file.writeAll("local rank logs\n"); + file.close(); + + try seedTrainingAssignment(&harness, "demo-app", "finetune", 0); + try clearHarnessAgentEndpoint(&harness); + + const logs_resp = route( + makeRequest(.GET, "/apps/demo-app/training/finetune/logs", "", "rank=0"), + alloc, + harness.ctx(), + ).?; + defer freeResponse(alloc, logs_resp); + + try std.testing.expectEqual(http.StatusCode.ok, logs_resp.status); + try std.testing.expectEqualStrings("local rank logs\n", logs_resp.body); +} + test "training logs route proxies logs from hosting agent" { const alloc = std.testing.allocator; store.initTestDb() catch return error.SkipZigTest; diff --git a/src/cluster/agent/log_server.zig b/src/cluster/agent/log_server.zig index 2742d8fd..84c5d376 100644 --- a/src/cluster/agent/log_server.zig +++ b/src/cluster/agent/log_server.zig @@ -80,10 +80,10 @@ fn handleConnection(self: *LogServer, client_fd: posix.fd_t) void { sendError(client_fd, .bad_request, "invalid app or training job name"); return; } - const rank = if (common.extractQueryValue(request.query, "rank")) |rank_str| - std.fmt.parseInt(u32, rank_str, 10) catch 0 - else - 0; + const rank = 
parseRankQuery(request.query) catch { + sendError(client_fd, .bad_request, "invalid rank"); + return; + }; serveTrainingLogs(self.alloc, client_fd, path.app_name, path.job_name, rank); return; } @@ -109,6 +109,11 @@ fn matchTrainingLogs(path: []const u8) ?TrainingLogsPath { return .{ .app_name = app_name, .job_name = job_name }; } +fn parseRankQuery(query: []const u8) !u32 { + const rank_str = common.extractQueryValue(query, "rank") orelse return 0; + return std.fmt.parseInt(u32, rank_str, 10) catch error.InvalidRank; +} + fn serveTrainingLogs(alloc: std.mem.Allocator, client_fd: posix.fd_t, app_name: []const u8, job_name: []const u8, rank: u32) void { var hostname_buf: [128]u8 = undefined; const hostname = std.fmt.bufPrint(&hostname_buf, "{s}-rank-{d}", .{ job_name, rank }) catch {